In [3]:
#Tokenization
import nltk #text processing library for human language data.
example = '''He is a great teacher that cares about his students. 
He is aware that we have a lot of work to do so he has always been flexible and considered with us.
I would like to meet more teachers like him.'''

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize
sents = sent_tokenize(example) #tokenizing text into sentences
sents

['He is a great teacher that cares about his students.',
 'He is aware that we have a lot of work to do so he has always been flexible and considered with us.',
 'I would like to meet more teachers like him.']

In [5]:
len(sents)

3

In [6]:
word_tokens = [word_tokenize(sentence) for sentence in sents]
# Print the word tokens for each sentence
for sentence_tokens in word_tokens:
    print(sentence_tokens)
print("---------------------------------------")
# Function to perform lowercasing and punctuation removal
def lowercasing_punc_removal(tokens):
    processed_tokens = [token.lower().strip('.!?,') for token in tokens]
    return processed_tokens
    
for sentence_tokens in word_tokens:
    print(lowercasing_punc_removal(sentence_tokens))

['He', 'is', 'a', 'great', 'teacher', 'that', 'cares', 'about', 'his', 'students', '.']
['He', 'is', 'aware', 'that', 'we', 'have', 'a', 'lot', 'of', 'work', 'to', 'do', 'so', 'he', 'has', 'always', 'been', 'flexible', 'and', 'considered', 'with', 'us', '.']
['I', 'would', 'like', 'to', 'meet', 'more', 'teachers', 'like', 'him', '.']
---------------------------------------
['he', 'is', 'a', 'great', 'teacher', 'that', 'cares', 'about', 'his', 'students', '']
['he', 'is', 'aware', 'that', 'we', 'have', 'a', 'lot', 'of', 'work', 'to', 'do', 'so', 'he', 'has', 'always', 'been', 'flexible', 'and', 'considered', 'with', 'us', '']
['i', 'would', 'like', 'to', 'meet', 'more', 'teachers', 'like', 'him', '']


In [7]:
#StopWords removal
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [8]:
# Remove stop words from the word tokens
filtered_word_tokens = [[word for word in sentence_tokens if word.lower() not in sw] for sentence_tokens in word_tokens]

# Print the filtered word tokens for each sentence
for sentence_tokens in filtered_word_tokens:
    print(sentence_tokens)

['great', 'teacher', 'cares', 'students', '.']
['aware', 'lot', 'work', 'always', 'flexible', 'considered', 'us', '.']
['would', 'like', 'meet', 'teachers', 'like', '.']


In [9]:
#Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed_word_tokens = [
    [stemmer.stem(word.lower()) for word in sentence_tokens if word.lower() not in sw]
    for sentence_tokens in word_tokens
]
stemmed_word_tokens

[['great', 'teacher', 'care', 'student', '.'],
 ['awar', 'lot', 'work', 'alway', 'flexibl', 'consid', 'us', '.'],
 ['would', 'like', 'meet', 'teacher', 'like', '.']]

In [10]:
#let's convert text into bag of words
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Convert the list of lists to a list of sentences
sentences = [' '.join(word_list) for word_list in stemmed_word_tokens]
# Print the resulting list of sentences
print(sentences)


['great teacher care student .', 'awar lot work alway flexibl consid us .', 'would like meet teacher like .']


In [11]:
#let's convert text into bag of words
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
# Initializing CountVectorizer
cv = CountVectorizer()
# Convert the processed text (stemmed corpus) into numerical features
vc = cv.fit_transform(sentences) # fit means creating a fixed vocabulary
# transform means converting text into numerical vector

print(vc.toarray()) # printing numerical matrix
print(cv.vocabulary_) # printing the vocabulary (dictionary of words and their corresponding indices)
# In our BoW, the word 'great' is placed at index 5 and it occurred only once in our example.

print(len(cv.vocabulary_)) # In our BoW, we have 14 words


[[0 0 1 0 0 1 0 0 0 1 1 0 0 0]
 [1 1 0 1 1 0 0 1 0 0 0 1 1 0]
 [0 0 0 0 0 0 2 0 1 0 1 0 0 1]]
{'great': 5, 'teacher': 10, 'care': 2, 'student': 9, 'awar': 1, 'lot': 7, 'work': 12, 'alway': 0, 'flexibl': 4, 'consid': 3, 'us': 11, 'would': 13, 'like': 6, 'meet': 8}
14


In [12]:
#Now let's make sentiment analysis with a help of lexicon based approach Vader dictionary
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Perform sentiment analysis on each sentence
for sentence in sentences:
    sentiment_scores = analyzer.polarity_scores(sentence)
    compound_score = sentiment_scores['compound']

    # Determine sentiment label based on the compound score
    if compound_score >= 0.05:
        sentiment_label = "1"
    elif compound_score <= -0.05:
        sentiment_label = "-1"
    else:
        sentiment_label = "0"

    # Print the sentence, sentiment scores, and sentiment label
    print("Sentence:", sentence)
    print("Sentiment Scores:", sentiment_scores)
    print("Sentiment Label:", sentiment_label)
    print("---------------------")

Sentence: great teacher care student .
Sentiment Scores: {'neg': 0.0, 'neu': 0.215, 'pos': 0.785, 'compound': 0.8074}
Sentiment Label: 1
---------------------
Sentence: awar lot work alway flexibl consid us .
Sentiment Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Sentiment Label: 0
---------------------
Sentence: would like meet teacher like .
Sentiment Scores: {'neg': 0.0, 'neu': 0.375, 'pos': 0.625, 'compound': 0.6124}
Sentiment Label: 1
---------------------


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Arailym\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
# Sentiment analysis with a help of Vader dictionary
import pandas as pd # pandas library -> efficiently store and manipulate CSV files.
dataset = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
print(dataset.head(10))

                                             Answers  Classification
0                                            Nothing             NaN
1                    Maybe less complicated the test             NaN
2                                            Nothing             NaN
3  In order to make it easier to follow the cours...             NaN
4  I really enjoyed the theoretical part of the l...             NaN
5  It would be interesting to meet stakeholders w...             NaN
6  I have no suggestions, I think this was the be...             NaN
7  She is a very nice teacher that wants her stud...             NaN
8  He is a great teacher that cares about his stu...             NaN
9  He is a very nice teacher. He made the subject...             NaN


In [14]:
# Sentiment analysis with a help of Vader dictionary
import pandas as pd # pandas library -> efficiently store and manipulate CSV files.
dataset = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
print(dataset.head(10))
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

# Preprocessing function 
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming using Porter Stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text
dataset = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
dataset['preprocessed_text'] = dataset['Answers'].apply(preprocess_text)

# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment label based on the compound score
def get_sentiment_label(compound_score):
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'
        
# Analyze the sentiment of each preprocessed text using VADER
dataset['vader_sentiment_score'] = dataset['preprocessed_text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
dataset['vader_sentiment_label'] = dataset['vader_sentiment_score'].apply(get_sentiment_label)

# Save the results into a new DataFrame
result_data = dataset[['Answers', 'vader_sentiment_label', 'vader_sentiment_score']]

# Convert the new DataFrame into a CSV file
result_data.to_csv('vader_dictionary.csv', index=False)
print("VADER predicted sentiments saved to 'vader_dictionary.csv'.")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Arailym\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


VADER predicted sentiments saved to 'vader_dictionary.csv'.


In [14]:
# Sentiment analysis with a help of Vader dictionary
import pandas as pd # pandas library -> efficiently store and manipulate CSV files.
dataset = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
print(dataset.head(10))
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

# Preprocessing function 
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming using Porter Stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text
dataset = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
dataset['preprocessed_text'] = dataset['Answers'].apply(preprocess_text)

# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment label based on the compound score
def get_sentiment_label(compound_score):
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'
        
# Analyze the sentiment of each preprocessed text using VADER
dataset['vader_sentiment_score'] = dataset['preprocessed_text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
dataset['vader_sentiment_label'] = dataset['vader_sentiment_score'].apply(get_sentiment_label)

# Save the results into a new DataFrame
result_data = dataset[['Answers', 'vader_sentiment_label', 'vader_sentiment_score']]

# Convert the new DataFrame into a CSV file
result_data.to_csv('vader_dictionary.csv', index=False)
print("VADER predicted sentiments saved to 'vader_dictionary.csv'.")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Arailym\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


VADER predicted sentiments saved to 'vader_dictionary.csv'.


In [63]:
# Sentiment analysis with a help of Naive Bayes algorithm
import pandas as pd # pandas library -> efficiently store and manipulate CSV files.
test_data = pd.read_csv('C:/Users/Arailym/Documents/Research paper/testing_data.csv')
train_data = pd.read_csv('C:/Users/Arailym/Documents/Research paper/train_data.csv')

print(train_data.head(10))
print(test_data.columns)

   Classification                                            Answers
0               0      Longer exams, the were not a lot of questions
1               1  Business communication was my favorite subject...
2              -1  Boring class, maybe du to the fact that it was...
3               0                                     no suggestions
4               1                                    everything good
5              -1                                      less homework
6              -1   Focus it either for piloting or for engineering.
7               0                                                ---
8               0                                     no suggestions
9              -1                                     more exercises
Index(['Classification', 'Answers'], dtype='object')


In [64]:
# Apply preprocessing to the 'Answers' column in the DataFrame
train_data['preprocessed_text'] = train_data['Answers'].apply(preprocess_text)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Fit and transform the preprocessed text to create the feature matrix
X_train_features = vectorizer.fit_transform(train_data['preprocessed_text'])
print("Feature Matrix:")
print(X_train_features.toarray())

Feature Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [65]:
from sklearn.naive_bayes import MultinomialNB
# Initialize the Naive Bayes classifier (Multinomial Naive Bayes)
nb_classifier = MultinomialNB()

# Train the classifier using the feature matrix and sentiment labels
nb_classifier.fit(X_train_features, train_data['Classification'])

# Print a message to indicate that the training is complete
print("Naive Bayes classifier trained successfully!")

Naive Bayes classifier trained successfully!


In [66]:
# Apply preprocessing to the 'Answer' column in the DataFrame
test_data['preprocessed_text'] = test_data['Answers'].apply(preprocess_text)

# Display the first few rows of the preprocessed testing data
print(test_data.head())
print(test_data.columns)

   Classification                                            Answers  \
0               0                                            nothing   
1               1  The teacher listened to Erasmus students and w...   
2               0  the only probelm sometimes was the connection ...   
3              -1  I would have appreciate the teacher to do powe...   
4              -1            it was hard to contact with the teacher   

                                   preprocessed_text  
0                                               noth  
1  teacher listen erasmu student welcom teacher i...  
2  probelm sometim connect internet 's someth som...  
3  would appreci teacher powerpoint present lectu...  
4                               hard contact teacher  
Index(['Classification', 'Answers', 'preprocessed_text'], dtype='object')


In [67]:
# Transform the preprocessed testing data using the same vectorizer
X_test_data_features = vectorizer.transform(test_data['preprocessed_text'])

# Display the feature matrix (sparse matrix representation) of the testing data
print("Feature Matrix for Testing Data:")
print(X_test_data_features.toarray())


Feature Matrix for Testing Data:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [68]:
# Use the trained classifier to make predictions on the testing data
predictions = nb_classifier.predict(X_test_data_features)

# Combine the predicted sentiment labels with the original testing data
test_data['predicted_sentiment'] = predictions

# Save the combined data into a new DataFrame
result_data = test_data[['Answers', 'predicted_sentiment']]

# Convert the new DataFrame into a CSV file
result_data.to_csv('naibe_bayes_rez.csv', index=False)

print("Predicted sentiments saved to 'naive_bayes_sentiment_prediction.csv'.")

Predicted sentiments saved to 'naive_bayes_sentiment_prediction.csv'.


In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
actual_labels = test_data['Classification']

# Load the predicted labels from the CSV file generated by the Naive Bayes classifier
predicted_data = pd.read_csv('C:/Users/Arailym/naibe_bayes_rez.csv')
predicted_labels = predicted_data['predicted_sentiment']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)


# Calculate precision and recall
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')

print("Precision:", precision)
print("Recall:", recall)


ValueError: Found input variables with inconsistent numbers of samples: [224, 44]

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

example = '''He is a great teacher that cares about his students. 
He is aware that we have a lot of work to do so he has always been flexible and considered with us. 
I would like to meet more teachers like him.'''

train_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
# Apply preprocessing to the 'Answers' column in the training DataFrame
train_data['preprocessed_text'] = train_data['Answers'].apply(preprocess_text)

preprocessed_example = preprocess_text(example) 
# Creating a DataFrame with the preprocessed example text ensures 
# that the input data format is compatible with the trained Naive Bayes classifier
example_data = pd.DataFrame({'Answers': [preprocessed_example]})

# Initialize the CountVectorizer and fit_transform the preprocessed text
vectorizer = CountVectorizer()
X_train_features = vectorizer.fit_transform(train_data['preprocessed_text'])

# Initialize the Naive Bayes classifier (Multinomial Naive Bayes)
nb_classifier = MultinomialNB()
# Train the classifier using the feature matrix and sentiment labels
nb_classifier.fit(X_train_features, train_data['Classification'])
# Transform the example data using the same vectorizer
X_example_features = vectorizer.transform(example_data['Answers'])

# Use the trained classifier to make predictions on the example data
example_prediction_prob = nb_classifier.predict_proba(X_example_features)
predicted_class_index = example_prediction_prob.argmax()  # Get the index of the predicted class
predicted_sentiment_label = nb_classifier.classes_[predicted_class_index]

# Get the confidence (probability) of the prediction
confidence = example_prediction_prob[0][predicted_class_index]

# Print the predicted sentiment and confidence for the example text
print("Predicted Sentiment for the text is :", predicted_sentiment_label)
print("Confidence:", confidence)



NameError: name 'preprocess_text' is not defined

In [None]:
pip install matplotlib

In [4]:
import string
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming using Porter Stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Load the testing data from CSV
test_data = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
# Load the labeled data from CSV
labeled_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')

# Preprocess the text in the testing data
test_data['preprocessed_text'] = test_data['Answers'].apply(preprocess_text)
# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Arailym\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Arailym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arailym\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:

# Function to get sentiment label based on the compound score and threshold
def get_sentiment_label(compound_score, threshold):
    if compound_score >= threshold:
        return 'Positive'
    elif compound_score <= -threshold:
        return 'Negative'
    else:
        return 'Neutral'

In [6]:

# Test thresholds from 0.01 to 0.1
thresholds = [i/100 for i in range(1, 11)]

In [7]:
# List to store the results
results = []

In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
actual_labels = test_data['Classification']

# Load the predicted labels from the CSV file generated by the Naive Bayes classifier
predicted_data = pd.read_csv('C:/Users/Arailym/nb_rez.csv')
predicted_labels = predicted_data['predicted_sentiment']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)

Accuracy: 0.8839285714285714


In [9]:
import pandas as pd

# Load the labeled data into a DataFrame (assuming it is already available)
monkeylearn = pd.read_csv('C:/Users/Arailym/Downloads/processed_batch.csv')

# Mapping dictionary for sentiment labels
sentiment_map = {'Positive': 1, 'Negative': -1, 'Neutral': 0}

# Convert the 'Classification' column to numerical values using the mapping
monkeylearn['sentiment_numeric'] = monkeylearn['Classification'].map(sentiment_map)

# Save the DataFrame with the converted sentiment labels to a CSV file
monkeylearn.to_csv('monkey.csv', index=False)

print("DataFrame with numerical sentiment labels saved to 'labeled_feedback_numeric.csv'.")


DataFrame with numerical sentiment labels saved to 'labeled_feedback_numeric.csv'.


In [10]:
# Sentiment analysis with a help of Vader dictionary
import pandas as pd # pandas library -> efficiently store and manipulate CSV files.
dataset = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
print(dataset.head(10))
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
nltk.download('vader_lexicon')

# Preprocessing function 
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming using Porter Stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text
dataset = pd.read_csv('C:/Users/Arailym/Downloads/exchange_students_feedback.csv')
dataset['preprocessed_text'] = dataset['Answers'].apply(preprocess_text)

# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment label based on the compound score
def get_sentiment_label(compound_score):
    if compound_score >= 0.05:
        return '1'
    elif compound_score <= -0.05:
        return '-1'
    else:
        return '0'
        
# Analyze the sentiment of each preprocessed text using VADER
dataset['vader_sentiment_score'] = dataset['preprocessed_text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
dataset['vader_sentiment_label'] = dataset['vader_sentiment_score'].apply(get_sentiment_label)

# Save the results into a new DataFrame
result_data = dataset[['Answers', 'vader_sentiment_label', 'vader_sentiment_score']]

# Convert the new DataFrame into a CSV file
result_data.to_csv('zero_treshold.csv', index=False)
print("VADER predicted sentiments saved to 'one_treshold.csv'.")


                                             Answers  Classification
0                                            Nothing             NaN
1                    Maybe less complicated the test             NaN
2                                            Nothing             NaN
3  In order to make it easier to follow the cours...             NaN
4  I really enjoyed the theoretical part of the l...             NaN
5  It would be interesting to meet stakeholders w...             NaN
6  I have no suggestions, I think this was the be...             NaN
7  She is a very nice teacher that wants her stud...             NaN
8  He is a great teacher that cares about his stu...             NaN
9  He is a very nice teacher. He made the subject...             NaN


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Arailym\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


VADER predicted sentiments saved to 'one_treshold.csv'.


In [20]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
actual_labels = test_data['Classification']

# Load the predicted labels from the CSV file generated by the Naive Bayes classifier
predicted_data = pd.read_csv('C:/Users/Arailym/one_treshold.csv')
predicted_labels = predicted_data['vader_sentiment_label']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)


# Calculate precision and recall
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6160714285714286
Precision: 0.737417924829275
Recall: 0.6160714285714286


In [21]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
actual_labels = test_data['Classification']

# Load the predicted labels from the CSV file generated by the Naive Bayes classifier
predicted_data = pd.read_csv('C:/Users/Arailym/two_treshold.csv')
predicted_labels = predicted_data['vader_sentiment_label']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)


# Calculate precision and recall
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6160714285714286
Precision: 0.737417924829275
Recall: 0.6160714285714286


In [22]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
actual_labels = test_data['Classification']

# Load the predicted labels from the CSV file generated by the Naive Bayes classifier
predicted_data = pd.read_csv('C:/Users/Arailym/three_treshold.csv')
predicted_labels = predicted_data['vader_sentiment_label']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)


# Calculate precision and recall
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6116071428571429
Precision: 0.7350653065460239
Recall: 0.6116071428571429


In [24]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
actual_labels = test_data['Classification']

# Load the predicted labels from the CSV file generated by the Naive Bayes classifier
predicted_data = pd.read_csv('C:/Users/Arailym/five_treshold.csv')
predicted_labels = predicted_data['vader_sentiment_label']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)


# Calculate precision and recall
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6116071428571429
Precision: 0.7350653065460239
Recall: 0.6116071428571429


In [29]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/Downloads/labeled_feedback.csv')
actual_labels = test_data['Classification']

# Load the predicted labels from the CSV file generated by the Naive Bayes classifier
predicted_data = pd.read_csv('C:/Users/Arailym/monkey.csv')
predicted_labels = predicted_data['sentiment_numeric']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)


# Calculate precision and recall
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.6383928571428571
Precision: 0.6526430320111916
Recall: 0.6383928571428571


In [32]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score

# Load the ground truth labels for the testing data
test_data = pd.read_csv('C:/Users/Arailym/naibe_bayes_rez.csv')
actual_labels = test_data['labeled_sentiment']

predicted_labels = test_data['predicted_sentiment']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print("Accuracy:", accuracy)


# Calculate precision and recall
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.7272727272727273
Precision: 0.738755980861244
Recall: 0.7272727272727273
