Positive >= 0.05

Neutral between -0.05 and 0.05

Negative <= -0.05

In [16]:
import pandas as pd
reviews_df = pd.read_csv('./data/reviews.csv')

In [17]:
#pandas and numpy for df manipulation
import pandas as pd
import numpy as np
import re
import nltk
import statistics

#Preprocessing: tokenization and lemmatization
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
sent_tokenizer = PunktSentenceTokenizer()

#Sentiment Analysis with VADER
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abdar\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
def sentiment_preprocessor(raw_text, lowercase=True, leave_punctuation = False, lemmatization=True, tokenized_output=True, sentence_output=True):
    

    # Convert to lowercase if specified
    if lowercase:
        clean_text = raw_text.lower()
    else:
        clean_text = raw_text
    
    # Remove newline characters
    clean_text = re.sub(r'(\*|\\n|\\r|\\t|</?ul>|</?li>)', ' ', clean_text)

    # Remove punctuation if specified
    if not leave_punctuation:
        clean_text = re.sub(r'(\W)', ' ', clean_text)

    # Remove URLs
    clean_text = re.sub(r'(http\S+|www\S+)', ' ', clean_text)

    # Remove isolated consonants
    clean_text = re.sub(r'\b([^aeiou\s])\b', ' ', clean_text)

    # Tokenize
    clean_text = word_tokenize(clean_text)

    # Lemmatize if specified
    if lemmatization:
        clean_text = [lemmatizer.lemmatize(token, pos='v') for token in clean_text]

    # Re-join if tokenized output is not requested
    if not tokenized_output:
        clean_text = " ".join(clean_text)
        # Remove space before punctuation
        clean_text = re.sub(r'(\s)(?!\w)', '', clean_text)

    # Join sentences into a single string if specified
    if sentence_output and not tokenized_output:
        clean_text = " ".join(sent_tokenize(clean_text))

    return clean_text

In [19]:
reviews_df['CleanReview'] = reviews_df['Review'].apply(lambda review: sentiment_preprocessor(
    review, lowercase = False, 
    leave_punctuation = True, 
    lemmatization=False, 
    tokenized_output=False))

In [20]:
vader = SentimentIntensityAnalyzer()

In [21]:
# Analyse polarity and add results to dataframe
reviews_df['Vader'] = reviews_df['CleanReview'].apply(lambda x: vader.polarity_scores(x))
reviews_df['Negative_vader'] = reviews_df['Vader'].apply(lambda x: x['neg'])
reviews_df['Neutral_vader'] = reviews_df['Vader'].apply(lambda x: x['neu'])
reviews_df['Positive_vader'] = reviews_df['Vader'].apply(lambda x: x['pos'])
reviews_df['Compound_vader'] = reviews_df['Vader'].apply(lambda x: x['compound'])

In [22]:
# Drop column with polarity scores
reviews_df.drop('Vader', axis=1, inplace=True)

In [23]:
# Name of the columns related with vader
vader_cols = ['Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader']

In [24]:
reviews_df[vader_cols].describe()

Unnamed: 0,Negative_vader,Neutral_vader,Positive_vader,Compound_vader
count,587.0,587.0,587.0,587.0
mean,0.008245,0.859063,0.131,0.469345
std,0.01445,0.152677,0.141421,0.481055
min,0.0,0.0,0.0,-0.5267
25%,0.0,0.719,0.0,0.0
50%,0.0,0.947,0.0,0.0
75%,0.018,1.0,0.2605,0.96875
max,0.094,1.0,0.677,0.9968


In [25]:
# Classify sentiment based on compound score
reviews_df['Sentiment'] = reviews_df['Compound_vader'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
# Split the data into training and testing sets (you may have a different dataset)
train_data, test_data, train_labels, test_labels = train_test_split(
    reviews_df['CleanReview'], reviews_df['Sentiment'], test_size=0.2, random_state=42
)


In [30]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.sentiment import SentimentIntensityAnalyzer

# Sample data
data = {'Review': ["I love this product! It's amazing.", "Terrible experience, wouldn't recommend.",
                   "Decent product, not bad.", "Average at best.", "Fantastic service! Highly recommend."],
        'Rating': [5, 1, 3, 2, 5]}

df = reviews_df.copy()

# Sentiment analysis using nltk's SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
df['Sentiment'] = df['CleanReview'].apply(lambda x: 'positive' if sia.polarity_scores(x)['compound'] > 0 else 'negative' if sia.polarity_scores(x)['compound'] < 0 else 'neutral')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['CleanReview'], df['Rating'], test_size=0.2, random_state=42)

# Convert text data to numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

# Predict ratings on the test set
y_pred = classifier.predict(X_test_vectorized)

# Compare sentiment analysis with actual ratings
df_test = pd.DataFrame({'Review': X_test, 'Actual_Rating': y_test, 'Predicted_Rating': y_pred})
df_test['Sentiment'] = df_test['CleanReview'].apply(lambda x: 'positive' if sia.polarity_scores(x)['compound'] > 0 else 'negative' if sia.polarity_scores(x)['compound'] < 0 else 'neutral')

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nComparison of Sentiment Analysis and Actual Ratings on Test Set:\n", df_test[['CleanReview', 'Actual_Rating', 'Sentiment', 'Predicted_Rating']])


ValueError: Unknown label type: (array([5. , 3.4, 3.4, 3.8, 3.7, 4.8, 4.8, 4.4, 3.1, 3.9, 4.9, 4.3, 3.7,
       4.5, 4.5, 4.7, 4.3, 3.4, 3.8, 4.4, 3.6, 4.6, 3.5, 4.8, 3.1, 3.5,
       2.4, 3. , 4.9, 4.8, 3.5, 2.9, 3.3, 5. , 3.2, 2.7, 2.2, 4.2, 4.7,
       3.1, 4.2, 4.5, 4.9, 3.5, 3.9, 3.8, 5. , 3.1, 2.7, 3.2, 4.7, 2.4,
       3.4, 4.7, 4.8, 1.3, 4.4, 3.9, 2.2, 2.2, 3.8, 3.6, 4.8, 2.7, 3.4,
       4.1, 4.2, 4.7, 4.9, 4.6, 1.1, 3.2, 2.9, 3.9, 3.4, 4.3, 4.5, 3.6,
       4. , 4.5, 3.7, 3.3, 3.6, 4.1, 2.7, 4. , 3.2, 4.3, 4.7, 3.7, 3.9,
       3.6, 4.8, 3.2, 3.5, 4.2, 4.4, 2.6, 2.7, 2.9, 4.9, 4.7, 3.9, 5. ,
       4.5, 4.9, 4.7, 4.8, 4.4, 1.4, 4.3, 3.6, 3.9, 3.2, 2.5, 4.6, 3.8,
       4.1, 3.6, 3.4, 4.9, 4.2, 4.3, 4. , 4.4, 3. , 3. , 3.1, 4.4, 4.2,
       4.6, 4.4, 4.5, 4.9, 4.6, 3.1, 3.4, 4.4, 4.5, 2.3, 3.3, 5. , 3.8,
       4.3, 4.5, 4.1, 4.6, 4.6, 4.1, 3.3, 3.3, 3.9, 3.9, 0.1, 3.2, 3.6,
       3.4, 4.2, 4.4, 4.9, 2.7, 3.8, 3.5, 4.9, 3.5, 4.9, 4.7, 4.5, 4.2,
       2.4, 3.9, 3.4, 1.8, 4.5, 4.9, 4.5, 3.9, 5. , 3.8, 4.6, 3.1, 4.8,
       4.1, 3.8, 3.2, 3.9, 3.4, 4.7, 5. , 4.7, 4.3, 3.3, 4.5, 4.6, 4.2,
       2.6, 4.5, 2.2, 3.2, 4.2, 3.5, 4.3, 3.2, 4.2, 3.1, 3.4, 3.7, 4.2,
       3.5, 3.9, 4.8, 4.7, 5. , 3.9, 2.7, 4.7, 3.9, 4.5, 3.8, 3.2, 4.9,
       3.4, 4.4, 2.5, 4.5, 2.7, 4.5, 3.5, 4.3, 3.8, 4.4, 3.8, 4.6, 3.5,
       4.9, 1.9, 4.5, 4.1, 4.5, 3.4, 3.4, 3.2, 4.3, 4.5, 1.1, 5. , 4.3,
       4.1, 2.5, 2.2, 3.5, 4.9, 4.8, 3.7, 4.3, 3.4, 4.2, 4.7, 3.8, 2.8,
       3.2, 4.2, 3. , 4.3, 3.5, 2.8, 3.5, 4.5, 4.9, 4.6, 4.9, 3.5, 4.4,
       3.2, 3.1, 3.8, 4.4, 4.3, 4.3, 2.2, 3.9, 4.6, 4.8, 4.8, 4.4, 3.5,
       2.4, 3.1, 4.5, 3.4, 4.3, 3.4, 3.3, 4.7, 4.1, 2.5, 3.9, 4. , 5. ,
       4.3, 3.5, 3.4, 4.9, 4.6, 2.3, 3.2, 3.9, 2.3, 3.6, 4.7, 4.7, 3.4,
       3.9, 4.4, 0.4, 4. , 3.9, 4.4, 4.8, 4.8, 3.4, 4.4, 4.7, 3.5, 3.2,
       4.6, 2.6, 4.5, 4.7, 3.8, 2.7, 3.5, 4.2, 4.7, 4.7, 4.5, 4.3, 2.4,
       4.2, 4.6, 3.4, 4.3, 4.2, 4.8, 3.8, 4.6, 4.4, 3.5, 4.6, 4.4, 4.9,
       2.4, 3.9, 3.6, 3.1, 5. , 4.5, 2.2, 3.4, 0.8, 3.2, 3.7, 4.5, 4.4,
       4.6, 4.8, 2.6, 3.6, 2.2, 4.9, 2.8, 4.2, 3.8, 3.9, 3.9, 4.8, 2.5,
       4.9, 3.4, 2.3, 4.2, 4.5, 4. , 4.4, 4.5, 3.8, 4.9, 3.2, 3.9, 4.3,
       2.9, 4.8, 4.5, 3.2, 3.9, 1.6, 2.9, 3.4, 4.4, 2.8, 4.2, 4.2, 4.3,
       4.4, 5. , 4.5, 4.9, 3.3, 4.4, 4.7, 4.6, 3.7, 4.8, 4.3, 4.2, 1.9,
       4.9, 3.4, 4.3, 3.4, 3.3, 3.9, 4.8, 4.6, 4.3, 4.1, 4.7, 3.6, 4.2,
       3.6, 3.2, 3.2, 4.5, 4.2, 4.4, 3.5, 2.4, 3.9, 3.4, 3.8, 4.4, 3.8,
       3.5, 4.9, 2.7, 4. , 0.8, 3.8, 3.3, 4.1, 4.5, 3.2, 3.1, 3.6, 2.2,
       5. , 3.8, 4.1, 4.5, 4.9, 3.9, 3.5, 3.8, 2.4, 2.6, 3.8, 5. , 3.8,
       3.6]),)