In [21]:
import numpy as np
import pandas as pd
import json
import gzip
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize


# 'C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Software_5.json'
#C:\Users\emovi\Desktop\VibeCaster\VibeCaster\data\Software_5.json(1).gz

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emovi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
## load the datset into DataFrame as described on the website: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/#subsets
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')
# small dataset, ca. 77 000 Entries
# df = getDF('C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Industrial_and_Scientific_5.json.gz')
# way bigger dataset ca. 500 000 Entries
df = getDF('C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Video_Games_5.json.gz')


In [23]:
 # summary statistics
print(df.shape) # 
print(df.info())
print(df.describe())

(497577, 12)
<class 'pandas.core.frame.DataFrame'>
Index: 497577 entries, 0 to 497576
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         497577 non-null  float64
 1   verified        497577 non-null  bool   
 2   reviewTime      497577 non-null  object 
 3   reviewerID      497577 non-null  object 
 4   asin            497577 non-null  object 
 5   reviewerName    497501 non-null  object 
 6   reviewText      497419 non-null  object 
 7   summary         497468 non-null  object 
 8   unixReviewTime  497577 non-null  int64  
 9   vote            107793 non-null  object 
 10  style           289237 non-null  object 
 11  image           3634 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 46.0+ MB
None
             overall  unixReviewTime
count  497577.000000    4.975770e+05
mean        4.220456    1.367848e+09
std         1.185424    1.224113e+08
min         

In [24]:
print(df.isna().sum())

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
reviewerName          76
reviewText           158
summary              109
unixReviewTime         0
vote              389784
style             208340
image             493943
dtype: int64


In [25]:
# delete entrys with missing reviewText: 
df.dropna(subset=['reviewText'], inplace=True)

In [26]:
# only keep overall and reviewText
all_columns = df.columns.tolist()

columns_to_keep = ['overall', 'reviewText']

columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

df.drop(columns=columns_to_drop, inplace=True)

In [27]:
print(df.describe())

             overall
count  497419.000000
mean        4.220297
std         1.185491
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000


In [28]:
# create new col sentiment to train the model on

def classify_sentiment(overall_score):
    if 1 <= overall_score <= 2:
        return 1  # Negative
    elif overall_score == 3:
        return 0  # Neutral
    elif 4 <= overall_score <= 5:
        return 2  # Positive

# Apply the function to the 'overall' column to create the 'sentiment' column
df['sentiment'] = df['overall'].apply(classify_sentiment)

In [29]:
print(df.head())

# Count the number of each sentiment
print(df['sentiment'].value_counts())


   overall                                         reviewText  sentiment
0      5.0  This game is a bit hard to get the hang of, bu...          2
1      4.0  I played it a while but it was alright. The st...          2
2      3.0                                           ok game.          0
3      2.0  found the game a bit too complicated, not what...          1
4      5.0  great game, I love it and have played it since...          2
sentiment
2    393267
1     55012
0     49140
Name: count, dtype: int64


In [30]:
## dataset is heavily skewed on the positive side of things so might need to use cross-validation to account for it
# we only need reviewText and Sentiment to start training our model so lets drop the 'overall' column
df.drop(columns="overall", inplace=True)


In [31]:
# Specific Preprocessing Steps:

#     Text Cleaning:
#         Why: Raw text often contains punctuations, numbers, and special characters that don't contribute much to the sentiment.
#         How: Regular expressions or string manipulation techniques can be used for this.

#     Tokenization:
#         Why: Tokenization helps to break down the text into smaller pieces, often into words, which makes it easier for the algorithm to identify patterns.
#         How: Libraries like NLTK and spaCy provide tokenization methods.

#     Stemming/Lemmatization:
#         Why: Different forms of a word often convey the same sentiment (e.g., 'running' and 'ran'). Stemming and Lemmatization convert words to their base or root form.
#         How: Again, NLTK and spaCy have methods for these.

#     Removal of Stop Words:
#         Why: Commonly occurring words (like 'and', 'the', 'is') generally don't contribute to the sentiment and can be removed.
#         How: Predefined lists of stop words are available in NLTK and spaCy.

#     Feature Extraction:
#         Why: Machine learning algorithms require numerical input, and the text needs to be converted into a format like Bag-of-Words or TF-IDF that can be fed into these algorithms.
#         How: Scikit-learn provides CountVectorizer for Bag-of-Words and TfidfVectorizer for TF-IDF.

#     Handling Class Imbalance:
#         Why: Given that your dataset is imbalanced, using techniques to either oversample the minority class or undersample the majority class can make the model more fair.
#         How: Libraries like imblearn provide methods like SMOTE for oversampling.



#### Data Preprocessing####
## Step1: Text Cleaning ## 
# Remove punctuations
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub('[^\w\s]', '', x))

## Step 2: Tokenization ##
df['tokenized_reviewText'] = df['reviewText'].apply(word_tokenize)

In [32]:
# ## Step3: Lemmatization ## 
# from nltk.corpus import wordnet
# nltk.download('averaged_perceptron_tagger')


# def get_wordnet_pos(tag):
#     """Map POS tag to first character used by WordNetLemmatizer"""
#     tag = tag[0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)

# # Function to apply lemmatization to a list of words with POS tagging
# def lemmatize_with_pos(words):
#     pos_tagged = nltk.pos_tag(words)
#     return [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tagged]

# # Apply POS tagging and lemmatization
# df['lemmatized_reviewText'] = df['tokenized_reviewText'].apply(lemmatize_with_pos)

## Step3: Lemmatization ##
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer  # Import the WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map POS tag to first character used by WordNetLemmatizer"""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# Function to apply lemmatization to a list of words with POS tagging
def lemmatize_with_pos(words):
    pos_tagged = nltk.pos_tag(words)
    return [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tagged]

# Apply POS tagging and lemmatization
df['lemmatized_reviewText'] = df['tokenized_reviewText'].apply(lemmatize_with_pos)

## use concurrency to speed up lemmatization ## 
# from concurrent.futures import ProcessPoolExecutor
# import numpy as np

# # Initialize the WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()

# def get_wordnet_pos(tag):
#     """Map POS tag to first character used by WordNetLemmatizer"""
#     tag = tag[0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)

# def lemmatize_with_pos(words):
#     pos_tagged = nltk.pos_tag(words)
#     return [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tagged]

# def parallel_lemmatization(df_split):
#     return df_split['tokenized_reviewText'].apply(lemmatize_with_pos)

# # Split the DataFrame into smaller parts for parallel processing
# num_splits = 8  # Number of splits
# df_splits = np.array_split(df, num_splits)

# # Use ProcessPoolExecutor to parallelize the function
# with ProcessPoolExecutor() as executor:
#     df_splits_lemmatized = list(executor.map(parallel_lemmatization, df_splits))

# # Concatenate the splits back into a single DataFrame
# df_lemmatized = pd.concat(df_splits_lemmatized)
# df['lemmatized_reviewText'] = df_lemmatized



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\emovi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


KeyboardInterrupt: 

In [None]:
## Step4: Removal of Stop Words to reduce dimensionality and complexity ##
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(words):
    return [word for word in words if word.lower() not in stop_words]

df['no_stopwords_reviewText'] = df['lemmatized_reviewText'].apply(remove_stopwords)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emovi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# cleaning up the DataFrame 
df = df[['no_stopwords_reviewText', 'sentiment']]
df.rename(columns={'no_stopwords_reviewText': 'reviewText'}, inplace=True)

In [None]:
## Step5: Feature Extraction ##
# Using Term Frequency-Inverse Document Frequency (TF-IDF) to get numerical representation of the text data# 
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Fit and transform the data
df['reviewText'] = df['reviewText'].apply(' '.join)
X = vectorizer.fit_transform(df['reviewText'])
# Now, X is a sparse matrix containing the TF-IDF features

import joblib
# Save the TF-IDF vectorizer to disk
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
## Step6: Handling Class Imbalance##
# Using Synthetic Minority Over-sampling Technique (SMOTE) to generate synthetic data of minority class instances (neutral and negative sentiments) to mitigate class imbalance
from imblearn.over_sampling import SMOTE

smote = SMOTE()

# Fit on data
X_resampled, y_resampled = smote.fit_resample(X, df['sentiment'])


In [None]:
## Choosing model and trying it out ##

# First lets try using Naive Bayes # 
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# # Splitting the dataset into Training set and Test set
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# # Initialize the Multinomial Naive Bayes classifier
# nb_classifier = MultinomialNB()

# # Fit the model
# nb_classifier.fit(X_train, y_train)

# # Predicting the Test set results
# y_pred = nb_classifier.predict(X_test)

# # Calculating various metrics
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')

# print(f'Accuracy: {accuracy}')
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')
# print(f'F1 Score: {f1}')



In [None]:
# # Hyperparam tuning: 
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]}

# # Initialize a Multinomial Naive Bayes classifier
# nb_classifier = MultinomialNB()

# # Initialize the GridSearchCV
# grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, 
#                            cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# # Fit data to GridSearch
# grid_search.fit(X_train, y_train)

# # Getting the best parameters
# best_parameters = grid_search.best_params_
# best_score = grid_search.best_score_

# print(f'Best Parameters: {best_parameters}')
# print(f'Best Score: {best_score}')

In [None]:
# # retraining the model with found hyperparam: 
# # Initialize the Multinomial Naive Bayes classifier with the best parameter
# nb_classifier = MultinomialNB(alpha=0.001)

# nb_classifier.fit(X_train, y_train)

# # Predict and evaluate
# y_pred = nb_classifier.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')

# print(f'Accuracy: {accuracy}')
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')
# print(f'F1 Score: {f1}')

In [None]:
# Performance gain of approximately 5% after Fine-Tuning

## Now lets try using Logistic regression to see if performance will increase (keeping in mind that overfitting is a real possibility)  ##

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Initialize the Logistic Regression model
log_reg = LogisticRegression(solver='saga', max_iter=10000) # saga = Stochastic Average Gradient Descent Algorithm 

# Fit the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions on the test data
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')



Accuracy: 0.7865367581930912
Precision: 0.7866504314855324
Recall: 0.7865367581930912
F1 Score: 0.7864313465960591


In [None]:
## Hyperparameter Finetuning to (maybe) get better results ##

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit data to GridSearch
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_parameters}')
print(f'Best Score: {best_score}')

# Retrain the model with best parameters
log_reg_best = LogisticRegression(C=best_parameters['C'], penalty=best_parameters['penalty'], solver='saga')
log_reg_best.fit(X_train, y_train)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
# C of 100 means High variance, low bias
# Make predictions on the test data
y_pred = log_reg_best.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9108037437990274
Precision: 0.9123433504215814
Recall: 0.9108037437990274
F1 Score: 0.9101739200299686


In [None]:
## Performance gain of ≈ 7% after Fine Tuning
# export the trained model to use in web interface
import joblib

# Save the fine-tuned model to disk
joblib.dump(log_reg_best, 'fine_tuned_sentiment_model.pkl')

['fine_tuned_sentiment_model.pkl']