In [67]:
import numpy as np
import pandas as pd
import json
import gzip
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize


# 'C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Software_5.json'
#C:\Users\emovi\Desktop\VibeCaster\VibeCaster\data\Software_5.json(1).gz

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emovi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [68]:
## Load the dataset into a DataFrame as described on the website: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/#subsets
def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip-compressed file.

  Args:
    path: Location of a gzip file that holds one JSON document per line.

  Yields:
    The Python object produced by ``json.loads`` for each line.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the gzip handle once the generator is exhausted,
  # closed, or garbage-collected; previously the handle was never closed.
  with gzip.open(path, 'rb') as compressed:
    for raw_line in compressed:
      # Tolerate blank lines (e.g. a trailing newline at the end of the file)
      # instead of failing with a JSONDecodeError.
      if not raw_line.strip():
        continue
      yield json.loads(raw_line)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file, and that position
  becomes the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the review file into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs as-is on a machine with the same directory layout.
df = getDF('C:\\Users\\emovi\\Desktop\\VibeCaster\\VibeCaster\\data\\Industrial_and_Scientific_5.json.gz')

In [69]:
 # summary statistics
# Shape as (rows, columns)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
df.info()
# Descriptive statistics of the numeric columns
print(df.describe())

(77071, 12)
<class 'pandas.core.frame.DataFrame'>
Index: 77071 entries, 0 to 77070
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         77071 non-null  float64
 1   verified        77071 non-null  bool   
 2   reviewTime      77071 non-null  object 
 3   reviewerID      77071 non-null  object 
 4   asin            77071 non-null  object 
 5   style           36037 non-null  object 
 6   reviewerName    77044 non-null  object 
 7   reviewText      77060 non-null  object 
 8   summary         77061 non-null  object 
 9   unixReviewTime  77071 non-null  int64  
 10  vote            9620 non-null   object 
 11  image           1719 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 7.1+ MB
None
            overall  unixReviewTime
count  77071.000000    7.707100e+04
mean       4.524062    1.454857e+09
std        0.949668    4.559407e+07
min        1.000000    1.051402e+0

In [70]:
# Number of missing (NaN/None) values in each column
print(df.isna().sum())

overall               0
verified              0
reviewTime            0
reviewerID            0
asin                  0
style             41034
reviewerName         27
reviewText           11
summary              10
unixReviewTime        0
vote              67451
image             75352
dtype: int64


In [71]:
# Delete entries with a missing reviewText; they cannot be used for text-based training.
# The rows are removed from `df` in place.
df.dropna(subset=['reviewText'], inplace=True)

In [72]:
# Keep only the star rating ('overall') and the review body ('reviewText');
# every other column is dropped from `df` in place.
columns_to_keep = ['overall', 'reviewText']

all_columns = list(df.columns)
columns_to_drop = []
for column_name in all_columns:
    if column_name not in columns_to_keep:
        columns_to_drop.append(column_name)

df.drop(columns=columns_to_drop, inplace=True)

In [74]:
# Descriptive statistics of the remaining numeric column(s) after the clean-up above
print(df.describe())

            overall
count  77060.000000
mean       4.524046
std        0.949636
min        1.000000
25%        4.000000
50%        5.000000
75%        5.000000
max        5.000000


In [75]:
# create new col sentiment to train the model on

def classify_sentiment(overall_score):
    """Map a star rating to an integer sentiment label.

    Returns:
        1 (negative) for ratings in [1, 2],
        0 (neutral)  for a rating of exactly 3,
        2 (positive) for ratings in [4, 5],
        None for anything else (e.g. 2.5, 0, 6 or NaN).
    """
    if overall_score == 3:
        return 0  # Neutral
    if 1 <= overall_score <= 2:
        return 1  # Negative
    if 4 <= overall_score <= 5:
        return 2  # Positive
    # No range matched: keep the implicit "no label" result explicit.
    return None

# Apply the function to the 'overall' column to create the 'sentiment' column
# (one label per review: 1 = negative, 0 = neutral, 2 = positive).
df['sentiment'] = df['overall'].apply(classify_sentiment)

In [76]:
# Preview the first rows to check the new 'sentiment' column
print(df.head())

# Count the number of reviews in each sentiment class
print(df['sentiment'].value_counts())


   overall                                         reviewText  sentiment
0      5.0  This worked really well for what I used it for...          2
1      5.0                   Fast cutting and good adheasive.          2
2      5.0  Worked great for my lapping bench.  I would li...          2
3      4.0                                      As advertised          2
4      5.0  seems like a pretty good value as opposed to b...          2
sentiment
2    68201
0     4442
1     4417
Name: count, dtype: int64


In [77]:
## The dataset is heavily skewed toward positive reviews (see the class counts above);
## cross-validation alone does not correct this, so the imbalance is addressed with SMOTE oversampling in Step 6
# Only reviewText and sentiment are needed to start training the model, so drop the 'overall' column
df.drop(columns="overall", inplace=True)


In [78]:
# Specific Preprocessing Steps:

#     Text Cleaning:
#         Why: Raw text often contains punctuation, numbers, and special characters that don't contribute much to the sentiment.
#         How: Regular expressions or string manipulation techniques can be used for this.

#     Tokenization:
#         Why: Tokenization helps to break down the text into smaller pieces, often into words, which makes it easier for the algorithm to identify patterns.
#         How: Libraries like NLTK and spaCy provide tokenization methods.

#     Stemming/Lemmatization:
#         Why: Different forms of a word often convey the same sentiment (e.g., 'running' and 'ran'). Stemming and Lemmatization convert words to their base or root form.
#         How: Again, NLTK and spaCy have methods for these.

#     Removal of Stop Words:
#         Why: Commonly occurring words (like 'and', 'the', 'is') generally don't contribute to the sentiment and can be removed.
#         How: Predefined lists of stop words are available in NLTK and spaCy.

#     Feature Extraction:
#         Why: Machine learning algorithms require numerical input, and the text needs to be converted into a format like Bag-of-Words or TF-IDF that can be fed into these algorithms.
#         How: Scikit-learn provides CountVectorizer for Bag-of-Words and TfidfVectorizer for TF-IDF.

#     Handling Class Imbalance:
#         Why: Given that this dataset is imbalanced, oversampling the minority classes or undersampling the majority class can make the model fairer.
#         How: Libraries like imblearn provide methods like SMOTE for oversampling.



#### Data Preprocessing ####
## Step 1: Text Cleaning ##
# Remove every character that is neither a word character (letter, digit,
# underscore) nor whitespace, i.e. punctuation and symbols.
# The pattern is a raw string so that `\w` and `\s` reach the regex engine
# unchanged; in a normal string literal they are invalid escape sequences,
# which recent Python versions report with a warning.
df['reviewText'] = df['reviewText'].apply(lambda text: re.sub(r'[^\w\s]', '', text))

## Step 2: Tokenization ##
# Split each cleaned review into a list of tokens with NLTK's word tokenizer.
df['tokenized_reviewText'] = df['reviewText'].apply(word_tokenize)

In [80]:
## Step3: Lemmatization ## 
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')


def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet part-of-speech constant.

    Only the first character of `tag` (upper-cased) is inspected:
    "J" -> adjective, "V" -> verb, "R" -> adverb; "N" and every other
    value fall back to noun.
    """
    first_letter = tag[0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag are treated as nouns.
    return wordnet.NOUN

# Lemmatizer used by lemmatize_with_pos. It is created here because no visible
# cell of this notebook defines `lemmatizer`, so running the notebook on a fresh
# kernel would otherwise stop with a NameError.
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # corpus the WordNet lemmatizer looks words up in
lemmatizer = WordNetLemmatizer()

# Function to apply lemmatization to a list of words with POS tagging
def lemmatize_with_pos(words):
    """Lemmatize a list of tokens using their part-of-speech tags.

    Args:
        words: List of tokens belonging to one review.

    Returns:
        A list of the same length holding the lemma of every token. Each token
        is tagged with `nltk.pos_tag`, and the tag is mapped to a WordNet
        part of speech by `get_wordnet_pos` before lemmatizing.
    """
    pos_tagged = nltk.pos_tag(words)
    return [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tagged]

# Apply POS tagging and lemmatization to every tokenized review;
# the result is stored as a list of lemmas per row in a new column.
df['lemmatized_reviewText'] = df['tokenized_reviewText'].apply(lemmatize_with_pos)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\emovi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [82]:
## Step4: Removal of Stop Words to reduce dimensionality and complexity ##
nltk.download('stopwords')
from nltk.corpus import stopwords

# Keep the English stop-word list in a set so the membership test in
# remove_stopwords is a hash lookup rather than a list scan.
# NOTE(review): punctuation was stripped in Step 1, so stop words that contain an
# apostrophe presumably no longer match their tokens, and negations such as "not"
# (if present in the list) carry sentiment — confirm this is intended.
stop_words = set(stopwords.words('english'))

def remove_stopwords(words):
    """Return the tokens of `words` whose lower-cased form is not in `stop_words`.

    The original casing and order of the surviving tokens are preserved.
    """
    kept_tokens = []
    for token in words:
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Filter the stop words out of every lemmatized review and store the result in a new column
df['no_stopwords_reviewText'] = df['lemmatized_reviewText'].apply(remove_stopwords)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emovi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [88]:
# Cleaning up the DataFrame: keep only the fully preprocessed tokens and the label,
# and expose the tokens under the name 'reviewText'.
# Selecting and renaming in one chained, non-in-place expression gives `df` a
# fresh frame. Renaming in place on the result of a column selection operates on
# a slice copy and makes later column assignments emit SettingWithCopyWarning.
df = df[['no_stopwords_reviewText', 'sentiment']].rename(
    columns={'no_stopwords_reviewText': 'reviewText'}
)

In [94]:
## Step5: Feature Extraction ##
# Using Term Frequency-Inverse Document Frequency (TF-IDF) to get a numerical representation of the text data
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# The vectorizer is fed one string per review, so join each token list back into
# a space-separated string. Values that are already strings are kept unchanged:
# a str is itself iterable, so re-joining it when this cell is run a second time
# would insert a space between every character and corrupt the text.
# `assign` returns a new frame, so this does not write into a slice of another frame.
df = df.assign(
    reviewText=df['reviewText'].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )
)

# Fit and transform the data
X = vectorizer.fit_transform(df['reviewText'])
# Now, X is a sparse matrix containing the TF-IDF features




In [95]:
## Step6: Handling Class Imbalance ##
# Using Synthetic Minority Over-sampling Technique (SMOTE) to generate synthetic data of the minority classes (neutral and negative sentiments)
from imblearn.over_sampling import SMOTE

# SMOTE draws its synthetic samples at random; a fixed seed makes the resampled
# data set identical on every run of the notebook.
SMOTE_RANDOM_STATE = 42
smote = SMOTE(random_state=SMOTE_RANDOM_STATE)

# Fit on data
# NOTE(review): this oversamples the complete data set. If a test set is split off
# afterwards, synthetic samples derived from test rows leak into training —
# consider splitting first and resampling only the training portion.
X_resampled, y_resampled = smote.fit_resample(X, df['sentiment'])
