In [60]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [61]:
data = pd.read_json("Data.json", lines=True)

In [62]:
print("Missing Values:")
print(data.isnull().sum())

Missing Values:
reviewerID         0
asin               0
reviewerName      27
helpful            0
reviewText         0
overall            0
summary            0
unixReviewTime     0
reviewTime         0
dtype: int64


In [63]:
data['reviewerName'].fillna("Unknown", inplace=True)

print("Missing Values After Imputation:")
print(data.isnull().sum())

Missing Values After Imputation:
reviewerID        0
asin              0
reviewerName      0
helpful           0
reviewText        0
overall           0
summary           0
unixReviewTime    0
reviewTime        0
dtype: int64


In [64]:
data['reviewText'] = data['reviewText'].fillna('Missing')

In [65]:
data.isnull().sum()

reviewerID        0
asin              0
reviewerName      0
helpful           0
reviewText        0
overall           0
summary           0
unixReviewTime    0
reviewTime        0
dtype: int64

In [66]:
print('The distribution of categories of sentiment:') 
data['overall'].value_counts()

The distribution of categories of sentiment:


overall
5    6938
4    2084
3     772
2     250
1     217
Name: count, dtype: int64

In [67]:
data['reviews'] = data['reviewText'] + data['summary']
data = data.drop(['reviewText', 'summary'], axis=1)
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,reviewTime,reviews
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",5,1393545600,"02 28, 2014","Not much to write about here, but it does exac..."
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",5,1363392000,"03 16, 2013",The product does exactly as it should and is q...
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",5,1377648000,"08 28, 2013",The primary job of this device is to block the...
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",5,1392336000,"02 14, 2014",Nice windscreen protects my MXL mic and preven...
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",5,1392940800,"02 21, 2014",This pop filter is great. It looks and perform...


In [68]:
def sent(rating):
    if rating['overall'] == 3:
        value = 'Neutral'
    elif rating['overall'] > 3:
        value = 'Positive'
    else:
        value = 'Negative'
    return value

In [69]:
data['Sentiment'] = data.apply(sent, axis=1)
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,reviewTime,reviews,Sentiment
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",5,1393545600,"02 28, 2014","Not much to write about here, but it does exac...",Positive
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",5,1363392000,"03 16, 2013",The product does exactly as it should and is q...,Positive
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",5,1377648000,"08 28, 2013",The primary job of this device is to block the...,Positive
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",5,1392336000,"02 14, 2014",Nice windscreen protects my MXL mic and preven...,Positive
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",5,1392940800,"02 21, 2014",This pop filter is great. It looks and perform...,Positive


In [70]:
print('The count of sentiments:')
data['Sentiment'].value_counts()

The count of sentiments:


Sentiment
Positive    9022
Neutral      772
Negative     467
Name: count, dtype: int64

In [71]:
# Splitting the date 
re_new = data["reviewTime"].str.split(",", n = 1, expand = True) 
  
# adding month to the main dataset 
data["date"] = re_new[0] 
  
# adding day to the main dataset 
data["year"] = re_new[1] 

re_new1 = data["date"].str.split(" ", n = 1, expand = True) 

# adding month to the main dataset 
data["month"] = re_new1[0] 

# adding day to the main dataset 
data["day"] = re_new1[1]

data = data.drop(['reviewTime', 'date'], axis=1)
data.head() 

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,reviews,Sentiment,year,month,day
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",5,1393545600,"Not much to write about here, but it does exac...",Positive,2014,2,28
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",5,1363392000,The product does exactly as it should and is q...,Positive,2013,3,16
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",5,1377648000,The primary job of this device is to block the...,Positive,2013,8,28
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",5,1392336000,Nice windscreen protects my MXL mic and preven...,Positive,2014,2,14
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",5,1392940800,This pop filter is great. It looks and perform...,Positive,2014,2,21


In [72]:
print('Year - wise count of sentiments:')
data.groupby(['year','Sentiment']).size()

Year - wise count of sentiments:


year   Sentiment
 2004  Positive        7
 2005  Positive        4
 2006  Negative        1
       Neutral         1
       Positive        8
 2007  Negative        1
       Positive       21
 2008  Negative        2
       Neutral         7
       Positive       54
 2009  Negative        5
       Neutral         8
       Positive      115
 2010  Negative       16
       Neutral        20
       Positive      314
 2011  Negative       46
       Neutral        84
       Positive      877
 2012  Negative       85
       Neutral       130
       Positive     1721
 2013  Negative      170
       Neutral       319
       Positive     3566
 2014  Negative      141
       Neutral       203
       Positive     2335
dtype: int64

In [75]:
# Convert the 'helpful' column to string
data['helpful'] = data['helpful'].astype(str)

In [77]:
newreview = data["helpful"].str.split(",", n = 1, expand = True)
newreview1 = newreview[0].str.split("[", n = 1, expand = True)
newreview2 = newreview[1].str.split("]", n = 1, expand = True)

#Resetting the index
newreview1.reset_index(drop=True, inplace=True)
newreview2.reset_index(drop=True, inplace=True)

#Dropping empty columns due to splitting 
newreview1 = newreview1.drop([0], axis=1)
newreview2 = newreview2.drop([1], axis=1)

#Concatenating the splitted columns
helpfulreview = pd.concat([newreview1, newreview2], axis=1)
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,reviews,Sentiment,year,month,day
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]",5,1393545600,"Not much to write about here, but it does exac...",Positive,2014,2,28
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",5,1363392000,The product does exactly as it should and is q...,Positive,2013,3,16
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",5,1377648000,The primary job of this device is to block the...,Positive,2013,8,28
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",5,1392336000,Nice windscreen protects my MXL mic and preven...,Positive,2014,2,14
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",5,1392940800,This pop filter is great. It looks and perform...,Positive,2014,2,21


In [80]:
#Converting into integer types
helpfulreview[0] = helpfulreview[0].astype(str).astype(int)
helpfulreview[1] = helpfulreview[1].astype(str).astype(int)

#Dividing the two columns, we have 0 in the second columns when dvided gives error, so I'm ignoring those errors
try:
  helpfulreview['result'] = helpfulreview[1]/helpfulreview[0]
except ZeroDivisionError:
  helpfulreview['result']=0

#Filling the NaN values(created due to dividing) with 0
helpfulreview['result'] = helpfulreview['result'].fillna(0)

#Rounding of the results to two decimal places
helpfulreview['result'] = helpfulreview['result'].round(2) 

#Attaching the results to a new column of the main dataframe
data['helpful_review'] = helpfulreview['result']

#dropping the helpful column from main dataframe
data = data.drop(['helpful'], axis=1)
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,overall,unixReviewTime,reviews,Sentiment,year,month,day,helpful_review
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...",5,1393545600,"Not much to write about here, but it does exac...",Positive,2014,2,28,0.0
1,A14VAT5EAX3D9S,1384719342,Jake,5,1363392000,The product does exactly as it should and is q...,Positive,2013,3,16,0.93
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""",5,1377648000,The primary job of this device is to block the...,Positive,2013,8,28,1.0
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""",5,1392336000,Nice windscreen protects my MXL mic and preven...,Positive,2014,2,14,0.0
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,5,1392940800,This pop filter is great. It looks and perform...,Positive,2014,2,21,0.0


In [82]:
data['helpful_review'].value_counts()

helpful_review
0.00    7215
1.00    2040
0.50     266
0.67     136
0.75     111
        ... 
0.56       1
0.15       1
0.13       1
0.43       1
0.69       1
Name: count, Length: 65, dtype: int64

In [86]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

# Define a function to clean text
def clean_text(text):
    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

data['cleaned_reviewText'] = data['reviews'].apply(clean_text)

print(data['cleaned_reviewText'].head())

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    much write exactly supposed filters pop sounds...
1    product exactly quite affordablei realized dou...
2    primary job device block breath would otherwis...
3    nice windscreen protects mxl mic prevents pops...
4    pop filter great looks performs like studio fi...
Name: cleaned_reviewText, dtype: object


In [88]:
le = LabelEncoder() 
  
data['Sentiment']= le.fit_transform(data['Sentiment']) 

data['Sentiment'].unique() 

array([2, 1, 0])

In [89]:
data['Sentiment'].value_counts()

Sentiment
2    9022
1     772
0     467
Name: count, dtype: int64

In [90]:
data.to_csv("preprocessed_data.csv", index=False)
print("Preprocessed dataset saved as 'preprocessed_data.csv'")

Preprocessed dataset saved as 'preprocessed_data.csv'
