Import Libraries

In [1]:
import pandas as pd
import numpy as np
import string

Mount Drive

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%cd /content/gdrive/MyDrive/Utility-Scoring-Of-Product-Reviews/Data

/content/gdrive/MyDrive/Utility-Scoring-Of-Product-Reviews/Data


Load Dataset

In [4]:
df = pd.read_csv('Cell_Phones_and_Accessories.csv')
df.head(4)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,6073894996,"[3, 4]",4.0,This is a nice charger but you can tell it was...,"03 20, 2013",A29OXVQRZ154KX,Don Powell,Solid Charger but RF noisy,1363738000.0
1,9861203192,"[25, 25]",5.0,"Bought this for my new AT&T; Galaxy SII, the c...","10 7, 2011",A1847XXVEA8QUK,Daniel Poole,Awesome case and cheap too,1317946000.0
2,9985537742,"[10, 11]",3.0,Works fine IF you don't move the phone. If yo...,"11 19, 2010",A2TOXBTDH3Y6R9,Rich H.,"Mediocre, even for the price",1290125000.0
3,998554627X,"[5, 5]",5.0,was somewhat concerned after reading some of t...,"02 23, 2013",A17X3JUWJY3AXG,Mary Lynn Thompson,would order again,1361578000.0


In [5]:
df.shape

(10738, 9)

Drop 'reviewerName','summary','unixReviewTime' columns

In [6]:
df.drop(columns=['reviewerName','summary','unixReviewTime'],inplace=True)

Check for NA values

In [7]:
df['reviewText'].isna().any()

True

Print unique values present in 'overall' column


In [8]:
unique_ratings = df['overall'].unique()
print (sorted(unique_ratings))

[1.0, 2.0, 3.0, 4.0, 5.0]


In [9]:
null_reviewText = df[df['reviewText'].isna()]

In [10]:
null_reviewText.head(4)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID
387,B0013G8PTS,"[3, 3]",5.0,,"02 5, 2013",A1F7YU6O5RU432
414,B0015RB39O,"[211, 250]",5.0,,"08 8, 2011",A3BX6DLLTHQ9GJ
963,B002VPE1OI,"[55, 57]",4.0,,"07 10, 2010",A31VDZOEJACIHH
998,B002YFDRHW,"[55, 60]",5.0,,"01 4, 2012",A2Z2EB8M60EOIK


Total number of rows before dropping NaN rows in 'reviewText' column

In [11]:
df['asin'].count()

10738

Drop NaN rows in 'reviewText' column

In [12]:
df.dropna(subset=['reviewText'],inplace=True)

Total number of rows after dropping NaN rows in 'reviewText' column (93 rows deleted)

In [13]:
df['asin'].count()

10701

 II. Convert 'reviewText' values into lowercase

Use str.lower() to convert string into lower case and store it in a new column 'preProcessed_reviewText'


In [14]:
df['preProcessed_reviewText'] = df['reviewText'].str.lower()

Verify that all values should be lower cased in 'preProcessed_reviewText' column

In [15]:
df['preProcessed_reviewText'].str.isupper().any()

False

III. Remove punctuation marks from 'reviewText'

First row with punctuation in the 'preProcessed_reviewText' column

In [16]:
df.head(1)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,preProcessed_reviewText
0,6073894996,"[3, 4]",4.0,This is a nice charger but you can tell it was...,"03 20, 2013",A29OXVQRZ154KX,this is a nice charger but you can tell it was...


In [17]:
translator=str.maketrans('','',string.punctuation)
df['preProcessed_reviewText'] = df['preProcessed_reviewText'].str.translate(translator)

First row after removing punctuation in the 'preProcessed_reviewText' column


In [18]:
df.head(1)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,preProcessed_reviewText
0,6073894996,"[3, 4]",4.0,This is a nice charger but you can tell it was...,"03 20, 2013",A29OXVQRZ154KX,this is a nice charger but you can tell it was...


IV. Remove words with numerical digits or no letters in 'reviewText'

Check for numeric values in 'preProcessed_reviewText' column

In [19]:
df[df['preProcessed_reviewText'].str.isnumeric()]

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,preProcessed_reviewText


Replace numeric values with no space in 'preProcessed_reviewText' column

In [20]:
df['preProcessed_reviewText'] = df['preProcessed_reviewText'].str.replace('\d+', '')

In [21]:
df.head(4)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,preProcessed_reviewText
0,6073894996,"[3, 4]",4.0,This is a nice charger but you can tell it was...,"03 20, 2013",A29OXVQRZ154KX,this is a nice charger but you can tell it was...
1,9861203192,"[25, 25]",5.0,"Bought this for my new AT&T; Galaxy SII, the c...","10 7, 2011",A1847XXVEA8QUK,bought this for my new att galaxy sii the case...
2,9985537742,"[10, 11]",3.0,Works fine IF you don't move the phone. If yo...,"11 19, 2010",A2TOXBTDH3Y6R9,works fine if you dont move the phone if you ...
3,998554627X,"[5, 5]",5.0,was somewhat concerned after reading some of t...,"02 23, 2013",A17X3JUWJY3AXG,was somewhat concerned after reading some of t...


V. Remove 'reviewText' with very short reviews in length

Find the average length of reviews in 'preProcessed_reviewText' column

In [22]:
df_len = df['preProcessed_reviewText'].str.len()
length = round(df_len.mean())

In [23]:
reviewText_within_lengthlimits = (df['preProcessed_reviewText'].str.len() >= 100)

Keep reviews within the limit of 100 characters in 'preProcessed_reviewText' column


In [24]:
df['preProcessed_reviewText'] = df['preProcessed_reviewText'].loc[reviewText_within_lengthlimits]

Verify that minimum length of 'preProcessed_reviewText' column should be 100

In [25]:
df['preProcessed_reviewText'].str.len().min()

100.0

Maximum length of reviews

In [26]:
df['preProcessed_reviewText'].str.len().max()

29990.0

Drop NAs

In [27]:
df.dropna(inplace=True)

Write to CSV

In [28]:
df.to_csv('Cell_Phones_and_Accessories.csv',index=False)

Verify if all the columns are added

In [29]:
df = pd.read_csv('Cell_Phones_and_Accessories.csv')

In [30]:
df.head(4)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,preProcessed_reviewText
0,6073894996,"[3, 4]",4.0,This is a nice charger but you can tell it was...,"03 20, 2013",A29OXVQRZ154KX,this is a nice charger but you can tell it was...
1,9861203192,"[25, 25]",5.0,"Bought this for my new AT&T; Galaxy SII, the c...","10 7, 2011",A1847XXVEA8QUK,bought this for my new att galaxy sii the case...
2,9985537742,"[10, 11]",3.0,Works fine IF you don't move the phone. If yo...,"11 19, 2010",A2TOXBTDH3Y6R9,works fine if you dont move the phone if you ...
3,998554627X,"[5, 5]",5.0,was somewhat concerned after reading some of t...,"02 23, 2013",A17X3JUWJY3AXG,was somewhat concerned after reading some of t...
