# The code for my research project
To run the code, click on 'Cell' and then 'Run All'. The following code was used for the preprocessing.

In [1]:
# Below is the code to open the json file and convert it to a pandas dataframe.
# This code is not mine; it was provided by the same people that provided the dataset.

import pandas as pd
import numpy as np
import gzip
import json

def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip-compressed file.

  Args:
    path: Location of a gzip file that holds one JSON document per line.

  Yields:
    The value returned by ``json.loads`` for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the file when the generator is exhausted,
  # closed early, or fails part-way; previously the handle was never closed.
  with gzip.open(path, 'rb') as handle:
    for line in handle:
      # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
      if line.strip():
        yield json.loads(line)

def getDF(path):
  """Collect everything produced by ``parse(path)`` into a pandas DataFrame.

  Each item becomes one row; rows are labelled 0, 1, 2, ... in the order in
  which the items were read.
  """
  # Key every item by its position in the stream.
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# To use another dataset: set DATASET_PATH to another 5-core json dataset filename
DATASET_PATH = "Industrial_and_Scientific_5.json.gz"

df = getDF(DATASET_PATH)

In [2]:
# Remove unnecessary columns

# Columns that are removed from the frame before any further processing.
unused_columns = ['verified', 'reviewTime', 'style', 'reviewerName', 'unixReviewTime', 'image']
df = df.drop(columns=unused_columns)
df

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote
0,5.0,A1JB7HFWHRYHT7,B0000223SI,This worked really well for what I used it for...,Couldn't have been happier with it's performance,
1,5.0,A2FCLJG5GV8SD6,B0000223SI,Fast cutting and good adheasive.,Good paper.,
2,5.0,A3IT9B33NWYQSL,B0000223SI,Worked great for my lapping bench. I would li...,Handy!,
3,4.0,AUL5LCV4TT73P,B0000223SK,As advertised,As advertised,
4,5.0,A1V3I3L5JKO7TM,B0000223SK,seems like a pretty good value as opposed to b...,seems like a pretty good value as opposed to b...,
...,...,...,...,...,...,...
77066,5.0,A1UZ9AVZFWZS1A,B01HCVJ3K2,So far it has worked like a champ. Great solut...,I recommend it.,
77067,5.0,A1PMSQXD43WIS4,B01HCVJ3K2,Great quality solid state relay. I used this s...,Great quality solid state relay,
77068,5.0,A225WHD7XZVIXL,B01HEQVQAK,Came with everything needed to install in my M...,Exactly as described,
77069,5.0,A3T05FOORNQI18,B01HEQVQAK,Installed a month ago in my Monoprice Maker Se...,Works Great,


In [3]:
# Prepare the vote column for the helpfulness analysis

df['vote'] = (
    df['vote']
    .fillna(0)                     # a missing vote count becomes zero
    .replace(',', '', regex=True)  # drop commas inside the string values
    .astype(int)                   # turn every value into an integer
)

In [4]:
# Display the converted vote column to check the result of the previous cell.
df['vote']

# No more missing vote values: they were filled with 0 before the integer conversion.

0        0
1        0
2        0
3        0
4        0
        ..
77066    0
77067    0
77068    0
77069    0
77070    0
Name: vote, Length: 77071, dtype: int64

In [5]:
# Show cell contents (such as the review text) in full instead of truncating them

pd.options.display.max_colwidth = None

In [6]:
# Keep a single row per distinct review text.
# The frame is first ordered by review text; within each group of identical
# texts the first row in that order is kept and the index is renumbered from 0.
# NOTE(review): rows with a missing reviewText presumably count as duplicates of
# each other, so one such row would remain — confirm that this is intended.

df = (
    df.sort_values(by='reviewText')
      .drop_duplicates(subset='reviewText', keep="first", ignore_index=True)
)

In [7]:
# Cleaning the review text

# The punctuation pattern is a regular expression, so regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern of Series.str.replace as a
# literal string by default, which would silently leave all punctuation in
# place. The raw string keeps \w and \s from being read as (invalid) string
# escape sequences.
df['reviewText'] = (
    df['reviewText']
    .str.replace(r'[^\w\s]', '', regex=True)  # drop every char that is neither a word char nor whitespace
    .replace('\n', ' ', regex=True)           # replace all newline chars with a space
    .str.lower()                              # convert all uppercase chars to lowercase
)

# Processing the data


In [8]:
# New column: number of whitespace-separated words in every review

words_per_review = df['reviewText'].str.split()
df['reviewLength'] = words_per_review.str.len()
df.sample(n=5)  # no seed is set, so the five rows shown differ between runs

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote,reviewLength
42432,1.0,A2Z99424KWK98J,B00MB3CV6K,this is the second solutch filament weve used and it will certainly be the last horrible to work with my advice if you use this brand filament and begin to have printing issues switch filament brand before spending hours troubleshooting machine,Horrible,0,41.0
55346,2.0,A2N9BWZX4HFPI0,B000OMHIHM,not compatible with schedule 40 pvc fittings,Two Stars,0,7.0
55327,2.0,A2L41PLWVW8G6E,B000UV6ZPS,not as good as i expected it would be needs to be backlighted like they show in the photo,Two Stars,0,19.0
30332,5.0,AIKIABYCYAYQZ,B00837ZGRY,one word to describe this excellent,Five Stars,0,6.0
888,5.0,ABODTODRWTUMH,B000WGHZYG,a lot of decent flap disks for a little money what not to like they work pretty well no problems so far,Good value,0,22.0


In [9]:
# (number of rows, number of columns) of the frame at this point.
df.shape

# The number of reviews that are left after cleaning:

(58331, 7)

In [10]:
# Count, mean, standard deviation, minimum, quartiles and maximum of the numeric columns.
df.describe()

# Some descriptive statistics on the dataset

Unnamed: 0,overall,vote,reviewLength
count,58331.0,58331.0,58330.0
mean,4.470625,1.566131,52.815189
std,1.002965,18.657533,107.04545
min,1.0,0.0,0.0
25%,4.0,0.0,9.0
50%,5.0,0.0,24.0
75%,5.0,0.0,57.0
max,5.0,2333.0,5946.0


In [11]:
# Sum of the votes of all reviews per product (asin).
# Despite its name, `sumlength` holds vote totals, not review lengths.
sumlength = df.groupby('asin')['vote'].sum()
sumlength.head(10)

asin
B0000223SI     5
B0000223SK     5
B0000223UV    37
B00002246J     5
B0000224J0     0
B0000224MY    18
B0000225HB     0
B0000225HD     0
B0000225IO     0
B00002N6FE     3
Name: vote, dtype: int64

In [12]:
# Total number of votes over all reviews of the same product, repeated on every row of that product:
votes_per_product = df.groupby('asin')["vote"].transform('sum')
df['totalVotes'] = votes_per_product

# Votes of the review divided by the total votes of its product (a fraction, not a percentage).
# Products without any votes give 0 / 0, which is stored as NaN.
df['votesPercentage'] = df['vote'].div(df['totalVotes'])

In [27]:
# Reviews of at least 53 words are labelled 'long', all others 'short'.
# NOTE(review): 53 looks like the mean reviewLength (52.8) rounded up — confirm.
is_long = df['reviewLength'] >= 53
df['length'] = np.where(is_long, 'long', 'short')

# A review is 'veryHelpful' when it received at least half of its product's votes.
# A NaN votesPercentage fails the comparison and is therefore labelled 'no'.
has_half_of_votes = df['votesPercentage'] >= 0.5
df['veryHelpful'] = np.where(has_half_of_votes, 'yes', 'no')
df.sample(n=5)

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote,reviewLength,totalVotes,votesPercentage,length,veryHelpful
50168,4.0,AC312B7C7WFRO,B016A2PGH8,works great great price and product thanks,Four Stars,0,7.0,0,,short,no
53234,4.0,A1YAR74JNLV6YT,B00FHH7CG6,good product shipping packaging made it a messy blob of goo but i am able to tear off what i need easily so no big deal,good product for rv roof repair.,0,26.0,0,,short,no
36255,2.0,A12UWZ1O803KFU,B0000YHN9W,the outside dimensions and design of this line splitter are fine both of my clamp on ammeters fit the 1x and 10x holes well and the voltage meter probe holes are handy but i immediately noticed that when plugging into the female end the plugs tended to spring back out this was especially true for two blade plugs something was obviously wrong with the female receptacle contacts so i took the device apart to investigate sure enough the blade contacts were so far recessed into the housing that the plug blades barely made contact the internal shape of the plastic housing suggests that it was designed for longer blade contacts but short ones were used instead also one power wire was not soldered to the blade contact as were the other wires the unsoldered wire was simply pushed through a hole in the contact but someone forgot to solder it i modified the blade contacts so that plug blades would make good contact and i soldered the loose wire still however the female plug receptacle contacts are weak and will eventually break if this device is used often to address that problem i plugged a multiplug adapter into the female end and i plan to tape or glue the adapter into place that way the units internal weak blade contacts will not repeatedly be stressed by inserting plugs directly into the unit also the stiff solid wire inside is soldered to one of the plug receptacle flexible contacts that joint is bound to eventually break as the receptacle flexes how this device qualified for a ul listing ill never know but having spent about two hours modifying it i do plan to keep it and use it when needed the kill a watt power meters also sold by amazon are a better bet for most uses and far more versatile,"Extech electrical line splitter works, but has problems",22,310.0,100,0.22,long,no
0,5.0,A3D1AFK1WU0TG,B001PNO368,used for winch switch,,0,4.0,0,,short,no
45079,5.0,A3DDDXG42DOA0U,B007GDY3CU,used in my carbonation equipment to set and relieve any accidental over pressure,Safety Valve,0,13.0,4,0.0,short,no


# Results
The contingency table below shows the helpfulness of reviews in relation to their length.

In [14]:
# Number of reviews per helpfulness label (rows) and length label (columns), with row and column totals.
contingency_table = pd.crosstab(
    index=df['veryHelpful'],
    columns=df['length'],
    margins=True,
    margins_name="Total",
)
contingency_table

length,long,short,Total
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,13581,41674,55255
yes,2076,1000,3076
Total,15657,42674,58331


In [15]:
# The same table, with every cell expressed as a fraction of all reviews:
contingency_table = pd.crosstab(
    index=df['veryHelpful'],
    columns=df['length'],
    normalize=True,
)
contingency_table

length,long,short
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.232826,0.71444
yes,0.03559,0.017144


In [16]:
# Select only the reviews that gave either one or five stars:
is_extreme_rating = (df['overall'] > 4) | (df['overall'] < 2)

# Work on an independent copy of the selected rows so that `df` stays unchanged
dfc = df.loc[is_extreme_rating].copy()

# New column: 'positive' for five-star reviews, 'negative' for the remaining ones
gave_five_stars = dfc['overall'] == 5
dfc['sentiment'] = np.where(gave_five_stars, 'positive', 'negative')
dfc.head(5)

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote,reviewLength,totalVotes,votesPercentage,length,veryHelpful,sentiment
0,5.0,A3D1AFK1WU0TG,B001PNO368,used for winch switch,,0,4.0,0,,short,no,positive
6,5.0,A2D6HAJAC32XC0,B00MUT58Y2,everybody uses this stuff for a good reason nuff said a little goes a long way,"Works, no muss or fuss",0,16.0,308,0.0,short,no,positive
8,5.0,A35PPLVIPZLU36,B01F47B8AO,my size shot glasses,Great for drunks.,0,4.0,18,0.0,short,no,positive
9,5.0,A2VUW39TF5YCC1,B00TQ7DQU4,nice tape glows nicely,"Glows nicely. """,0,4.0,0,,short,no,positive
11,5.0,AQH4Z8W9WYE41,B00WW4H8XY,quick ship works great buy with confidence,"Works Great. "" Buy with confidence",0,7.0,0,,short,no,positive



Below is the second contingency table displaying the relation between helpfulness and sentiment.

In [17]:
# Number of reviews per helpfulness label (rows) and sentiment label (columns), with row and column totals.
contingency_table_2 = pd.crosstab(
    index=dfc['veryHelpful'],
    columns=dfc['sentiment'],
    margins=True,
    margins_name="Total",
)
contingency_table_2

sentiment,negative,positive,Total
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,1967,39244,41211
yes,229,1963,2192
Total,2196,41207,43403


In [18]:
# The same table, with every cell expressed as a fraction of all one- and five-star reviews.
contingency_table_2 = pd.crosstab(
    index=dfc['veryHelpful'],
    columns=dfc['sentiment'],
    normalize=True,
)
contingency_table_2

sentiment,negative,positive
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.045319,0.904177
yes,0.005276,0.045227
