# The code for my research project
To run the code, click 'Cell' and then 'Run All'. The following code was used for preprocessing.

In [1]:
# Below is the code to open the json file and convert it to a pandas dataframe.
# The code in this first cell is not mine; it was provided by the same people that provided the dataset.

import pandas as pd
import numpy as np
import gzip
import json

def parse(path):
  """Yield one decoded JSON value per line of a gzip-compressed file.

  Parameters
  ----------
  path : location of the gzip file; each line is handed to ``json.loads``.

  Yields
  ------
  The value decoded from each line, in file order.
  """
  # The context manager closes the archive as soon as the generator is
  # exhausted or discarded, so no file handle is left open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

def getDF(path):
  """Collect everything ``parse(path)`` yields into a pandas DataFrame.

  The records are numbered 0, 1, 2, ... in the order they are read, and
  that number is used as the row label of the resulting frame.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# To use another dataset: point DATA_PATH at another 5-core json dataset filename
DATA_PATH = "Industrial_and_Scientific_5.json.gz"

df = getDF(DATA_PATH)

In [2]:
 # Remove unnecessary columns

# Columns that are not used anywhere in the analysis below.
# errors='ignore' skips names that are not present, so this cell can be re-run
# without a KeyError and also works for a dataset that lacks one of the columns.
df = df.drop(
    columns=['verified', 'reviewTime', 'style', 'reviewerName', 'unixReviewTime', 'image'],
    errors='ignore',
)
df

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote
0,5.0,A1JB7HFWHRYHT7,B0000223SI,This worked really well for what I used it for...,Couldn't have been happier with it's performance,
1,5.0,A2FCLJG5GV8SD6,B0000223SI,Fast cutting and good adheasive.,Good paper.,
2,5.0,A3IT9B33NWYQSL,B0000223SI,Worked great for my lapping bench. I would li...,Handy!,
3,4.0,AUL5LCV4TT73P,B0000223SK,As advertised,As advertised,
4,5.0,A1V3I3L5JKO7TM,B0000223SK,seems like a pretty good value as opposed to b...,seems like a pretty good value as opposed to b...,
...,...,...,...,...,...,...
77066,5.0,A1UZ9AVZFWZS1A,B01HCVJ3K2,So far it has worked like a champ. Great solut...,I recommend it.,
77067,5.0,A1PMSQXD43WIS4,B01HCVJ3K2,Great quality solid state relay. I used this s...,Great quality solid state relay,
77068,5.0,A225WHD7XZVIXL,B01HEQVQAK,Came with everything needed to install in my M...,Exactly as described,
77069,5.0,A3T05FOORNQI18,B01HEQVQAK,Installed a month ago in my Monoprice Maker Se...,Works Great,


In [3]:
# Prepare the vote column for the helpfulness analysis, in three steps:
#   1. a missing vote value becomes 0,
#   2. commas inside the vote strings are removed,
#   3. every value is converted to an integer.
df['vote'] = (
    df['vote']
    .fillna(0)
    .replace(',', '', regex=True)
    .astype(int)
)

In [4]:
# Display the vote column to check the result of the conversion.
df['vote']

# No more missing vote values: they were filled with 0 and the column is now integer-typed.

0        0
1        0
2        0
3        0
4        0
        ..
77066    0
77067    0
77068    0
77069    0
77070    0
Name: vote, Length: 77071, dtype: int64

In [5]:
# Do not truncate long cell values, so the complete review text is shown
pd.options.display.max_colwidth = None

In [6]:
# De-duplicate on review text: order the rows by reviewText, keep the first row
# of every group with identical text, and renumber the index from 0.
# NOTE(review): rows whose reviewText is missing do not appear to be removed by
# this step (one is kept like any other duplicate group) - confirm whether such
# rows should be dropped before the length analysis.
df = (
    df
    .sort_values(by='reviewText')
    .drop_duplicates(subset='reviewText', keep="first", ignore_index=True)
)

In [7]:
# Cleaning the review text.
# The punctuation pattern is a regular expression, so regex=True is passed
# explicitly: from pandas 2.0 on, str.replace treats the pattern as a literal
# string by default and would silently leave all punctuation in place.
# The raw string keeps \w and \s from being read as (invalid) string escapes.
df['reviewText'] = (
    df['reviewText']
    .str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
    .replace('\n', ' ', regex=True)           # Replace all newline chars with whitespace
    .str.lower()                              # Convert all uppercase chars to lowercase
)

# Processing the data


In [8]:
# Add new column with the length of every review, counted in whitespace-separated words
df['reviewLength'] = df['reviewText'].str.split().str.len()

# Preview five random rows; the fixed seed keeps the preview identical on every run
df.sample(n=5, random_state=0)

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote,reviewLength
1124,5.0,A2HGLZ1FEBSFDU,B007D5LJ50,a very well made product if you need one,Its Fitting!,0,9.0
2501,5.0,A3MWZNX4P7XPP0,B00XBLLG5S,as described will buy again,Five Stars,0,5.0
20259,4.0,A1E8S0XD8M7MTD,B0013TVEWA,i think the toothbrush is great i like that it has a one minute and two minute brush settings and also the easy mode my disappointment is that when the battery wears out then you must throw it away it would be nice to be able to replace the battery it does a great job of brushing without hurting your gums make sure not to keep it plugged in all the time,wish you could replace the battery,0,72.0
56326,5.0,AB4ELQX285UKL,B0010VOJHS,the best wound cleanser,best,0,4.0
21758,5.0,AJ2ICMO59T3H,B00LKQBHVO,i was undecided between the neato botvac d85 and this roomba 870 and had played with both for about a week prior to returning the neato i wont bash the neato in fact it was a close call since both are impressive robots and vacuum cleaners ultimately i chose the roomba based on three criteria 1 ability to clean both bots can clean and in all actuality the roomba only pulled out slightly ahead of the neato in terms of my nonscientific tests involving chex cereal and sugar tossed on the floor the roombas name is optimus grime by the way i read someplace that 70 of robot vacuum owners name their robots odd anyway it seems that the smaller the space the more roombas somewhat random method of traversing that space results in thoroughness and multiple passes it isnt entirely random while watching it clean it is clear in some cases that there is method to its madness im still constantly impressed by how much it picks up in terms of pure navigation the neato is more logical it fires up out of the gate spins its laser turret and maps the room in most cases its movements were very precise while threading around furniture but it often only did one pass if it didnt get something the first time well meh both were great at my typical messes cat fur litter scattered everyplace and dust bunnies i do like how the exhaust from the roomba pointed slightly upwards so as not to accidentally scatter debris around that it hasnt collected yet 2 perceived robustness irobot has been making robots since 2002 it also dabbles in military and police grade robots aside from making cute vacuums and autonomous pool cleaners it is my understanding i didnt check up on it that neato is a newer company and one that apparently has some kinks to work out neither machine is perfect but informal google searches show that roomba owners have been generally using their robots for years only needing standard maintenance neato owners have complained about glitchy software and 
flimsy plastic parts the week i had my d85 i received the infamous please clear my path error there was nothing in its way at all it also got fixated on a corner of my kitchen and just sort of spun around these things are pricey and although they ship with manufacturers warranties id prefer to let the product work as advertised and not hassling the company for technical assistance which leads me to 3 customer service every single email or live chat ive had with irobot customer service has resulted in a prompt reply that wasnt some boilerplate message i was concerned that my roomba was running into my black furniture fairly hard it has a tough time seeing black and contacted customer service to see if that was normal although it was they shipped me out a free set of bumper extenders that i can place on the roomba perfect solution nope but they tried and im sure the bumpers will help stave off potential damage now the three times i contacted neato i got nothing zip tumbleweeds were rolling across my screen i need to know that a company that is selling a complex and expensive product will be their to help with issues in conclusion both devices have their merits and downfalls but neither is incompetent as a robot or a vacuum they both have different means as to how they go about their duties i really had to take time and think about which would stay home and which would be returned im happy with my optimus grime but im sure either device would be helpful in your home,Pseudo Review: Roomba 870 vs Neato D85,5,623.0


In [9]:
# Show the (rows, columns) dimensions of the frame at this point.
df.shape

# The number of reviews that are left after cleaning:

(58331, 7)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Some descriptive statistics on the dataset

Unnamed: 0,overall,vote,reviewLength
count,58331.0,58331.0,58330.0
mean,4.470625,1.566131,52.815189
std,1.002965,18.657533,107.04545
min,1.0,0.0,0.0
25%,4.0,0.0,9.0
50%,5.0,0.0,24.0
75%,5.0,0.0,57.0
max,5.0,2333.0,5946.0


In [11]:
# Sum of the votes of all reviews that share the same product id (asin),
# repeated on every review row of that product:
votes_per_product = df.groupby('asin')["vote"].transform('sum')
df['totalVotes'] = votes_per_product

# Share of the product's votes that went to this review (a fraction, not a
# percentage). Products without any votes give 0 / 0, i.e. a missing value.
df['votesPercentage'] = df['vote'] / votes_per_product

In [12]:
# Label each review text either short or long.
# The cut-off is the mean review length of the dataset that is loaded, so it no
# longer has to be edited by hand when another dataset is used. Review lengths
# are whole word counts, so for the original dataset (mean of about 52.8 words)
# this gives exactly the same split as the previously hard-coded value 53.
mean_review_length = df['reviewLength'].mean()
df['length'] = np.where(df['reviewLength'] >= mean_review_length, 'long', 'short')

# Label a review as helpful or not depending on the votesPercentage
df['veryHelpful'] = np.where(df['votesPercentage'] >= 0.5, 'yes', 'no')

# Preview five random rows; the fixed seed keeps the preview identical on every run
df.sample(n=5, random_state=0)

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote,reviewLength,totalVotes,votesPercentage,length,veryHelpful
48360,5.0,A8F0V3GKB0MDA,B00J0HC4QC,will buy again thanks again david a,Will buy again Thanks again David,0,7.0,12,0.0,short,no
45362,5.0,A15496FU4AE0C0,B006U1LTDU,used to build home brew system all works well quality product,Five Stars,0,11.0,0,,short,no
33969,5.0,AEIU1COAWWI6M,B00MB3CV6K,smooth prints every time from every solutech ive ever used,Another Solutech success,0,10.0,205,0.0,short,no
17872,5.0,AVB6IOINZE2IL,B01A6ORBGA,i have used a lot of 3d solutech filament in wide variety of colors and materials i have used their pla petg as well as the glow pla offered so far all of the filaments i have used have printed well performed as described and given me little to no issue the petg works well prints with no stringing on my prusa mk2s i used the stock settings on slic3r prusa edition and it worked fine i have used other brands of petg and most of them are good quality as well but i like the fact also that 3d solutech is based in the usa and i like supporting local usa companies the only negative thing i could say is i wish they offered a few more colors of petg especially on amazon since i shop here to much,Prints Well - USA Based - Good Support (just need more colors on Amazon),0,140.0,9,0.0,long,no
25558,5.0,AUOSTLWHHVBBH,B01C3HEQZC,just installed this kit on a flashforge creator pro very easy to install and printed flawlessly the first time i dont have the experience to know if it prints better than the original hotend but it certainly looks great ive only printed pla esun with it so far so i need to break out the petg and abs to see what results i get,Very easy to install and printed flawlessly the first time,0,64.0,33,0.0,long,no


# Results
The contingency table below shows the helpfulness of reviews in relation to their length.

In [13]:
# Cross-tabulate helpfulness (rows) against review length (columns), with row
# and column totals added under the label "Total".
contingency_table = pd.crosstab(
    index=df['veryHelpful'],
    columns=df['length'],
    margins=True,
    margins_name="Total",
)
contingency_table

length,long,short,Total
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,13581,41674,55255
yes,2076,1000,3076
Total,15657,42674,58331


In [14]:
# The same table, with every cell expressed as a fraction of all reviews:
contingency_table = pd.crosstab(
    index=df['veryHelpful'],
    columns=df['length'],
    normalize='all',
)
contingency_table

length,long,short
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.232826,0.71444
yes,0.03559,0.017144


In [23]:
# Reduce dataset to only include reviews that gave either one or five stars.
# The rows are selected first and copied afterwards: dfc is then an independent
# frame, so adding a column below does not raise a SettingWithCopyWarning and
# the full dataset is not duplicated needlessly.
is_extreme_rating = (df['overall'] > 4) | (df['overall'] < 2)
dfc = df.loc[is_extreme_rating].copy()

# Create new column labeling review as positive or negative:
dfc['sentiment'] = np.where(dfc['overall'] == 5, 'positive', 'negative')

# Preview five random rows; the fixed seed keeps the preview identical on every run
dfc.sample(n=5, random_state=0)

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote,reviewLength,totalVotes,votesPercentage,length,veryHelpful,sentiment
46144,5.0,A4I2OQCZ8MUOB,B016Q6T7Q4,very handy for prototyping as a little terminal strip or component holder used two of them in my meche senior design project,Very handy for prototyping as a little terminal strip or ...,0,22.0,20,0.0,short,no,positive
16765,5.0,A3TMM5FTEX9GUZ,B0000YHN9W,i got this to replace the homemade version i had in my tool bag made from sjoow and nema plugsocket combo you really should have one of these with your clamp meter for checking common 15a devices because its very useful for the price the x10 section is great for measuring very low drain devices that may not otherwise register a reading on your clamp meter also,Make your clamp meter great again,0,67.0,100,0.0,long,no,positive
36404,5.0,A1UPEAN1SGTM98,B002NQPEE6,the product is excellent to get to hard to reach areas highly recommended,Five Stars,0,13.0,0,,short,no,positive
38107,5.0,A2HAX3BHXHQU3Y,B0018BNPKM,these beakers were stronger than the ones that i typically buy and i loved them while i had unfortunately a few of them broke while i was moving and those that didnt break are sitting in a storage unit but they were good while they lasted handle with care and they should be fine,... than the ones that I typically buy and I loved them while I had,0,54.0,0,,long,no,positive
17863,5.0,A3GF0NIB9U6VVH,B00EUKHACW,i have used shark vacuums for many years and have loved every one however this upright is my favorite it is so user friendly and glides over carpeting with ease the only thing i do not like isif you are in the middle of a room and want to turn it off and leave it unattended for a minute because it is top heavy you have to lay it down it does not stand alone the attachments are really great and the mini attachments fit in the tiniest places my dryer vent has never been so clean i would recommend this model to everyone,The best and my favorite Shark,0,104.0,113,0.0,long,no,positive



Below is the second contingency table displaying the relation between helpfulness and sentiment.

In [20]:
# Cross-tabulate helpfulness (rows) against sentiment (columns), with row and
# column totals added under the label "Total".
contingency_table_2 = pd.crosstab(
    index=dfc['veryHelpful'],
    columns=dfc['sentiment'],
    margins=True,
    margins_name="Total",
)
contingency_table_2

sentiment,negative,positive,Total
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,1967,39244,41211
yes,229,1963,2192
Total,2196,41207,43403


In [17]:
# The helpfulness/sentiment table again, with every cell expressed as a
# fraction of all one- and five-star reviews:
contingency_table_2 = pd.crosstab(
    index=dfc['veryHelpful'],
    columns=dfc['sentiment'],
    normalize='all',
)
contingency_table_2

sentiment,negative,positive
veryHelpful,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.045319,0.904177
yes,0.005276,0.045227
