In [None]:
# imports
import pandas as pd
from transformers import pipeline
import json

In [15]:
# Reading in the json file 
raw_data = []
with open('Software_5.json', 'r') as file:
    for line in file:
        raw_data.append(json.loads(line))

raw_data[0:2]


[{'overall': 4.0,
  'verified': False,
  'reviewTime': '10 20, 2010',
  'reviewerID': 'A38NELQT98S4H8',
  'asin': '0321719816',
  'style': {'Format:': ' DVD-ROM'},
  'reviewerName': 'WB Halper',
  'reviewText': "I've been using Dreamweaver (and it's predecessor Macromedia's UltraDev) for many years.  For someone who is an experienced web designer, this course is a high-level review of the CS5 version of Dreamweaver, but it doesn't go into a great enough level of detail to find it very useful.\n\nOn the other hand, this is a great tool for someone who is a relative novice at web design.  It starts off with a basic overview of HTML and continues through the concepts necessary to build a modern web site.  Someone who goes through this course should exit with enough knowledge to create something that does what you want it do do...within reason.  Don't expect to go off and build an entire e-commerce system with only this class under your belt.\n\nIt's important to note that there's a long g

In [16]:
# converting the data into a dataframe
data_df = pd.DataFrame(raw_data)
data_df.head(5)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},WB Halper,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,1287532800,,
1,4.0,False,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},Grimmy,"The demo is done with the PC version, with ref...",A good value,1287360000,,
2,5.0,False,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},D. Fowler,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,1287187200,3.0,
3,5.0,False,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},Bryan Newman,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,1286841600,,
4,5.0,False,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},Al Swanson,I decided (after trying a number of other prod...,Excellent Tutorials!,1286409600,,


In [17]:
# dropping columns
drop_cols = ['verified', 'reviewerName', 'summary', 'unixReviewTime', 'vote', 'image']
data_df = data_df.drop(drop_cols, axis = 1 )
data_df.head()

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewText
0,4.0,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},I've been using Dreamweaver (and it's predeces...
1,4.0,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},"The demo is done with the PC version, with ref..."
2,5.0,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},If you've been wanting to learn how to create ...
3,5.0,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},I've been creating websites with Dreamweaver f...
4,5.0,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},I decided (after trying a number of other prod...


In [21]:
# categorizing reviews based on overall rating
data_df['overall'] = data_df['overall'].replace({1 : 'negative', 2 : 'negative', 3 : 'neutral', 
                                                 4 : 'positive', 5: 'positive'})
data_df.head()

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewText
0,positive,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},I've been using Dreamweaver (and it's predeces...
1,positive,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},"The demo is done with the PC version, with ref..."
2,positive,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},If you've been wanting to learn how to create ...
3,positive,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},I've been creating websites with Dreamweaver f...
4,positive,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},I decided (after trying a number of other prod...


In [27]:
data_df.to_csv('clean_reviews.csv', index = False)

In [18]:
# list of number of reviews from 2000-2018 in order 

year_list = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
             '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
count_list  =[]
for i in year_list: 
    year_df = data_df[data_df['reviewTime'].str.contains(i)]
    count_list.append(year_df.shape[0])

print(count_list)

[3, 32, 73, 75, 72, 115, 112, 325, 1127, 1125, 736, 823, 1052, 1426, 1987, 1816, 1270, 425, 211]


In [None]:

def make_df(year): 
    ''' This function takes in a year as a string and 
    returns a dataframe of all of the reviews from the year 
    with a new column that has just the year.
    '''
    year_col = []
    year_df = data_df[data_df['reviewTime'].str.contains(year)]
    for i in range(year_df.shape[0]):
        year_col.append(f'{year}')
    year_df['year'] = year_col
    return year_df 

In [25]:
# make data frames of each year
year_list = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
             '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
thousand_df = make_df(year_list[0])
one_df = make_df(year_list[1])
two_df = make_df(year_list[2])
three_df = make_df(year_list[3])
four_df = make_df(year_list[4])
five_df = make_df(year_list[5])
six_df = make_df(year_list[6])
seven_df = make_df(year_list[7])
eight_df = make_df(year_list[8])
nine_df = make_df(year_list[9])
ten_df = make_df(year_list[10])
eleven_df = make_df(year_list[11])
twelve_df = make_df(year_list[12])
thirteen_df = make_df(year_list[13])
fourteen_df = make_df(year_list[14])
fifteen_df = make_df(year_list[15])
sixteen_df = make_df(year_list[16])
seventeen_df = make_df(year_list[17])
eighteen_df = make_df(year_list[18])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['year'] = year_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['year'] = year_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['year'] = year_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [None]:
# testing the function
ten_df.head()

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewText,year
0,positive,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},I've been using Dreamweaver (and it's predeces...,2010
1,positive,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},"The demo is done with the PC version, with ref...",2010
2,positive,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},If you've been wanting to learn how to create ...,2010
3,positive,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},I've been creating websites with Dreamweaver f...,2010
4,positive,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},I decided (after trying a number of other prod...,2010


In [28]:
# splitting the data into our three testing groups

zero_six_df = pd.concat([thousand_df, one_df, two_df, three_df, four_df, five_df, six_df])
seven_twelve_df = pd.concat([seven_df, eight_df, nine_df, ten_df, eleven_df, twelve_df])
thirteen_eighteen_df = pd.concat([thirteen_df, fourteen_df, fifteen_df, sixteen_df, seventeen_df, eighteen_df])

In [30]:
zero_six_df.head(4)

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewText,year
143,negative,"12 22, 2000",AE95Z3K6GVIC3,B00003JAU7,,It is worst piece of crap I ever had to instal...,2000
144,positive,"08 23, 2000",ARXU3FESTWMJJ,B00003JAU7,,"I got tired of the Win98 crashes, so decided t...",2000
145,positive,"06 26, 2000",A2G0O4Y8QE10AE,B00004TYCR,,I bought this program 2.5 years ago and have b...,2000
127,positive,"02 1, 2001",A1P4RH7KMJ1SV2,B00003IRBU,{'Format:': ' Video Game'},I have now played all 3 of the Nancy Drew myst...,2001
