In [1]:
import pandas as pd
import numpy as np
import gzip
import json

In [2]:
def parse(path, num_rows=None):
  """Lazily yield decoded JSON records from a gzip-compressed JSON-lines file.

  Args:
    path: Location of the gzip file; every line must hold one JSON document.
    num_rows: Maximum number of records to yield. ``None`` (the default)
      yields every record; zero or a negative value yields nothing.

  Yields:
    The object decoded from each line, in file order.
  """
  # Checking the limit only after a yield meant a limit of 0 was never hit
  # and the whole file was returned; handle that case up front.
  if num_rows is not None and num_rows <= 0:
    return
  # The context manager closes the archive when the generator is exhausted,
  # stops at the limit, or is closed by the caller.
  with gzip.open(path, 'rb') as archive:
    for row_number, raw_line in enumerate(archive, start=1):
      yield json.loads(raw_line)
      if row_number == num_rows:
        break

def getDF(path, num_rows):
  """Load up to ``num_rows`` records from ``path`` into a DataFrame.

  Records are produced by ``parse``; each one becomes a row labelled with its
  0-based position in the stream.
  """
  records_by_position = dict(enumerate(parse(path, num_rows)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load at most 1,500,000 reviews; a smaller cap (e.g. 50000) is handy for quick runs.
df = getDF(path='./data/reviews_Movies_and_TV_5.json.gz', num_rows=1500000)

In [3]:
# Row counts used further down when slicing the train and test samples.
train_sample_size = 100_000
test_sample_size = 20_000

In [4]:
# Remove every double-quote character from the two free-text columns
# (plain substring replacement, no regular expression involved).
df['reviewText'] = df['reviewText'].str.replace('"', '', regex=False)
df['summary'] = df['summary'].str.replace('"', '', regex=False)

In [5]:
# Hold out the last 100,000 loaded reviews (used below to build the BERT input)
# and keep the first 1,400,000 for the rest of the pipeline.
# NOTE: the two parts are disjoint only when at least 1,500,000 rows were loaded.
# .copy() makes bert_training an independent frame, so the column added to it
# in a later cell does not trigger SettingWithCopyWarning.
bert_training = df.tail(100000).copy()
df = df.head(1400000)

In [27]:
# Combine the Rotten Tomatoes critic reviews with the held-out Amazon reviews
rotten_df = pd.read_csv('./data/rotten_tomatoes_critic_reviews.csv.zip')
# Keep only rows that have both a critic name and review text.
rotten_df = rotten_df.dropna(subset=['critic_name', 'review_content'])

rotten_df_num_rows = 100000
# A fixed seed makes the sample (and the CSV exported from it) reproducible.
rotten_df_seed = 42
rotten_df = rotten_df[['review_content']].sample(n=rotten_df_num_rows, random_state=rotten_df_seed)

rotten_df['review_content'] = rotten_df['review_content'].str.replace('"', '', regex=False)

# dataSource marks the origin of each row: 1.0 = Rotten Tomatoes, 0.0 = Amazon.
rotten_df['dataSource'] = np.ones(len(rotten_df))
# assign() rebinds bert_training to a new frame carrying the extra column, which
# avoids writing into what may be a slice of df (SettingWithCopyWarning).
bert_training = bert_training.assign(dataSource=np.zeros(len(bert_training)))

# rename() returns a new frame, so no column labels are overwritten on a slice.
mix_df = bert_training[['reviewText', 'dataSource']].rename(columns={'reviewText': 'review_content'})
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
mix_df = pd.concat([mix_df, rotten_df], ignore_index=True)

mix_df

Unnamed: 0,review_content,dataSource
0,The original cast is back (sans one) and they ...,0.0
1,I really enjoyed this movie.Great story line. ...,0.0
2,I love this series and this latest one is of c...,0.0
3,"**1/2Fast Five, so named because it is the fif...",0.0
4,Slow to get going but after that.... lots of b...,0.0
...,...,...
199995,"Because we haven't been told what to think, th...",1.0
199996,It's a rare piece of family entertainment that...,1.0
199997,The strength of director Jacques Audiard's fil...,1.0
199998,"The film is beautiful to look at, rotoscoped w...",1.0


In [31]:
# Export the combined review frame as CSV.
mix_df.to_csv(path_or_buf='./preprocessed/bert_input/mix_df.csv')
# Disabled: pickling the held-out Amazon reviews.
# bert_training.to_pickle('./preprocessed/bert_input/bert_training.pkl')

In [7]:
# Expose the 'asin' column under the name 'movieID' from here on.
df = df.rename({'asin': 'movieID'}, axis='columns')
df

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ADZPIG9QOCDG5,0005019281,"Alice L. Larson ""alice-loves-books""","[0, 0]",This is a charming version of the classic Dick...,4.0,good version of a classic,1203984000,"02 26, 2008"
1,A35947ZP82G7JH,0005019281,Amarah Strack,"[0, 0]",It was good but not as emotionally moving as t...,3.0,Good but not as moving,1388361600,"12 30, 2013"
2,A3UORV8A9D5L2E,0005019281,Amazon Customer,"[0, 0]","Don't get me wrong, Winkler is a wonderful cha...",3.0,Winkler's Performance was ok at best!,1388361600,"12 30, 2013"
3,A1VKW06X1O2X7V,0005019281,"Amazon Customer ""Softmill""","[0, 0]",Henry Winkler is very good in this twist on th...,5.0,It's an enjoyable twist on the classic story,1202860800,"02 13, 2008"
4,A3R27T4HADWFFJ,0005019281,BABE,"[0, 0]",This is one of the best Scrooge movies out. H...,4.0,Best Scrooge yet,1387670400,"12 22, 2013"
...,...,...,...,...,...,...,...,...,...
1399995,A13IAD6WL2Y8D0,B004EPYZQC,Revae,"[1, 1]",Let me preface this review by saying that if y...,5.0,My favorite of the movies,1322438400,"11 28, 2011"
1399996,A3OBJ6YTIK70LJ,B004EPYZQC,"ricky norris ""sly""","[0, 0]",I have seen a lot of movies involving a bank r...,5.0,THE FAST vs. THE FURIOUS.,1378857600,"09 11, 2013"
1399997,A2GISQ90CZVKZC,B004EPYZQC,RKG,"[0, 0]",It is hard to review a song or video... what c...,5.0,Love it.,1370131200,"06 2, 2013"
1399998,A218E0QIPUKP4B,B004EPYZQC,"R. M. Bigger ""dragon keeper""","[0, 0]",I liked it. You get exactly what you expect f...,4.0,Very action packed.,1356480000,"12 26, 2012"


In [8]:
# Keep only reviewers who have at least 5 reviews in df.
# Mapping each row to its reviewer's total count gives a vectorised mask that
# selects the same rows (same order, same index) as
# groupby('reviewerID').filter(lambda x: len(x) >= 5), without calling a Python
# lambda once per reviewer. Rows with a missing reviewerID map to NaN and are
# dropped, as they were by filter().
# .copy() detaches the result so later column assignments do not warn.
df = df[df['reviewerID'].map(df['reviewerID'].value_counts()) >= 5].copy()

In [9]:
# Convert the reviewTime column to pandas datetimes, letting pandas infer the format.
df['reviewTime'] = df['reviewTime'].pipe(pd.to_datetime)

In [10]:
# Order the reviews from oldest to newest; the per-reviewer numbering computed
# in the following cells follows this row order.
df = df.sort_values('reviewTime', ascending=True)
df

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
263519,A1127LKNR08JJK,630174411X,acwpython@juno.com,"[0, 4]",The movie was ok but it did lack a few things....,4.0,GREAT,879379200,1997-11-13
17471,A1127LKNR08JJK,0767824571,acwpython@juno.com,"[0, 0]","This movie rules. It has humor, love, suspens...",5.0,Best Movie Ever!!!,879465600,1997-11-14
49404,A1127LKNR08JJK,0783222068,acwpython@juno.com,"[0, 0]",HA HA HA!!! THose were the first words anyone ...,5.0,Hilarious,879465600,1997-11-14
26150,A1127LKNR08JJK,0780022181,acwpython@juno.com,"[1, 2]",This is one of the best movies in the world. ...,5.0,This should get a 15,879897600,1997-11-19
355639,A1127LKNR08JJK,6303094880,acwpython@juno.com,"[2, 3]",This is so funny. It has a lot of good jokes ...,5.0,Awesome,879897600,1997-11-19
...,...,...,...,...,...,...,...,...,...
226605,AX3YHACFZOI40,6300247406,,"[0, 0]",funny and weird,5.0,Five Stars,1405900800,2014-07-21
786104,AX3YHACFZOI40,B0000VCZMK,,"[0, 0]",gross but funny too!,5.0,Five Stars,1405900800,2014-07-21
305722,AX3YHACFZOI40,6302283612,,"[0, 0]",scary great movie,5.0,Five Stars,1405900800,2014-07-21
492742,AX3YHACFZOI40,B000005XN6,,"[0, 0]",took 30 years to find. worth the wait!,5.0,Five Stars,1405900800,2014-07-21


In [11]:
# Preview: 0-based position of every review among its reviewer's rows, in current row order.
df.groupby(by='reviewerID').cumcount(ascending=True)

263519     0
17471      1
49404      2
26150      3
355639     4
          ..
226605     4
786104     5
305722     6
492742     7
1056025    7
Length: 1303092, dtype: int64

In [12]:
# Store each review's 0-based position within its reviewer's rows (current row order).
df['reviewer_count'] = df.groupby(by='reviewerID').cumcount(ascending=True)

In [13]:
# Quick look at the first five rows, now including reviewer_count.
df.head(n=5)

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,reviewer_count
263519,A1127LKNR08JJK,630174411X,acwpython@juno.com,"[0, 4]",The movie was ok but it did lack a few things....,4.0,GREAT,879379200,1997-11-13,0
17471,A1127LKNR08JJK,0767824571,acwpython@juno.com,"[0, 0]","This movie rules. It has humor, love, suspens...",5.0,Best Movie Ever!!!,879465600,1997-11-14,1
49404,A1127LKNR08JJK,0783222068,acwpython@juno.com,"[0, 0]",HA HA HA!!! THose were the first words anyone ...,5.0,Hilarious,879465600,1997-11-14,2
26150,A1127LKNR08JJK,0780022181,acwpython@juno.com,"[1, 2]",This is one of the best movies in the world. ...,5.0,This should get a 15,879897600,1997-11-19,3
355639,A1127LKNR08JJK,6303094880,acwpython@juno.com,"[2, 3]",This is so funny. It has a lot of good jokes ...,5.0,Awesome,879897600,1997-11-19,4


In [14]:
# Per reviewer, rows at positions 0-4 (reviewer_count below 5) go to train;
# rows at position 5 and beyond go to test.
train = df.loc[df['reviewer_count'].lt(5)]
test = df.loc[df['reviewer_count'].ge(5)]

In [15]:
# Replace the row labels inherited from df with a fresh 0..n-1 index;
# drop=True discards the old labels instead of keeping them as a column.
train = train.reset_index(drop=True)
train

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,reviewer_count
0,A1127LKNR08JJK,630174411X,acwpython@juno.com,"[0, 4]",The movie was ok but it did lack a few things....,4.0,GREAT,879379200,1997-11-13,0
1,A1127LKNR08JJK,0767824571,acwpython@juno.com,"[0, 0]","This movie rules. It has humor, love, suspens...",5.0,Best Movie Ever!!!,879465600,1997-11-14,1
2,A1127LKNR08JJK,0783222068,acwpython@juno.com,"[0, 0]",HA HA HA!!! THose were the first words anyone ...,5.0,Hilarious,879465600,1997-11-14,2
3,A1127LKNR08JJK,0780022181,acwpython@juno.com,"[1, 2]",This is one of the best movies in the world. ...,5.0,This should get a 15,879897600,1997-11-19,3
4,A1127LKNR08JJK,6303094880,acwpython@juno.com,"[2, 3]",This is so funny. It has a lot of good jokes ...,5.0,Awesome,879897600,1997-11-19,4
...,...,...,...,...,...,...,...,...,...,...
448565,AX3YHACFZOI40,B000058TIE,,"[0, 0]",funny!,5.0,Five Stars,1405900800,2014-07-21,0
448566,AX3YHACFZOI40,B0000542CM,,"[0, 0]",love him!!!,5.0,Five Stars,1405900800,2014-07-21,1
448567,AX3YHACFZOI40,0792843606,,"[0, 0]",great classic movie,5.0,Five Stars,1405900800,2014-07-21,2
448568,AX3YHACFZOI40,6303588980,,"[0, 0]",fun movie,5.0,Five Stars,1405900800,2014-07-21,3


In [16]:
# Replace the row labels inherited from df with a fresh 0..n-1 index;
# drop=True discards the old labels instead of keeping them as a column.
test = test.reset_index(drop=True)
test

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,reviewer_count
0,A1127LKNR08JJK,6302952727,acwpython@juno.com,"[2, 51]",THis is the shortest movie but the funniest. ...,5.0,no,879897600,1997-11-19,5
1,A1127LKNR08JJK,6300248526,acwpython@juno.com,"[0, 4]",This is one of the greatest shows i have ever ...,5.0,why do we have to fill this out?,879897600,1997-11-19,6
2,A1127LKNR08JJK,079283318X,acwpython@juno.com,"[0, 0]",This is the best 007 movie ever. It has a lot...,5.0,Suspensful,882489600,1997-12-19,7
3,A25W58TZK09YET,079074774X,Charles Culbertson (cculbert@cfw.com),"[13, 14]",Errol Flynn made several films about World War...,5.0,Exceptional Combat Drama,892771200,1998-04-17,5
4,A25W58TZK09YET,6302682606,Charles Culbertson (cculbert@cfw.com),"[56, 59]",&quot;The Edge of Darkness&quot; is one of the...,5.0,Gripping World War II Drama,892771200,1998-04-17,6
...,...,...,...,...,...,...,...,...,...,...
854517,A11YIHB6IW352W,B00006FDHI,Theodore J. Jackson Sr.,"[0, 0]",One of my favorite movies. Have watched it sev...,5.0,Five Stars,1405900800,2014-07-21,21
854518,AX3YHACFZOI40,B0000VCZMK,,"[0, 0]",gross but funny too!,5.0,Five Stars,1405900800,2014-07-21,5
854519,AX3YHACFZOI40,6302283612,,"[0, 0]",scary great movie,5.0,Five Stars,1405900800,2014-07-21,6
854520,AX3YHACFZOI40,B000005XN6,,"[0, 0]",took 30 years to find. worth the wait!,5.0,Five Stars,1405900800,2014-07-21,7


In [34]:
# .loc label slices include the end label, so stopping at train_sample_size - 1
# keeps labels 0 .. train_sample_size - 1 of the re-indexed train frame.
train_sample = train.loc[
  :train_sample_size - 1,
  ['reviewerID', 'movieID', 'reviewText', 'summary', 'overall'],
]
train_sample

Unnamed: 0,reviewerID,movieID,reviewText,summary,overall
0,A1127LKNR08JJK,630174411X,The movie was ok but it did lack a few things....,GREAT,4.0
1,A1127LKNR08JJK,0767824571,"This movie rules. It has humor, love, suspens...",Best Movie Ever!!!,5.0
2,A1127LKNR08JJK,0783222068,HA HA HA!!! THose were the first words anyone ...,Hilarious,5.0
3,A1127LKNR08JJK,0780022181,This is one of the best movies in the world. ...,This should get a 15,5.0
4,A1127LKNR08JJK,6303094880,This is so funny. It has a lot of good jokes ...,Awesome,5.0
...,...,...,...,...,...
99995,A9TARC4EV6EXK,B0000C2IVA,This wonderful near miss of a movie was terrib...,Ayanna will slice your soul,5.0
99996,A1ART5LIGLOVZO,B0000B1OC4,"Actually, it is extremely hard to find a good ...",A Very Biased Opinion of the Fast Food Industr...,3.0
99997,A1ZJBZYI14PR9B,B0009B16TE,The quality of the DVD and the series is excep...,Bewitched-The Complete First Season,5.0
99998,A1AVXIRWZ87MZM,0790731002,Forget about what everone elese said this film...,''WHO IS BATMAN?'',5.0


In [35]:
# Persist the training sample: all selected columns as a protocol-4 pickle,
# and the review text alone as CSV.
train_sample.to_pickle(path='./preprocessed/collaborative_input/train_sample.pkl', protocol=4)
train_sample[['reviewText']].to_csv(path_or_buf='./preprocessed/bert_input/train_sample.csv')

In [36]:
# .loc label slices include the end label, so this keeps labels
# 0 .. test_sample_size - 1 of the re-indexed test frame, with every column.
test_sample = test.loc[:test_sample_size - 1, :]
# Use protocol 4, matching the train_sample pickle, so both files can be read
# by the same set of Python versions (the default protocol may be newer).
test_sample.to_pickle('./preprocessed/collaborative_input/test_sample.pkl', protocol=4)

In [38]:
# Write only the review text of the test sample as CSV.
test_sample[['reviewText']].to_csv(path_or_buf='./preprocessed/bert_input/test_sample.csv')

In [21]:
# TODO: move the remaining cells of this notebook into a new notebook.

In [9]:
# Summary statistics of the number of non-null 'overall' ratings per reviewer in df.
df.groupby('reviewerID')['overall'].count().describe()

count    122155.000000
mean         11.460849
std          36.461648
min           1.000000
25%           4.000000
50%           6.000000
75%           9.000000
max        2295.000000
Name: overall, dtype: float64

In [11]:
# Number of non-null 'overall' ratings each reviewer has in train.
reviewer_count = train.groupby('reviewerID')['overall'].count()

In [12]:
# Turn the per-reviewer counts into a two-column frame: reviewerID, reviewer_count.
reviewer_count = (
  reviewer_count
  .reset_index()
  .rename({'overall': 'reviewer_count'}, axis='columns')
)

In [13]:
# Number of non-null 'overall' ratings each movie has in train.
movie_count = train.groupby('movieID')['overall'].count()

In [14]:
# Turn the per-movie counts into a two-column frame: movieID, movie_count.
movie_count = (
  movie_count
  .reset_index()
  .rename({'overall': 'movie_count'}, axis='columns')
)
movie_count

Unnamed: 0,movieID,movie_count
0,0005019281,45
1,0005119367,54
2,0307141985,5
3,0307142469,26
4,0307142477,6
...,...,...
41880,B004EPYZO4,86
41881,B004EPYZOO,108
41882,B004EPYZP8,155
41883,B004EPYZPS,281


In [15]:
# test already carries a 'reviewer_count' column (the per-review running index
# assigned to df before the train/test split). Merging a second column with the
# same name would yield 'reviewer_count_x' / 'reviewer_count_y' and leave no
# 'reviewer_count' column for the following cell, which reads it.
# Drop the old column first so the per-reviewer count from train lands under the
# expected name; errors='ignore' keeps the cell safe when the column is absent.
test = test.drop(columns='reviewer_count', errors='ignore').merge(reviewer_count, on='reviewerID', how='left')

In [16]:
# Attach the per-movie training counts; a left join keeps every test row.
test = test.merge(movie_count, how='left', on='movieID')
# Reviewers or movies with no match get a count of 0 instead of NaN.
test[['reviewer_count', 'movie_count']] = test[['reviewer_count', 'movie_count']].fillna(0)
test

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,reviewer_count,movie_count
0,A3NPBGHQDKBB13,B0001J20QW,Cindy R,"[0, 0]",Great holiday movie. So glad to be able to pur...,5.0,Wonderful!,1357084800,2013-01-02,4.0,30.0
1,A3IXZJ1L73DYTA,B002GSXKQU,Jack E. Levic,"[1, 1]",I've reviewed the four films separately so I j...,4.0,SET OF 4 CLASSIC HOLIDAY FILMS,1357084800,2013-01-02,123.0,38.0
2,A2TUJ8W8UCIROQ,6300216748,chuck,"[0, 1]",Great spy thriller. has a good beginning that...,5.0,Great Movie,1357084800,2013-01-02,1.0,94.0
3,AZFQHNW5FLCMU,B000A3XY8C,Sue E. Devecka,"[0, 0]",I ordered this workout with my amazon.com gift...,5.0,Loved this workout!,1357084800,2013-01-02,2.0,23.0
4,A3UT71DE0NS65Y,0792147618,Otis L. Solomon,"[0, 1]",Great Great mivie in the Alex Cross series pla...,5.0,Kiss the Girls,1357084800,2013-01-02,4.0,66.0
...,...,...,...,...,...,...,...,...,...,...,...
402309,A2P23X9QF6OXUT,B000YQAMZI,"Rachel Dale ""Consider""","[0, 0]",I was in a Meryl Streep mood. I always like se...,4.0,An excellent performance,1405987200,2014-07-22,0.0,27.0
402310,A39UZORCGHK80U,B0000C0FFV,,"[0, 0]",I had been looking for this film for quite awh...,4.0,Thought it was one of Reynolds best fulms.,1405987200,2014-07-22,0.0,8.0
402311,A2HCF3U745WQAK,B000IJ79U2,,"[0, 0]",I got the whole set ( well... 3; I think that...,4.0,I got the whole set ( well... ...,1405987200,2014-07-22,0.0,8.0
402312,A336FCPK8ZVMYT,B000NDDU1K,"Gerald Williams ""booklistener""","[0, 0]",Old people like me need subtitles.,4.0,Four Stars,1405987200,2014-07-22,6.0,3.0
