In [1]:
import pandas as pd
import numpy as np
import gzip
import json

In [6]:
def parse(path, num_rows):
  """Lazily yield decoded records from a gzip-compressed JSON-lines file.

  Args:
    path: Location of the ``.json.gz`` file; each line must hold one JSON document.
    num_rows: Maximum number of records to yield. A value that is never
      reached (zero, negative, or larger than the file) yields every record.

  Yields:
    The object produced by ``json.loads`` for each line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a line is not valid JSON.
  """
  # The context manager guarantees the archive is closed when the generator
  # finishes, is closed early by the consumer, or is garbage-collected.
  with gzip.open(path, 'rb') as g:
    for i, l in enumerate(g, start=1):
      yield json.loads(l)
      # Stop once exactly num_rows records have been handed out.
      if i == num_rows:
        break

def getDF(path, num_rows):
  """Load up to ``num_rows`` parsed records from ``path`` into a DataFrame.

  Each record becomes one row; rows are labelled 0, 1, 2, ... in the order
  the records are produced by ``parse``.
  """
  # Key every record by its position so from_dict can use the keys as the index.
  records_by_position = dict(enumerate(parse(path, num_rows)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the Movies and TV review archive. The 2,000,000-row cap is above the
# 1,697,533 rows reported by df.info(), so the whole file is read; pass a
# smaller cap (e.g. 50000) for a quick experiment.
df = getDF(path='./data/reviews_Movies_and_TV_5.json.gz', num_rows=2000000)

In [7]:
# Remove every double-quote character from the two free-text columns.
for text_column in ('reviewText', 'summary'):
  df[text_column] = df[text_column].str.replace('"','')

In [8]:
# The product identifier column ('asin') is referred to as 'movieID' from here on.
df = df.rename({'asin': 'movieID'}, axis='columns')

In [9]:
# Number of reviews per reviewer across the full data set, most active first.
df['reviewerID'].value_counts()

A3LZGLA88K0LA0    2368
A16CZRQL23NOIW    2333
ANCOMAI0I7LVG     2322
ABO2ZI2Y5DQ9T     2301
A328S9RN3U5M68    2267
                  ... 
A3O5LOR74EPN53       5
A33CZ0M2GF9C7L       5
A14XAK6LATEGCJ       5
A27JVNK885JP43       5
A1ZC83XEZ7BERW       5
Name: reviewerID, Length: 123960, dtype: int64

In [10]:
# Display the review table; the rendering is truncated to the first and last rows.
df

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ADZPIG9QOCDG5,0005019281,"Alice L. Larson ""alice-loves-books""","[0, 0]",This is a charming version of the classic Dick...,4.0,good version of a classic,1203984000,"02 26, 2008"
1,A35947ZP82G7JH,0005019281,Amarah Strack,"[0, 0]",It was good but not as emotionally moving as t...,3.0,Good but not as moving,1388361600,"12 30, 2013"
2,A3UORV8A9D5L2E,0005019281,Amazon Customer,"[0, 0]","Don't get me wrong, Winkler is a wonderful cha...",3.0,Winkler's Performance was ok at best!,1388361600,"12 30, 2013"
3,A1VKW06X1O2X7V,0005019281,"Amazon Customer ""Softmill""","[0, 0]",Henry Winkler is very good in this twist on th...,5.0,It's an enjoyable twist on the classic story,1202860800,"02 13, 2008"
4,A3R27T4HADWFFJ,0005019281,BABE,"[0, 0]",This is one of the best Scrooge movies out. H...,4.0,Best Scrooge yet,1387670400,"12 22, 2013"
...,...,...,...,...,...,...,...,...,...
1697528,AV657BUYHHXZ2,B00LT1JHLW,"Mike Rules ""Mike""","[1, 14]",wow $269.99 for the entire series on Blu Ray??...,1.0,Way to Expensive!! WB = GREED,1406073600,"07 23, 2014"
1697529,A17W587EH23J0Q,B00LT1JHLW,"Ron2900 ""Ron""","[32, 48]","Finally, the holy grail of tv-on-dvd boxsets i...",5.0,"HOLY BAT-BOXSET, BATMAN... I never thought thi...",1405641600,"07 18, 2014"
1697530,A3DE438TF1A958,B00LT1JHLW,thomas henry,"[3, 10]",Could this be a true or I'm i dreaming batman ...,5.0,prayers have been answered because batman 60s ...,1405728000,"07 19, 2014"
1697531,A2RWCXDMANY0LW,B00LT1JHLW,wheev,"[0, 4]",I've been a fan of the series since I was a yo...,5.0,can't Wait!,1405987200,"07 22, 2014"


In [11]:
# Print the frame summary: index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1697533 entries, 0 to 1697532
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   reviewerID      object 
 1   movieID         object 
 2   reviewerName    object 
 3   helpful         object 
 4   reviewText      object 
 5   overall         float64
 6   summary         object 
 7   unixReviewTime  int64  
 8   reviewTime      object 
dtypes: float64(1), int64(1), object(7)
memory usage: 129.5+ MB


In [12]:
# Order reviews chronologically (oldest first) by their Unix timestamp.
df = df.sort_values(by=['unixReviewTime'], ascending=True)

In [13]:
# Temporal cut-off in Unix seconds (1300000000 is 2011-03-13 07:06:40 UTC).
split_time = 1_300_000_000
# Training pool: every review written strictly before the cut-off.
df_split = df.loc[df['unixReviewTime'].lt(split_time)]

In [14]:
# Number of reviews available before the cut-off.
df_split.shape[0]

826627

In [96]:
# Draw a reproducible random subset of 200,000 pre-cut-off reviews for training.
train_df = df_split.sample(n=200000, random_state=42)

In [97]:
# Hold-out candidates: every review at or after the cut-off. The training pool
# uses `<`, so `>=` here makes the two sides a true partition; with `>` a review
# stamped exactly at split_time would be dropped from both sets.
test_df = df[df['unixReviewTime'] >= split_time]

In [98]:
# Number of hold-out candidate reviews.
test_df.shape[0]

870906

In [99]:
# Reviews per reviewer inside the sampled training set, most active first.
train_df['reviewerID'].value_counts()

A16CZRQL23NOIW    532
A2NJO6YE954DBH    482
A3LZGLA88K0LA0    479
A328S9RN3U5M68    406
ABO2ZI2Y5DQ9T     385
                 ... 
A2O0NBU7ZCUF7C      1
ATS2855497V0I       1
A2SGJYNBVIJ1EA      1
ASN79GBT3QDYJ       1
A12A4I4X5SCZPM      1
Name: reviewerID, Length: 47796, dtype: int64

In [100]:
# Keep reviewers with at least 5 sampled reviews, then movies with at least 5 of
# the remaining reviews. Each filter runs once, in this order, so removing sparse
# movies can leave some reviewers with fewer than 5 rows again.
# A vectorised group-size mask selects the same rows as
# groupby().filter(lambda x: len(x) >= 5) without a Python callback per group.
train_df = train_df[
    train_df.groupby('reviewerID')['reviewerID'].transform('size') >= 5
]
train_df = train_df[
    train_df.groupby('movieID')['movieID'].transform('size') >= 5
]

In [101]:
# Training rows left after the minimum-count filters.
train_df.shape[0]

88116

In [102]:
# Distinct reviewer IDs present in the filtered training set.
train_reviewers = train_df['reviewerID'].unique()

In [103]:
# Drop hold-out reviews written by reviewers that never appear in training.
test_df = test_df.loc[test_df['reviewerID'].isin(train_reviewers)]

In [104]:
# Reviews per reviewer in the hold-out set, most active first.
test_df['reviewerID'].value_counts()

AV6QDP8Q0ONK4     1660
ANCOMAI0I7LVG     1007
A29TKSIWA3JKF3     842
A1XT8AJB7S9JJG     835
A27H9DOUGY9FOS     754
                  ... 
A1DFQCYOKQ64CU       1
A35K84RT618N3A       1
A5CWLQF6QO3CN        1
A1WL2EKE4TAEJP       1
AR37967IAR6ZT        1
Name: reviewerID, Length: 3963, dtype: int64

In [105]:
# Distinct movie IDs present in the filtered training set, and how many there are.
train_movies = train_df['movieID'].unique()
train_movies.size

6692

In [106]:
# Drop hold-out reviews of movies that never appear in training.
test_df = test_df.loc[test_df['movieID'].isin(train_movies)]

In [107]:
# Hold-out rows left after restricting to known reviewers and movies.
test_df.shape[0]

24713

In [108]:
# Keep only the columns that are written out. The explicit .copy() makes each
# sample an independent frame, so the column assignments in the following cells
# do not raise SettingWithCopyWarning or depend on train_df / test_df.
train_sample = train_df[['reviewerID','movieID','reviewText','summary','overall']].copy()
test_sample = test_df[['reviewerID','movieID','reviewText','summary','overall']].copy()

In [109]:
# Strip double quotes from the text columns of the training sample.
# assign() returns a new frame instead of writing into one derived from train_df,
# which avoids SettingWithCopyWarning. The quotes were already removed from df
# earlier in the notebook, so this is expected to be a no-op safeguard.
train_sample = train_sample.assign(
    reviewText=train_sample['reviewText'].str.replace('"', ''),
    summary=train_sample['summary'].str.replace('"', ''),
)

In [110]:
# Strip double quotes from the text columns of the hold-out sample.
# assign() returns a new frame instead of writing into one derived from test_df,
# which avoids SettingWithCopyWarning. The quotes were already removed from df
# earlier in the notebook, so this is expected to be a no-op safeguard.
test_sample = test_sample.assign(
    reviewText=test_sample['reviewText'].str.replace('"', ''),
    summary=test_sample['summary'].str.replace('"', ''),
)

In [111]:
# Persist the training sample: the full frame as a pickle for the collaborative
# model, and the review text alone (with the row index) as CSV for the BERT input.
train_sample.to_pickle(
    './preprocessed/collaborative_input/new_train_sample.pkl',
    protocol=4,
)
train_sample.loc[:, ['reviewText']].to_csv('./preprocessed/bert_input/new_train_sample.csv')

In [112]:
# Persist the hold-out sample: the full frame as a pickle for the collaborative
# model, and the review text alone (with the row index) as CSV for the BERT input.
test_sample.to_pickle(
    './preprocessed/collaborative_input/new_test_sample.pkl',
    protocol=4,
)
test_sample.loc[:, ['reviewText']].to_csv('./preprocessed/bert_input/new_test_sample.csv')

In [113]:
# Display the hold-out sample that was written out (rendering is truncated by pandas).
test_sample

Unnamed: 0,reviewerID,movieID,reviewText,summary,overall
646479,A3PG0F1KA7ZT3O,B00005JP97,The rhythm of the Blues is birthed down and di...,I'm talkin' about the Blues... Son House,5.0
432444,A2GPN2VKS8E77C,6304696515,This adaptation of the novel Les Liaisons Dang...,Sex Is Best Used As A Deadly Weapon. Excellent...,5.0
1297267,A2MIO2RW0JE0C6,B002WNUVJS,Compelling storytelling... Beautiful art direc...,Marvel Animated raises its game with PLANET HULK,5.0
1390010,A3A4ZAIBQWKOZS,B0049P1VHS,"T obe clear, this is a review ofThe Walking De...",Great release for a great series,5.0
226942,A106016KSI0YQ,6300247651,This rich little horror film about a single mo...,All in her head?,4.0
...,...,...,...,...,...
1317496,A2QVQB7D93XIPA,B0031RAOVY,"A touching tale of love, devotion through the ...","Love, loyalty and devotion",5.0
90991,A3G2OSOP4XKFZL,0790729644,This is a truly great movie. When ever I sens...,They have a sign on him,5.0
8564,A3O536E2YOKK4Y,0767807588,"Released in 1993, Bruce Willis' &#34;Striking ...",Comic book cop thriller in Pittsburgh,4.0
876721,A3MV1KKHX51FYT,B00074DY16,"Return to Peyton Place, 1961 filmAllison MacKe...",Publicizing a Small Town,4.0


In [114]:
# Display the training sample that was written out (rendering is truncated by pandas).
train_sample

Unnamed: 0,reviewerID,movieID,reviewText,summary,overall
171125,AER15RIMV8E6D,1415724784,This was pretty boring. Listening to the guy w...,An Inconvenient Bore!!!,2.0
407185,A1D6L5H76KMV6P,630420065X,"A great vision of the old versions, comes with...",the First Mission to get crazy!!,5.0
808445,A2RRO07OFPV06B,B0001JXOUS,A civil war Captain travels to Japan to help t...,SUPERB ENTERTAINMENT,5.0
364763,AL7LEBKKW8XCC,6303213650,"Because of their sucess with Fargo, people ten...",forgotten coen brothers,5.0
5562,A355WMP0CQBJFX,0767802497,who hasn't seen this movie. there's no need to...,BROUGHT SOME MAD FEAR OF ANACONDAS!!!,4.0
...,...,...,...,...,...
929949,A1Q15TP2FXYYMF,B000AOEPU2,I truly mean the title and here is why. Bret H...,The Exelence Of Execution Has A DVD Finally,5.0
907541,ADTUS88NFTW23,B0009KA7BS,My introduction to Alan Moore's John Constanti...,"Worth watching, but didn't blow me away.",3.0
1119076,A9RNMO9MUSMTJ,B000YDOOEQ,Its like 2 movies in one....a fairy tale and a...,Solid 3 stars,3.0
1338775,A2OZBJ58CML9OS,B003EV6DBM,"OK, it's a little premature to judge something...",Brilliant revision of a beloved series,5.0
