In [1]:
import pandas as pd
import numpy as np
import gzip
import json

In [6]:
def parse(path, num_rows=None):
  """Lazily yield decoded JSON records from a gzip-compressed file.

  Each line of the file is passed to ``json.loads`` and the result is
  yielded, one record per line.

  Args:
    path: Location of the gzip file to read.
    num_rows: Stop after this many records have been yielded. ``None``
      (the default) reads the whole file.

  Yields:
    The object decoded from each line, in file order.
  """
  # The context manager closes the gzip handle when the generator is
  # exhausted, stops early at the row limit, or is closed by its consumer.
  with gzip.open(path, 'rb') as g:
    for i, l in enumerate(g, start=1):
      yield json.loads(l)
      if i == num_rows:
        break

def getDF(path, num_rows):
  """Build a DataFrame from the records produced by ``parse``.

  Records are keyed by their 0-based position in the stream, and that
  position becomes the row index of the resulting frame.

  Args:
    path: Forwarded to ``parse``.
    num_rows: Forwarded to ``parse``.

  Returns:
    A ``pandas.DataFrame`` with one row per parsed record.
  """
  records_by_position = dict(enumerate(parse(path, num_rows)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the reviews into a DataFrame; the second argument caps how many rows are read.
# A smaller cap (e.g. 50000) is handy for quick experiments.
# NOTE(review): 2000000 presumably exceeds the file's row count so everything is loaded - confirm.
df = getDF('./data/reviews_Movies_and_TV_5.json.gz', 2000000)

In [7]:
# Remove double-quote characters from the free-text columns.
# regex=False pins a literal match so the result does not depend on the
# pandas version's default for the `regex` argument.
df['reviewText'] = df['reviewText'].str.replace('"', '', regex=False)
df['summary'] = df['summary'].str.replace('"', '', regex=False)

In [8]:
# Rename the 'asin' column to 'movieID'; later cells refer to it by that name.
df = df.rename(columns={'asin':'movieID'})

In [9]:
# Number of reviews per reviewer across the full dataset.
df.reviewerID.value_counts()

A3LZGLA88K0LA0    2368
A16CZRQL23NOIW    2333
ANCOMAI0I7LVG     2322
ABO2ZI2Y5DQ9T     2301
A328S9RN3U5M68    2267
                  ... 
A3O5LOR74EPN53       5
A33CZ0M2GF9C7L       5
A14XAK6LATEGCJ       5
A27JVNK885JP43       5
A1ZC83XEZ7BERW       5
Name: reviewerID, Length: 123960, dtype: int64

In [10]:
# Display the loaded frame (the notebook renders a truncated preview).
df

Unnamed: 0,reviewerID,movieID,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ADZPIG9QOCDG5,0005019281,"Alice L. Larson ""alice-loves-books""","[0, 0]",This is a charming version of the classic Dick...,4.0,good version of a classic,1203984000,"02 26, 2008"
1,A35947ZP82G7JH,0005019281,Amarah Strack,"[0, 0]",It was good but not as emotionally moving as t...,3.0,Good but not as moving,1388361600,"12 30, 2013"
2,A3UORV8A9D5L2E,0005019281,Amazon Customer,"[0, 0]","Don't get me wrong, Winkler is a wonderful cha...",3.0,Winkler's Performance was ok at best!,1388361600,"12 30, 2013"
3,A1VKW06X1O2X7V,0005019281,"Amazon Customer ""Softmill""","[0, 0]",Henry Winkler is very good in this twist on th...,5.0,It's an enjoyable twist on the classic story,1202860800,"02 13, 2008"
4,A3R27T4HADWFFJ,0005019281,BABE,"[0, 0]",This is one of the best Scrooge movies out. H...,4.0,Best Scrooge yet,1387670400,"12 22, 2013"
...,...,...,...,...,...,...,...,...,...
1697528,AV657BUYHHXZ2,B00LT1JHLW,"Mike Rules ""Mike""","[1, 14]",wow $269.99 for the entire series on Blu Ray??...,1.0,Way to Expensive!! WB = GREED,1406073600,"07 23, 2014"
1697529,A17W587EH23J0Q,B00LT1JHLW,"Ron2900 ""Ron""","[32, 48]","Finally, the holy grail of tv-on-dvd boxsets i...",5.0,"HOLY BAT-BOXSET, BATMAN... I never thought thi...",1405641600,"07 18, 2014"
1697530,A3DE438TF1A958,B00LT1JHLW,thomas henry,"[3, 10]",Could this be a true or I'm i dreaming batman ...,5.0,prayers have been answered because batman 60s ...,1405728000,"07 19, 2014"
1697531,A2RWCXDMANY0LW,B00LT1JHLW,wheev,"[0, 4]",I've been a fan of the series since I was a yo...,5.0,can't Wait!,1405987200,"07 22, 2014"


In [11]:
# Print the column dtypes and memory usage summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1697533 entries, 0 to 1697532
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   reviewerID      object 
 1   movieID         object 
 2   reviewerName    object 
 3   helpful         object 
 4   reviewText      object 
 5   overall         float64
 6   summary         object 
 7   unixReviewTime  int64  
 8   reviewTime      object 
dtypes: float64(1), int64(1), object(7)
memory usage: 129.5+ MB


In [12]:
# Order the reviews by ascending unixReviewTime.
df = df.sort_values(by='unixReviewTime')

In [13]:
# Cut-off used to split the data by review time.
# NOTE(review): presumably a Unix timestamp in seconds (roughly March 2011) - confirm.
split_time = 1300000000
# Reviews strictly before the cut-off; the training sample is drawn from these.
df_split = df[df['unixReviewTime'] < split_time] 

In [14]:
# Number of reviews before the cut-off.
len(df_split)

826627

In [51]:
# Draw 200000 pre-cut-off reviews as the training pool.
# A fixed random_state makes the sample (and everything derived from it)
# reproducible across kernel restarts.
train_df = df_split.sample(200000, random_state=42)

In [52]:
# Reviews at or after the cut-off form the test pool.
# `>=` complements the `<` used for the training side, so a review stamped
# exactly at split_time is not silently dropped from both sets.
test_df = df[df['unixReviewTime'] >= split_time]

In [53]:
# Number of reviews in the test pool before any filtering.
len(test_df)

870906

In [54]:
# Reviews per reviewer within the sampled training pool.
train_df.reviewerID.value_counts()

A16CZRQL23NOIW    489
A2NJO6YE954DBH    478
A3LZGLA88K0LA0    445
ABO2ZI2Y5DQ9T     422
A2EDZH51XHFA9B    409
                 ... 
A2Z4XQNJR40SSP      1
A244NKEE6EBXB2      1
ANUSBCN463G1M       1
A39476Z41H9VK3      1
A3KJS0TGYIX2O1      1
Name: reviewerID, Length: 47714, dtype: int64

In [55]:
# Keep only reviewers with at least 5 reviews inside the sampled training pool.
# Note: this rebinds train_df, so re-running the cell filters the already-filtered frame.
train_df = train_df.groupby(['reviewerID']).filter(lambda x: len(x) >= 5)

In [63]:
# Number of training reviews left after the per-reviewer filter.
len(train_df)

126349

In [57]:
# Distinct reviewer IDs present in the filtered training set.
train_reviewers = train_df.reviewerID.unique()

In [58]:
# Restrict the test set to reviewers that also appear in the training set.
test_df = test_df[test_df.reviewerID.isin(train_reviewers)]

In [59]:
# Reviews per reviewer in the test set after the reviewer filter.
test_df.reviewerID.value_counts()

AV6QDP8Q0ONK4     1660
ANCOMAI0I7LVG     1007
A29TKSIWA3JKF3     842
A1XT8AJB7S9JJG     835
A27H9DOUGY9FOS     754
                  ... 
ATSQHC64IC8JI        1
AI0PV0FQLGEAY        1
A7YJTD4YOV9GN        1
A2QWXHB5IU8LCU       1
A2VSTCKMUB46LL       1
Name: reviewerID, Length: 4098, dtype: int64

In [60]:
# Distinct movie IDs present in the filtered training set.
train_movies = train_df.movieID.unique()
# Display how many distinct movies that is.
len(train_movies)

27828

In [61]:
# Restrict the test set to movies that also appear in the training set.
test_df = test_df[test_df.movieID.isin(train_movies)]

In [64]:
# Number of test reviews left after the reviewer and movie filters.
len(test_df)

44356

In [65]:
# Keep only the columns that are exported below.
# .copy() makes each sample an independent frame, so later column
# assignments on it cannot trigger SettingWithCopyWarning or touch
# train_df / test_df.
train_sample = train_df[['reviewerID','movieID','reviewText','summary','overall']].copy()
test_sample = test_df[['reviewerID','movieID','reviewText','summary','overall']].copy()

In [69]:
# Remove double-quote characters from the text columns of the training sample.
# assign() rebinds train_sample to a new frame instead of setting columns on a
# frame derived from train_df, and regex=False pins a literal match.
train_sample = train_sample.assign(
    reviewText=train_sample['reviewText'].str.replace('"', '', regex=False),
    summary=train_sample['summary'].str.replace('"', '', regex=False),
)

In [70]:
# Remove double-quote characters from the text columns of the test sample.
# assign() rebinds test_sample to a new frame instead of setting columns on a
# frame derived from test_df, and regex=False pins a literal match.
test_sample = test_sample.assign(
    reviewText=test_sample['reviewText'].str.replace('"', '', regex=False),
    summary=test_sample['summary'].str.replace('"', '', regex=False),
)

In [71]:
# Persist the full training sample as a pickle (protocol 4).
train_sample.to_pickle('./preprocessed/collaborative_input/new_train_sample.pkl', protocol = 4)
# Export only the review text to CSV; the row index is written too since index is left at its default.
# NOTE(review): the target directories are assumed to exist already - confirm.
train_sample[['reviewText']].to_csv('./preprocessed/bert_input/new_train_sample.csv')

In [72]:
# Persist the full test sample as a pickle (protocol 4).
test_sample.to_pickle('./preprocessed/collaborative_input/new_test_sample.pkl', protocol = 4)
# Export only the review text to CSV; the row index is written too since index is left at its default.
# NOTE(review): the target directories are assumed to exist already - confirm.
test_sample[['reviewText']].to_csv('./preprocessed/bert_input/new_test_sample.csv')

In [73]:
# Display the exported test sample (the notebook renders a truncated preview).
test_sample

Unnamed: 0,reviewerID,movieID,reviewText,summary,overall
1404443,A2PR6NXG0PA3KY,B004EQAVA0,"... about this film, or some profound objectio...",I Wish I Had Something Witty to Write ...,4.0
34113,AZ2YLC0GWTAH2,0780625390,Thanks to a friend of mine finally cleaning ou...,The Debut Year Of The Man!,5.0
432444,A2GPN2VKS8E77C,6304696515,This adaptation of the novel Les Liaisons Dang...,Sex Is Best Used As A Deadly Weapon. Excellent...,5.0
1390010,A3A4ZAIBQWKOZS,B0049P1VHS,"T obe clear, this is a review ofThe Walking De...",Great release for a great series,5.0
1405314,A1YLX64YW58CVL,B004FUKLK0,"Really, what more could you ask for? The Les P...",As Good as it Gets!,5.0
...,...,...,...,...,...
16085,A3GGMPRYCQ87CY,0767821513,I watched this over the weekend. I remember ha...,Pretty Bad,2.0
469248,A6VFFRIHAXKRL,6305565724,"Hello out there & welcome to this, this is a r...",WWF Fully Loaded 1999 review 8.5/10,4.0
876721,A3MV1KKHX51FYT,B00074DY16,"Return to Peyton Place, 1961 filmAllison MacKe...",Publicizing a Small Town,4.0
1108225,AQ01Q3070LT29,B000WC39KO,American Pie Presents: Beta House (2007). Unra...,Education or a beta way of life? Lots of sex.,3.0


In [74]:
# Display the exported training sample (the notebook renders a truncated preview).
train_sample

Unnamed: 0,reviewerID,movieID,reviewText,summary,overall
710147,A153MVFR0FTI6O,B00006AUH9,With all of the BORING shows on TV which could...,"NBC (among others), pay attention to what real...",5.0
56502,A4Z72X5C24X2K,0783226799,A classic but demand a widescreen version.,Where's the widescreen version?,5.0
544971,A2LCGB0APDZM65,B00003ETQN,Loved it! All the great sexy videos on a sing...,Well worth it (with a few MINOR reservations),5.0
832687,A1PPIFR51T81IK,B0002F6BI8,I have long supported the B category film noir...,One Great Film,4.0
171209,AMBFK6N4JS5Z2,1415724784,This video made me believe in Global Warning.U...,"I used to believe it, now I'm not sure.",3.0
...,...,...,...,...,...
1036027,A1IWWRZJOXOSYP,B000KX0HH8,This film promised lots of action. It deliver...,Trail of Bodies,3.0
460229,A26BWRBPP4V2WF,6305326940,I know there is a generation of kids out there...,A great DVD edition of one of Pixar's lesser o...,3.0
1095518,A8SCX6VUTE05H,B000UNYJWM,"Hmmmm, this movie looks very familiar. It rese...",Who's your Caddyshack?,1.0
783497,A18758S1PUYIDT,B0000TPA6K,"This is not your typical teen angst show, thou...",Humor and adventure mixed with angst and struggle,5.0
