In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import gzip
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
DATA_DIR = '../data'
VIDEO_GAME_FILE = 'Video_Games_5.json.gz'

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF(os.path.join(DATA_DIR, VIDEO_GAME_FILE))
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [4]:
df2 = df.dropna(subset=['vote'])
# print(df['vote'])
# print(type(df['vote'].values[2]))
print(df2.shape, df.shape)
df2.head()

(107793, 12) (497577, 12)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
11,1.0,False,"05 15, 2012",A1LMJ9W8UX1H5B,700026657,Rob NY,I've bought and played ALL of the ANNO games s...,SAY NO TO DRM!!,1337040000,28,,
12,5.0,True,"08 14, 2011",AN3YYDZAS3O1Y,700099867,Bob,Loved playing Dirt 2 and I thought the graphic...,A step up from Dirt 2 and that is terrific!,1313280000,11,,
15,4.0,True,"06 14, 2011",A361M14PU2GUEG,700099867,Angry Ryan,I had Dirt 2 on Xbox 360 and it was an okay ga...,DIRT 3,1308009600,2,,
16,5.0,False,"06 13, 2011",A2LQCBLLJVVR5T,700099867,Timmiley,This is a must have for any gamer. Codemasters...,BEST GRAPHICS OF ANY GAME SO FAR !!!!,1307923200,14,,
19,4.0,False,"12 26, 2012",A2GPRA9HHLOC4B,700026398,Wicasawakan,I will open with the Pros:\nReplayability - It...,Great game with design flaws,1356480000,12,,


# Drop Useless Columns

In [5]:
df3 = df2.reset_index().drop(columns=[
    'index','reviewTime','reviewerID', 'asin', 'reviewerName', 'unixReviewTime', 'style'])
df3.head()

Unnamed: 0,overall,verified,reviewText,summary,vote,image
0,1.0,False,I've bought and played ALL of the ANNO games s...,SAY NO TO DRM!!,28,
1,5.0,True,Loved playing Dirt 2 and I thought the graphic...,A step up from Dirt 2 and that is terrific!,11,
2,4.0,True,I had Dirt 2 on Xbox 360 and it was an okay ga...,DIRT 3,2,
3,5.0,False,This is a must have for any gamer. Codemasters...,BEST GRAPHICS OF ANY GAME SO FAR !!!!,14,
4,4.0,False,I will open with the Pros:\nReplayability - It...,Great game with design flaws,12,


In [6]:
df3['image'] = df['image'].apply(lambda d: len(d) if isinstance(d, list) else 0)
print(df3.shape)
df3.head()

(107793, 6)


Unnamed: 0,overall,verified,reviewText,summary,vote,image
0,1.0,False,I've bought and played ALL of the ANNO games s...,SAY NO TO DRM!!,28,0
1,5.0,True,Loved playing Dirt 2 and I thought the graphic...,A step up from Dirt 2 and that is terrific!,11,0
2,4.0,True,I had Dirt 2 on Xbox 360 and it was an okay ga...,DIRT 3,2,0
3,5.0,False,This is a must have for any gamer. Codemasters...,BEST GRAPHICS OF ANY GAME SO FAR !!!!,14,0
4,4.0,False,I will open with the Pros:\nReplayability - It...,Great game with design flaws,12,0


str

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
df3['reviewText'] = df3['reviewText'].fillna('')
df3['summary'] = df3['summary'].fillna('')

v_review = TfidfVectorizer(max_features=1000)
int_repr_review = v_review.fit_transform(df3['reviewText'])
int_df_review = pd.DataFrame(int_repr_review.toarray(), 
                             columns=["{}_tfidf_review".format(feat_name) for feat_name in v_review.get_feature_names()])

v_summary = TfidfVectorizer(max_features=1000)
int_repr_summary = v_summary.fit_transform(df3['summary'])
int_df_summary = pd.DataFrame(int_repr_summary.toarray(),
                              columns=["{}_tfidf_review".format(feat_name) for feat_name in v_summary.get_feature_names()])


final_df = pd.concat([df3, int_df_review, int_df_summary], axis=1).drop(columns=['reviewText', 'summary'])
final_df

Unnamed: 0,overall,verified,vote,image,10_tfidf_review,100_tfidf_review,12_tfidf_review,15_tfidf_review,20_tfidf_review,30_tfidf_review,...,yet_tfidf_review,you_tfidf_review,young_tfidf_review,younger_tfidf_review,your_tfidf_review,yourself_tfidf_review,zelda_tfidf_review,zero_tfidf_review,zombie_tfidf_review,zombies_tfidf_review
0,1.0,False,28,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,5.0,True,11,0,0.000000,0.185337,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,4.0,True,2,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,5.0,False,14,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,4.0,False,12,0,0.036464,0.046981,0.05144,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
5,1.0,True,7,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
6,1.0,True,9,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.134423,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
7,3.0,True,13,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
8,1.0,False,4,0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
9,3.0,True,7,0,0.000000,0.000000,0.00000,0.000000,0.074365,0.119050,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [19]:
final_df['verified'] = final_df['verified'].replace({True: 1, False: 0})
final_df['vote'] = final_df['vote'].apply(lambda s: s.replace(',', '')).astype('int64')

In [20]:
X_train, X_test, y_train, y_test = train_test_split(final_df.drop(columns=['vote']), final_df['vote'], train_size=0.8)
X_train

Unnamed: 0,overall,verified,image,10_tfidf_review,100_tfidf_review,12_tfidf_review,15_tfidf_review,20_tfidf_review,30_tfidf_review,360_tfidf_review,...,yet_tfidf_review,you_tfidf_review,young_tfidf_review,younger_tfidf_review,your_tfidf_review,yourself_tfidf_review,zelda_tfidf_review,zero_tfidf_review,zombie_tfidf_review,zombies_tfidf_review
49272,4.0,0,0,0.034270,0.000000,0.000000,0.022483,0.019787,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
16169,5.0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
15362,1.0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
71251,4.0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
71092,5.0,1,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.099270,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
50257,5.0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
10669,5.0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
64564,5.0,1,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
101397,3.0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
74874,4.0,0,0,0.000000,0.034558,0.000000,0.000000,0.030973,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [22]:
reg = LinearRegression().fit(X_train, y_train)

print(reg.score(X_train, y_train))

0.0795251557204325


In [23]:
print(reg.score(X_test, y_test))

0.028750890647736393


array([[5., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [4., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [5., 1., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 0., 0.]])

In [13]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((86234, 2003), (21559, 2003), (86234,), (21559,))