In [1]:
import pandas as pd
import numpy as np
from scipy import sparse as sps
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [2]:
import lenskit
import lenskit.crossfold as xf
from  lenskit.crossfold import TTPair

In [3]:
from lenskit.algorithms import als, basic, item_knn, user_knn
from lenskit.algorithms.basic import Fallback
from lenskit.algorithms.als import BiasedMF, ImplicitMF
from lenskit.algorithms.implicit import BPR

In [4]:
from lenskit.batch import MultiEval
from lenskit.crossfold import partition_users, SampleN
from lenskit import batch, topn, util
from tf_idf import tf_idf
from LDA import LDA

In [5]:
# Load the pickled review table and preview its first rows.
# The context manager guarantees the file handle is closed once loading finishes
# (the bare open() previously left it open for the lifetime of the kernel).
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("pickle/game_reviews.pickle", "rb") as saved:
    game_reviews = pickle.load(saved)
game_reviews.head()

Unnamed: 0,funny,helpful,item_id,last_edited,posted,recommend,review,user_id
0,,No ratings yet,1250,,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...,76561197970982479
1,,No ratings yet,22200,,"Posted July 15, 2011.",True,It's unique and worth a playthrough.,76561197970982479
2,,No ratings yet,43110,,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
3,,15 of 20 people (75%) found this review helpful,251610,,"Posted June 24, 2014.",True,I know what you think when you see this title ...,js41637
4,,0 of 1 people (0%) found this review helpful,227300,,"Posted September 8, 2013.",True,For a simple (it's actually not all that simpl...,js41637


In [6]:
# Load the pickled user/game table and preview its first rows.
# The context manager guarantees the file handle is closed once loading finishes
# (the bare open() previously left it open for the lifetime of the kernel).
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("pickle/user_games.pickle", "rb") as user_game:
    user_games = pickle.load(user_game)
user_games.head()

Unnamed: 0,user_id,item_id,item_name
0,76561197970982479,10,Counter-Strike
1,76561197970982479,20,Team Fortress Classic
2,76561197970982479,30,Day of Defeat
3,76561197970982479,40,Deathmatch Classic
4,76561197970982479,50,Half-Life: Opposing Force


In [7]:
# Switch to the short 'user' / 'item' column names, then keep only the
# (item, user) pairs for the list of owned games.
user_games = user_games.rename(columns=dict(user_id='user', item_id='item'))
user_games_list = user_games.loc[:, ['item', 'user']]
user_games_list.head()

Unnamed: 0,item,user
0,10,76561197970982479
1,20,76561197970982479
2,30,76561197970982479
3,40,76561197970982479
4,50,76561197970982479


In [8]:
# Keep only the item id, user id and review text, using the same short
# 'item' / 'user' column names as the user_games table.
reviews = (
    game_reviews
    .loc[:, ['item_id', 'user_id', 'review']]
    .rename(columns={'item_id': 'item', 'user_id': 'user'})
)
reviews.shape

(58430, 3)

In [9]:
def groupby_count(df, group, count):
    """Count distinct ``count`` values within each ``group`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the grouping column and the column to count.
    group : label
        Column to group by; its values become the index of the result.
    count : label
        Column whose distinct values are counted inside each group.

    Returns
    -------
    pandas.DataFrame
        One row per group with a single ``'count'`` column.
    """
    distinct_per_group = df.groupby(group)[count].nunique()
    return distinct_per_group.to_frame(name='count')

In [10]:
def prune(df, condition):
    """Return the rows of ``df`` whose ``'count'`` is strictly below ``condition``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'count'`` column.
    condition : number
        Exclusive upper bound; rows with ``count >= condition`` are left out.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    below_threshold = df['count'] < condition
    return df[below_threshold]

In [11]:
# Number of distinct items each user has reviewed; preview the largest counts.
game_count = groupby_count(df=reviews, group='user', count='item')
game_count.sort_values('count', ascending=False).head()

Unnamed: 0_level_0,count
user,Unnamed: 1_level_1
76561198045431856,10
chicken_tonight,10
76561198048348337,10
registeredso,10
ItsDerRey,10


In [12]:
# Users with fewer than 5 distinct reviewed items, and their ids.
user_5 = prune(game_count, 5)
user_less_5 = user_5.index

# Drop every review written by those users, then restore 'user' as a regular
# column on a fresh positional index.
pruned_data_5 = (
    reviews
    .set_index('user')
    .drop(user_less_5)
    .reset_index()
)
pruned_data_5.head()

Unnamed: 0,user,item,review
0,evcentric,248820,A suitably punishing roguelike platformer. Wi...
1,evcentric,370360,"""Run for fun? What the hell kind of fun is that?"""
2,evcentric,237930,"Elegant integration of gameplay, story, world ..."
3,evcentric,263360,"Random drops and random quests, with stat poin..."
4,evcentric,107200,Fun balance of tactics and strategy. Potentia...


In [13]:
# Create the project-local tf_idf model (imported from the tf_idf module) and
# fit it on the pruned review table. fit() is left as the last expression, so
# its return value is what the cell displays.
tf = tf_idf()
tf.fit(pruned_data_5)

<tf_idf.tf_idf at 0x7fc12cd16da0>

In [14]:
# Ask the fitted tf_idf model for predictions for user 'evcentric' on six candidate item ids.
# The recorded output has no row for item 40 — presumably it is not in the fitted data; TODO confirm.
tf.predict_for_user('evcentric',[10,20,30,40,50,60])

item
10    0.653364
20    0.323444
30    0.404006
50    0.291376
60    0.243594
dtype: float64

In [15]:
# Create the project-local LDA model (imported from the LDA module) and fit it
# on the same pruned review table that was passed to tf_idf.
# NOTE(review): no random seed is set in this cell — if LDA is stochastic,
# results may vary between runs; confirm inside the LDA class.
lda_obj = LDA()
lda_obj.fit(pruned_data_5)

<LDA.LDA at 0x7fc10189b198>

In [16]:
# Inspect the similarity matrix held by the fitted LDA object (NumPy truncates the display).
# NOTE(review): presumably an item-by-item matrix — confirm against the LDA class.
lda_obj.similarity_matrix

array([[0.40978079, 0.34648892, 0.38399283, ..., 0.04943473, 0.09471961,
        0.        ],
       [0.34648892, 0.80512749, 0.88048583, ..., 0.04972393, 0.21718967,
        0.        ],
       [0.38399283, 0.88048583, 0.97578947, ..., 0.04939103, 0.24069825,
        0.        ],
       ...,
       [0.04943473, 0.04972393, 0.04939103, ..., 0.05      , 0.04978469,
        0.0488125 ],
       [0.09471961, 0.21718967, 0.24069825, ..., 0.04978469, 0.62491902,
        0.7341672 ],
       [0.        , 0.        , 0.        , ..., 0.0488125 , 0.7341672 ,
        0.95306405]])

In [17]:
# Same user and candidate item ids as the tf_idf prediction cell, now with the LDA model.
# The recorded output again has no row for item 40 — presumably it is not in the fitted data; TODO confirm.
lda_obj.predict_for_user('evcentric',[10,20,30,40,50,60])

item
10    0.344370
20    0.789631
30    0.875100
50    0.000000
60    0.278986
dtype: float64

In [18]:
# Inspect the similarity matrix held by the fitted tf_idf object (NumPy truncates the display).
# NOTE(review): the visible entries of one row/column are all zero, including its
# diagonal entry — possibly an item with no usable review text; TODO confirm.
tf.similarity_matrix

array([[1.        , 0.1499391 , 0.13796804, ..., 0.        , 0.117523  ,
        0.07885343],
       [0.1499391 , 1.        , 0.06882521, ..., 0.        , 0.07033395,
        0.04977039],
       [0.13796804, 0.06882521, 1.        , ..., 0.        , 0.04951468,
        0.04510642],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.117523  , 0.07033395, 0.04951468, ..., 0.        , 1.        ,
        0.06608514],
       [0.07885343, 0.04977039, 0.04510642, ..., 0.        , 0.06608514,
        1.        ]])