In [5]:
import pandas as pd
import numpy as np
from scipy import sparse as sps
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
import lenskit
import lenskit.crossfold as xf
from  lenskit.crossfold import TTPair

In [7]:
from lenskit.algorithms import als, basic, item_knn, user_knn
from lenskit.algorithms.basic import Fallback
from lenskit.algorithms.als import BiasedMF, ImplicitMF
from lenskit.algorithms.implicit import BPR

In [8]:
from lenskit.batch import MultiEval
from lenskit.crossfold import partition_users, SampleN
from lenskit import batch, topn, util
from tf_idf import tf_idf

In [9]:
saved = open("pickle/game_reviews.pickle","rb")
game_reviews = pickle.load(saved)
game_reviews.head()

Unnamed: 0,funny,helpful,item_id,last_edited,posted,recommend,review,user_id
0,,No ratings yet,1250,,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...,76561197970982479
1,,No ratings yet,22200,,"Posted July 15, 2011.",True,It's unique and worth a playthrough.,76561197970982479
2,,No ratings yet,43110,,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
3,,15 of 20 people (75%) found this review helpful,251610,,"Posted June 24, 2014.",True,I know what you think when you see this title ...,js41637
4,,0 of 1 people (0%) found this review helpful,227300,,"Posted September 8, 2013.",True,For a simple (it's actually not all that simpl...,js41637


In [10]:
game_reviews.head()

Unnamed: 0,funny,helpful,item_id,last_edited,posted,recommend,review,user_id
0,,No ratings yet,1250,,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...,76561197970982479
1,,No ratings yet,22200,,"Posted July 15, 2011.",True,It's unique and worth a playthrough.,76561197970982479
2,,No ratings yet,43110,,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
3,,15 of 20 people (75%) found this review helpful,251610,,"Posted June 24, 2014.",True,I know what you think when you see this title ...,js41637
4,,0 of 1 people (0%) found this review helpful,227300,,"Posted September 8, 2013.",True,For a simple (it's actually not all that simpl...,js41637


In [11]:
#game_reviews['review'] = game_reviews['review'].applymap(str)
#game_reviews['review'] = game_reviews['review'].astype(str)

In [12]:
user_game = open("pickle/user_games.pickle","rb")
user_games = pickle.load(user_game)
user_games.head()

Unnamed: 0,user_id,item_id,item_name
0,76561197970982479,10,Counter-Strike
1,76561197970982479,20,Team Fortress Classic
2,76561197970982479,30,Day of Defeat
3,76561197970982479,40,Deathmatch Classic
4,76561197970982479,50,Half-Life: Opposing Force


In [13]:
user_games = user_games.rename(columns={'user_id': 'user', 'item_id': 'item'})
user_games_list = user_games[['item', 'user']]
user_games_list.head()

Unnamed: 0,item,user
0,10,76561197970982479
1,20,76561197970982479
2,30,76561197970982479
3,40,76561197970982479
4,50,76561197970982479


In [14]:
reviews = game_reviews[['item_id', 'user_id','review']]
reviews = reviews.rename(columns={'user_id': 'user', 'item_id': 'item'})
reviews.shape

(58430, 3)

In [15]:
reviews.tail()

Unnamed: 0,item,user,review
58425,70,76561198312638244,a must have classic from steam definitely wort...
58426,362890,76561198312638244,this game is a perfect remake of the original ...
58427,273110,LydiaMorley,had so much fun plaing this and collecting res...
58428,730,LydiaMorley,:D
58429,440,LydiaMorley,so much fun :D


### join user-item and reviws

In [16]:
rev_item = set(reviews['item'])
user_item = set(user_games['item'])
item_butNot_rev = user_item.intersection(rev_item) ## items that have reviews

### taking only reviwed item

In [17]:
user_item_rev = user_games_list[user_games_list['item'].isin(list(item_butNot_rev))] 
user_item_rev.head()

Unnamed: 0,item,user
0,10,76561197970982479
1,20,76561197970982479
2,30,76561197970982479
3,40,76561197970982479
4,50,76561197970982479


In [18]:
result = pd.merge(user_item_rev, reviews, how = 'outer', on=['item', 'user'])
result.shape

(4219381, 3)


# Analysis

### so every item in result data has reviews but not every user is a reviwer

In [19]:
result.nunique()

item       3682
user      71856
review    55312
dtype: int64

In [20]:
reviews.nunique()

item       3682
user      25457
review    55312
dtype: int64

## Pruning

## removing less than 5 items holder

### item count and remove user less than (n)

In [21]:
def groupby_count(df, group, count):
    game_count = pd.DataFrame()
    game_count['count'] = df.groupby(group)[count].count()
    return game_count

In [22]:
def prune(df, condition):     ## returns a dataframe that meet the given condition
    user_n = df.loc[df['count'] < condition ]
    return user_n

In [23]:
game_count = groupby_count(result, 'user', 'item')
game_count.sort_values(by = 'count', ascending = False).head()

Unnamed: 0_level_0,count
user,Unnamed: 1_level_1
phrostb,2698
chidvd,2514
piepai,2419
thugnificent,2373
DeEggMeister,1914


In [24]:
user_5 = prune(game_count, 5)

In [25]:
user_less_5 = user_5.index
user_less_5

Index(['001002130882', '00284702', '010195345', '0102705195',
       '011111135489484797', '0132489', '025652', '03375616256', '0445233',
       '07824',
       ...
       'zellenal', 'zeronacho', 'zielinskak40', 'ziko21', 'zillarino',
       'zimbalor', 'zixwot', 'zombifiedddd', 'zraicis', 'zzonci'],
      dtype='object', name='user', length=9833)

In [26]:
pruned_data_5 = result.set_index('user').drop(user_less_5)
pruned_data_5.reset_index(inplace = True)
pruned_data_5.head()

Unnamed: 0,user,item,review
0,76561197970982479,10,
1,76561197970982479,20,
2,76561197970982479,30,
3,76561197970982479,40,
4,76561197970982479,50,


## Analysis

#### 13% of users are missed

In [27]:
#len(prune(game_count, 2))  ## users that have 1 items in library
pruned_user_len = len(prune(game_count, 5))/result['user'].nunique()
pruned_user_len

0.13684313070585616

#### .5% items are missed

In [28]:
reviews.nunique()

item       3682
user      25457
review    55312
dtype: int64

In [29]:
pruned_data_5.nunique()

user      62023
item       3663
review    51269
dtype: int64

In [30]:
pruned_item = result['item'].nunique()-pruned_data_5['item'].nunique()
pruned_item_frac = pruned_item/result['item'].nunique()
pruned_item_frac

0.005160239000543183

#### 7% reviews are missed

In [31]:
pruned_rev = result['review'].nunique()-pruned_data_5['review'].nunique()
pruned_rev_frac = pruned_rev/result['review'].nunique()
pruned_rev_frac

0.07309444605148974

## check user item

In [32]:
pruned_data_5.loc[pruned_data_5['user'] == 'kenkaniff']

Unnamed: 0,user,item,review
4196366,kenkaniff,42680,This is honestly the worst Call of Duty ever. ...
4196367,kenkaniff,242050,Great game! Recommend it!
4196368,kenkaniff,209160,Not that great of a game actually! The multipl...
4196369,kenkaniff,8190,This is one of my favourite games! It is reall...
4196370,kenkaniff,24240,"Love it, can't stop playing it!!"


In [33]:
only_rev = pruned_data_5.dropna()

In [34]:
item_data1 = pd.DataFrame({'review': only_rev.groupby(['item']).review.apply(lambda x:' '.join(x))})
item_data1.reset_index(inplace=True)

## Partition by user

In [37]:
pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(1)))
truth = pd.concat((p.test for p in pairs_user))
#truth.to_csv(r'results/steam/pruned_5.csv')

In [72]:
a = truth.sort_values(by='user')
b = a.reset_index()
b.loc[b['user'] == '76561198052603468']

Unnamed: 0,index,user,item,review
13079,1638399,76561198052603468,234650,


In [71]:
b.iloc[10525]

index               3526656
user      76561198045752311
item                 273350
review                  NaN
Name: 10525, dtype: object

In [53]:
a.shape

(62023, 4)

## Evaluation

In [27]:
def algo_eval(path, algo, dataset):
    evaluation = batch.MultiEval(path=path, predict=False, recommend=True)
    evaluation.add_algorithms(algos=algo)
    evaluation.add_datasets(data=dataset)
    evaluation.run()

In [28]:
def ndcg(file_name, truth):
    recs = pd.read_parquet(file_name)
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    ndcg = rla.compute(recs, truth)
    return ndcg

## fit

In [29]:
algo_ii = item_knn.ItemItem(20, center=False, aggregate='sum')
#algo_uu = user_knn.UserUser(30, center=False, aggregate='sum')
algo_pop = basic.Popular()
algo_mf = ImplicitMF(40)
algo_bpr = BPR()
algo_tf_idf = tf_idf()

In [43]:
%%time
#algo_eval('results/steam/pruned_5', [algo_ii, algo_pop,algo_mf,algo_bpr], pairs_user)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [56]:
legend = pd.read_csv("results/compare/time/runs.csv")
legend = legend.set_index('RunId').loc[:,'AlgoStr']
legend

RunId
1                 ItemItem(nnbrs=20, msize=None)
2     als.ImplicitMF(features=40, reg=0.1, w=40)
3                 ItemItem(nnbrs=20, msize=None)
4     als.ImplicitMF(features=40, reg=0.1, w=40)
5                 ItemItem(nnbrs=20, msize=None)
6     als.ImplicitMF(features=40, reg=0.1, w=40)
7                 ItemItem(nnbrs=20, msize=None)
8     als.ImplicitMF(features=40, reg=0.1, w=40)
9                 ItemItem(nnbrs=20, msize=None)
10    als.ImplicitMF(features=40, reg=0.1, w=40)
Name: AlgoStr, dtype: object

In [57]:
ndcg_algo = ndcg_algo.join(legend, on='RunId')

In [58]:
ndcg_algo.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,ndcg,AlgoStr
user,RunId,Unnamed: 2_level_1,Unnamed: 3_level_1
-2SV-vuLB-Kg,1,0.030712,"ItemItem(nnbrs=20, msize=None)"
-2SV-vuLB-Kg,2,0.109971,"als.ImplicitMF(features=40, reg=0.1, w=40)"
-2SV-vuLB-Kg,5,0.044042,"ItemItem(nnbrs=20, msize=None)"
-SEVEN-,1,0.02538,"ItemItem(nnbrs=20, msize=None)"
-SEVEN-,2,0.033049,"als.ImplicitMF(features=40, reg=0.1, w=40)"
-SEVEN-,3,0.058356,"ItemItem(nnbrs=20, msize=None)"
-SEVEN-,5,0.142531,"ItemItem(nnbrs=20, msize=None)"
-_PussyDestroyer_-,1,0.079238,"ItemItem(nnbrs=20, msize=None)"
-_PussyDestroyer_-,4,0.044287,"als.ImplicitMF(features=40, reg=0.1, w=40)"
-_PussyDestroyer_-,5,0.086064,"ItemItem(nnbrs=20, msize=None)"


## Partition_user

In [26]:
pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(5)))
truth = pd.concat((p.test for p in pairs_user), ignore_index=True)
#truth.to_csv(r'results/steam/truth_user.csv')

In [24]:
eval = MultiEval('my-eval', predict=False, recommend=True)
eval.add_datasets(pairs_user, name='steam')
eval.add_algorithms(tf_idf(), name='tf_idf')
eval.run()

  inferred_dtype = infer_dtype(column)
  result = infer_dtype(pandas_collection)
  inferred_dtype = infer_dtype(column)
  result = infer_dtype(pandas_collection)
  inferred_dtype = infer_dtype(column)
  result = infer_dtype(pandas_collection)
  inferred_dtype = infer_dtype(column)
  result = infer_dtype(pandas_collection)
  inferred_dtype = infer_dtype(column)
  result = infer_dtype(pandas_collection)


In [25]:
ndcg_user = ndcg('my-eval/recommendations.parquet', truth)

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


In [26]:
ndcg_user

Unnamed: 0_level_0,Unnamed: 1_level_0,ndcg
user,RunId,Unnamed: 2_level_1
-2SV-vuLB-Kg,3,0.122735
-SEVEN-,1,0.217110
-_PussyDestroyer_-,5,0.168011
00000000000000000001227,4,0.254280
01189958889189157253,5,0.343112
04061993,4,0.205600
08254669696969696969,4,0.185234
091263,4,0.442134
09879655452567,3,0.316315
10051997,1,0.292997


## Test

In [131]:
for i, tp in enumerate(xf.partition_users(pruned_data, 5, xf.SampleN(3))):
    tp.train.to_csv('steam/train_pruned-%d.csv' % (i,))
    tp.train.to_parquet('steam/train_pruned-%d.parquet' % (i,))
    tp.test.to_csv('steam/test_pruned-%d.csv' % (i,))
    tp.test.to_parquet('steam/test_pruned-%d.parquet' % (i,))

  result = infer_dtype(pandas_collection)


In [50]:
tt_tuples =[]
for i, tp in enumerate(xf.partition_users(pruned_data_5, 5, xf.SampleN(3))):
    tuple = (tp.train, tp.test)
    tt_tuples.append(tuple)

In [132]:
test_data=[]
tt_tuples =[]
for i in range(0,3):
    train = pd.read_csv('steam/train_pruned-{}.csv'.format(i))
    test = pd.read_csv('steam/test_pruned-{}.csv'.format(i))
    test_data.append(test)
    tuple = (train, test)
    tt_tuples.append(tuple)

In [19]:
eval = MultiEval('my-eval', recommend=10)
eval.add_datasets(pairs, name='steam')
eval.add_algorithms(tf_idf(), name='tf_idf')
eval.run()

KeyError: "['rating'] not in index"

In [133]:
eval = MultiEval('my-eval', recommend=10)
eval.add_datasets(tt_tuples, name='steam')
eval.add_algorithms(tf_idf(), name='tf_idf')
eval.run()

TypeError: sequence item 132: expected str instance, float found