In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/steam-200k.csv')
df.head()

Unnamed: 0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [5]:
cols = {'151603712':'game_id', 'The Elder Scrolls V Skyrim':'game_name',
        'purchase':'status', '1.0':'Hourplayed'}

df.rename(columns= cols, inplace=True)

In [6]:
df.head()

Unnamed: 0,game_id,game_name,status,Hourplayed,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [7]:
df.drop(columns=['0'], inplace=True)

In [8]:
df.shape

(199999, 4)

In [9]:
df.duplicated().sum()

707

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.isnull().sum()

game_id       0
game_name     0
status        0
Hourplayed    0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199292 entries, 0 to 199998
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   game_id     199292 non-null  int64  
 1   game_name   199292 non-null  object 
 2   status      199292 non-null  object 
 3   Hourplayed  199292 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 7.6+ MB


In [13]:
df.describe().astype(int)

Unnamed: 0,game_id,Hourplayed
count,199292,199292
mean,103717860,17
std,72120577,138
min,5250,0
25%,47384303,1
50%,86912006,1
75%,154230933,1
max,309903146,11754


In [14]:
df.nunique()

game_id       12393
game_name      5155
status            2
Hourplayed     1593
dtype: int64

In [15]:
df= df[(df['Hourplayed']>=2) & (df['status']=='play')]

In [16]:
df = df[df.groupby('game_name').game_id.transform(len)>=20]

In [17]:
df['game_name'].nunique()

431

In [18]:
df['game_id'] = df['game_id'].astype(str)

In [19]:
average = df.groupby(['game_name'],as_index = False).Hourplayed.mean()

In [20]:
average['avg_hourplayed'] = average['Hourplayed']
average.drop(columns ='Hourplayed',inplace = True )

In [21]:
average.head

<bound method NDFrame.head of                 game_name  avg_hourplayed
0           7 Days to Die       50.777500
1            APB Reloaded      113.377778
2    ARK Survival Evolved      105.982812
3           Ace of Spades       10.971739
4    AdVenture Capitalist       67.760674
..                    ...             ...
426        Worms Reloaded       12.600000
427      Worms Revolution       10.025000
428    XCOM Enemy Unknown       53.725175
429   Zombie Panic Source       88.833333
430             theHunter        8.641379

[431 rows x 2 columns]>

In [22]:
df = df.merge(average,on = 'game_name')

In [23]:
condition = [
    df['Hourplayed']>= (0.8*df['avg_hourplayed']),
   (df['Hourplayed']>=0.6*df['avg_hourplayed'])&(df['Hourplayed']<0.8*df['avg_hourplayed']),
   (df['Hourplayed']>=0.4*df['avg_hourplayed'])&(df['Hourplayed']<0.6*df['avg_hourplayed']),
   (df['Hourplayed']>=0.2*df['avg_hourplayed'])&(df['Hourplayed']<0.4*df['avg_hourplayed']),
    df['Hourplayed']>=0
    
]
values = [5,4,3,2,1]
df['rating'] = np.select(condition,values)

In [24]:
df.head()

Unnamed: 0,game_id,game_name,status,Hourplayed,avg_hourplayed,rating
0,151603712,The Elder Scrolls V Skyrim,play,273.0,115.351792,5
1,151603712,Fallout 4,play,87.0,66.819876,5
2,151603712,Spore,play,14.9,37.708889,2
3,151603712,Fallout New Vegas,play,12.1,62.910638,1
4,151603712,Left 4 Dead 2,play,8.9,50.333684,1


In [25]:
df.drop(columns = [ 'status', 'Hourplayed', 'avg_hourplayed'],inplace =True )

In [26]:
from sklearn.metrics.pairwise import paired_distances, cosine_similarity
from scipy.spatial.distance import cosine, correlation

In [27]:
pv = df.pivot_table(index=['game_id'],columns=['game_name'],values = 'rating')

In [28]:
pv = pv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)),axis=1)

In [29]:
pv = pv.fillna(0)
pv = pv.T
pv = pv.loc[:,(pv != 0).any(axis=0)]

In [30]:
pv.head()

game_id,100057229,100096071,100311267,100322840,100351493,100359523,100431715,100444456,100519466,100630947,...,994489,9946133,99484728,99640715,99704390,99711581,99713453,99723205,99766416,99802512
game_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7 Days to Die,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
APB Reloaded,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.475,-0.166667,0.0,0.0,0.0,0.0
ARK Survival Evolved,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ace of Spades,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AdVenture Capitalist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
import scipy as sp 
import operator

In [32]:
pv_sparse = sp.sparse.csr_matrix(pv.values)

In [33]:
item_simi = cosine_similarity(pv_sparse)
user_simi = cosine_similarity(pv_sparse.T)

In [34]:
df_item = pd.DataFrame(item_simi,index = pv.index ,columns = pv.index)
# User_similarity dataframe
df_user = pd.DataFrame(user_simi , index= pv.columns,columns=pv.columns)

In [35]:
def top_games(game):
    count = 1
    print('similar game to {} include \n'.format(game))
    for item in df_item .sort_values(ascending=False,by = game).index[1:6]:
        print('NO.{} = {}'.format(count,item))
        count+=1

In [36]:
top_games('Aftermath')

similar game to Aftermath include 

NO.1 = Alice Madness Returns
NO.2 = Shadow Warrior
NO.3 = Brtal Legend
NO.4 = Resident Evil 5 / Biohazard 5
NO.5 = Infestation Survivor Stories


In [37]:
def user_id(user):
    if user not in pv.columns:
        print('No data available for this user {}'.format(user))
    sim_val = df_user.sort_values(by =user,ascending=False).loc[:,user].tolist()[1:6]
    sim_user = df_user.sort_values(by = user,ascending=False).index[1:6]
    zipped = zip(sim_user,sim_val)
    for user, sim in zipped:
        print('user #{0},similarity values :{1:.2f}'.format(user,sim))

In [38]:
user_id('99484728')

user #40289887,similarity values :0.73
user #185494712,similarity values :0.71
user #16710264,similarity values :0.71
user #20566124,similarity values :0.67
user #49769103,similarity values :0.67


In [39]:
def simi_user_recs(user):
    if user not in pv.columns:
        print('No data available for this user {}'.format(user))
    sim_user = df_user.sort_values(by = user,ascending=False).index[1:11]
    best = []
    most_common = {}
    for i in sim_user:
        max_scorce = pv.loc[:,i].max()
        best.append(pv[pv.loc[:,i] == max_scorce].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
                   if j in most_common:
                       most_common[j] =+1
                   else :
                       most_common[j] =1
    sorted_item = sorted(most_common.items(),key=operator.itemgetter(1),reverse=True)
    return sorted_item[:6]

In [40]:
simi_user_recs('100519466')

[('Robocraft', 1),
 ('BLOCKADE 3D', 1),
 ("Garry's Mod", 1),
 ('ARK Survival Evolved', 1),
 ('Dino D-Day', 1),
 ('AdVenture Capitalist', 1)]

In [41]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(algorithm='brute',leaf_size=30,metric='cosine',metric_params=None,n_jobs=-1,n_neighbors=20,p=2,radius=1)
knn.fit(pv)

In [42]:
qury = np.random.choice(pv.shape[0])
print("The Choosen Game = ",pv.index[qury])

The Choosen Game =  Hero Siege


In [43]:
distance , indices = knn.kneighbors(pv.iloc[qury,:].values.reshape(1,-1),n_neighbors=6)

In [44]:
for i in range(0,len(distance.flatten())):
    if i == 0:
        print('Recommendation for {0} \n'.format(pv.index[qury]))
    else:
        print('{0} : {1} with distance of {2}'.format(i,pv.index[indices.flatten()[i]], distance.flatten()[i]))

Recommendation for Hero Siege 

1 : Trine with distance of 0.8762969022196175
2 : The Escapists with distance of 0.8776146447283796
3 : Stranded Deep with distance of 0.8818634320081933
4 : Space Pirates and Zombies with distance of 0.8860779687306378
5 : Zombie Panic Source with distance of 0.8880309225466075
