In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('data/steam-200k.csv')
data.head()

Unnamed: 0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [4]:
cols = {'151603712': 'Game_ID', 'The Elder Scrolls V Skyrim': 'Game_Name', 'purchase': 'Status', '1.0': 'Hour_Played'}
data.rename(columns=cols, inplace=True)
data.head()

Unnamed: 0,Game_ID,Game_Name,Status,Hour_Played,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [5]:
data.drop(columns=['0'], inplace=True)

In [6]:
data.shape

(199999, 4)

In [7]:
data.duplicated().sum()

707

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
data.isnull().sum()

Game_ID        0
Game_Name      0
Status         0
Hour_Played    0
dtype: int64

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199292 entries, 0 to 199998
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Game_ID      199292 non-null  int64  
 1   Game_Name    199292 non-null  object 
 2   Status       199292 non-null  object 
 3   Hour_Played  199292 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 7.6+ MB


In [11]:
data.describe().astype('int')
data.nunique()

Game_ID        12393
Game_Name       5155
Status             2
Hour_Played     1593
dtype: int64

In [12]:
data['Game_Name'].unique()

array(['The Elder Scrolls V Skyrim', 'Fallout 4', 'Spore', ...,
       'Space Colony', 'Life is Hard', 'Executive Assault'], dtype=object)

In [13]:
data= data[(data['Hour_Played']>=2) & (data['Status']=='play')]

In [14]:
data = data[data.groupby('Game_Name').Game_ID.transform(len)>=20]

In [15]:
data['Game_Name'].nunique()

431

In [16]:
data['Game_ID'] = data['Game_ID'].astype(str)

In [17]:
average = data.groupby(['Game_Name'], as_index=False).Hour_Played.mean()

In [18]:
average

Unnamed: 0,Game_Name,Hour_Played
0,7 Days to Die,50.777500
1,APB Reloaded,113.377778
2,ARK Survival Evolved,105.982812
3,Ace of Spades,10.971739
4,AdVenture Capitalist,67.760674
...,...,...
426,Worms Reloaded,12.600000
427,Worms Revolution,10.025000
428,XCOM Enemy Unknown,53.725175
429,Zombie Panic Source,88.833333


In [19]:
average['avg_hour_played_y'] = average['Hour_Played']
average.head()

Unnamed: 0,Game_Name,Hour_Played,avg_hour_played_y
0,7 Days to Die,50.7775,50.7775
1,APB Reloaded,113.377778,113.377778
2,ARK Survival Evolved,105.982812,105.982812
3,Ace of Spades,10.971739,10.971739
4,AdVenture Capitalist,67.760674,67.760674


In [20]:
average.head()

Unnamed: 0,Game_Name,Hour_Played,avg_hour_played_y
0,7 Days to Die,50.7775,50.7775
1,APB Reloaded,113.377778,113.377778
2,ARK Survival Evolved,105.982812,105.982812
3,Ace of Spades,10.971739,10.971739
4,AdVenture Capitalist,67.760674,67.760674


In [21]:
average.drop(columns='Hour_Played', inplace=True)

In [22]:
average.head()

Unnamed: 0,Game_Name,avg_hour_played_y
0,7 Days to Die,50.7775
1,APB Reloaded,113.377778
2,ARK Survival Evolved,105.982812
3,Ace of Spades,10.971739
4,AdVenture Capitalist,67.760674


In [23]:
data = data.merge(average, on='Game_Name')

In [24]:
# data.drop(columns='avg_hour_played_y_x', axis=1)
data.head()

Unnamed: 0,Game_ID,Game_Name,Status,Hour_Played,avg_hour_played_y
0,151603712,The Elder Scrolls V Skyrim,play,273.0,115.351792
1,151603712,Fallout 4,play,87.0,66.819876
2,151603712,Spore,play,14.9,37.708889
3,151603712,Fallout New Vegas,play,12.1,62.910638
4,151603712,Left 4 Dead 2,play,8.9,50.333684


In [25]:
condition = [
    data['Hour_Played']>= (0.8*data['avg_hour_played_y']),
   (data['Hour_Played']>=0.6*data['avg_hour_played_y'])&(data['Hour_Played']<0.8*data['avg_hour_played_y']),
   (data['Hour_Played']>=0.4*data['avg_hour_played_y'])&(data['Hour_Played']<0.6*data['avg_hour_played_y']),
   (data['Hour_Played']>=0.2*data['avg_hour_played_y'])&(data['Hour_Played']<0.4*data['avg_hour_played_y']),
    data['Hour_Played']>=0
    
]
values = [5,4,3,2,1]
data['rating'] = np.select(condition,values)

In [26]:
data.head()

Unnamed: 0,Game_ID,Game_Name,Status,Hour_Played,avg_hour_played_y,rating
0,151603712,The Elder Scrolls V Skyrim,play,273.0,115.351792,5
1,151603712,Fallout 4,play,87.0,66.819876,5
2,151603712,Spore,play,14.9,37.708889,2
3,151603712,Fallout New Vegas,play,12.1,62.910638,1
4,151603712,Left 4 Dead 2,play,8.9,50.333684,1


In [27]:
# data.drop(columns = [ 'Status', 'Hour_Played', 'avg_hour_played_y', 'avg_hour_played_x'],inplace =True )

In [28]:
data.head()

Unnamed: 0,Game_ID,Game_Name,Status,Hour_Played,avg_hour_played_y,rating
0,151603712,The Elder Scrolls V Skyrim,play,273.0,115.351792,5
1,151603712,Fallout 4,play,87.0,66.819876,5
2,151603712,Spore,play,14.9,37.708889,2
3,151603712,Fallout New Vegas,play,12.1,62.910638,1
4,151603712,Left 4 Dead 2,play,8.9,50.333684,1


In [29]:
from sklearn.metrics.pairwise import paired_distances, cosine_similarity
from scipy.spatial.distance import cosine, correlation

In [30]:
pv = data.pivot_table(index='Game_Name', columns='Game_ID', values='rating')

In [31]:
pv = pv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)),axis=1)

In [32]:
pv = pv.fillna(0)
pv = pv.T
pv = pv.loc[:,(pv != 0).any(axis=0)]

In [33]:
import scipy as sp
import operator

In [34]:
pv_sparse = sp.sparse.csr_matrix(pv.values)

In [35]:
item_simi = cosine_similarity(pv_sparse)
user_simi = cosine_similarity(pv_sparse.T)


In [36]:
data_item = pd.DataFrame(item_simi, index=pv.index, columns=pv.index)
data_user = pd.DataFrame(user_simi, index=pv.columns, columns=pv.columns)

In [37]:
data_item.head()
# data_user.head()


Game_ID,100053304,100057229,100070732,100096071,100168166,100208126,100267049,100311267,100322840,100351493,...,99701966,99704390,99711581,99713453,99723205,99766416,99802512,99906508,99940330,99992274
Game_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100053304,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100057229,0.0,1.0,0.0,-0.271563,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.64308,0.0,0.252635,0.0,0.0,0.0,0.0,0.0
100070732,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.16803,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100096071,0.0,-0.271563,0.0,1.0,0.0,0.0,0.0,0.045475,0.0,0.0,...,0.0,-0.104475,-0.200883,0.0,-0.158199,0.035008,0.0,0.0,0.207671,0.0
100168166,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.026744,0.0,0.0,...,0.0,-0.512517,0.353749,0.0,0.178282,0.0,0.0,0.0,0.0,0.0


In [40]:
def top_games(game):
  count = 1
  print('Similar Game to {} inlcude \n'.format(game))
  for item in data_item.sort_values(ascending=False, by=game).index[1:6]:
    print('No {}. = {}'.format(count, item))
    count += 1

In [41]:
top_games('Aftermath')

Similar Game to Aftermath inlcude 



KeyError: 'Aftermath'