In [1]:
import pandas as pd
import numpy as np

import pymongo

from dotenv import load_dotenv
import os

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
download = False
if download:
    load_dotenv()
    mongo_username = os.getenv("mongo_username")
    mongo_password = os.getenv("mongo_password")
    
    client = pymongo.MongoClient(f"mongodb+srv://{mongo_username}:{mongo_password}@cluster0.d4ojg.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
    
    db = client.SteamDB
    
    games_collection = db.steam_games
    user_collection = db.steam_users
    user_games_collection = db.steam_user_games
    user_reviews_collection = db.steam_user_reviews
    
    games_data = list(games_collection.find())
    users_data = list(user_collection.find())
    user_games_data = list(user_games_collection.find())
    user_reviews_data = list(user_reviews_collection.find())
    
    df_games = pd.DataFrame(games_data)
    df_users = pd.DataFrame(users_data)
    df_user_games = pd.DataFrame(user_games_data)
    df_user_reviews = pd.DataFrame(user_reviews_data)
    
    df_games.to_csv('../data/raw/games_data.csv')
    df_users.to_csv('../data/raw/users_data.csv')
    df_user_games.to_csv('../data/raw/user_games_data.csv')
    df_user_reviews.to_csv('../data/raw/user_reviews_data.csv')
else:
    df_games = pd.read_csv('../data/raw/games_data.csv', index_col=0)
    df_users = pd.read_csv('../data/raw/users_data.csv', index_col=0)
    df_user_games = pd.read_csv('../data/raw/user_games_data.csv', index_col=0)
    df_user_reviews = pd.read_csv('../data/raw/user_reviews_data.csv', index_col=0)

In [3]:
df_games.drop('_id', axis=1, inplace=True)
df_users.drop('_id', axis=1, inplace=True)
df_user_games.drop('_id', axis=1, inplace=True)
df_user_reviews.drop('_id', axis=1, inplace=True)

In [3]:
df = pd.read_csv('../data/raw/steamAPI_ownedGames.csv', index_col=0)

In [4]:
condition = [
    df['playtime_forever'].between(0, 5),
    df['playtime_forever'].between(5, 15),
    df['playtime_forever'].between(15, 60),
    df['playtime_forever'].between(60, 720),
    df['playtime_forever'].between(720, 7200),
    df['playtime_forever'].between(7200, 28800),
    df['playtime_forever'].between(28800, 86400),
    df['playtime_forever'].between(86400, 172800),
    df['playtime_forever'].between(172800, 518400)]

choices = [0, 2, 2.5, 3, 3.5, 4, 4.5, 4.7, 4.9]

df['rating'] = np.select(condition, choices, 5)

In [30]:
df

Unnamed: 0,steamid,appid,playtime_forever,rating
0,76561198123999673,50,2,0.0
1,76561198123999673,60,83,3.0
2,76561198123999673,70,1691,3.5
3,76561198123999673,130,408,3.0
4,76561198123999673,220,4357,3.5
...,...,...,...,...
99344,76561198207321007,880940,431,3.0
99345,76561198207321007,945360,666,3.0
99346,76561198207321007,12210,65,3.0
99347,76561198207321007,1305420,0,0.0


In [4]:
df_user_games

Unnamed: 0,userId,gameId,Title,Hours_Played
0,benedict,200510,XCOM: Enemy Unknown,120
1,benedict,289070,Sid Meier's Civilization VI,125
2,benedict,72850,The Elder Scrolls V: Skyrim,83
3,benedict,47540,Puzzle Quest 2,74
4,benedict,28050,Deus Ex: Human Revolution,36
...,...,...,...,...
596699,76561198106701525,291480,Warface,1.5
596700,76561198106701525,374320,DARK SOULS\u2122 III,1.4
596701,76561198106701525,444090,Paladins,0.4
596702,76561198106701525,1333910,Sizeable,0.3


In [5]:
# df = pd.merge(df_user_games, df_games.drop('Title', axis=1), on=['gameId'], how='left')
df = df_games.copy()

df.drop(['Release_Date', 'Description', 'Recent_Reviews', 'All_Reviews', 'Meta_Critic_Score', 'Steam_Awards'], axis=1, inplace=True)
df.dropna(subset=['Tags'], inplace=True)

df['Genre'] = df['Genre'].fillna('')
df['Publisher'] = df['Publisher'].fillna(df['Developer'])
df['Franchise'] = df['Franchise'].fillna(df['Title'])
df['Price'] = df['Price'].fillna(0)

df.reset_index(drop=True, inplace=True)

df.Tags = df.Tags.apply(lambda x: ', '.join(x.strip('[]\'').split("', '")))
# df.Tags = df.Tags.apply(lambda x: x.strip('[]\'').split("', '"))
df.Features = df.Features.apply(lambda x: x.strip('[]\'').split("', '"))

df.info()
# df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1417 entries, 0 to 1416
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   gameId     1417 non-null   int64 
 1   Title      1417 non-null   object
 2   Genre      1417 non-null   object
 3   Developer  1417 non-null   object
 4   Publisher  1417 non-null   object
 5   Franchise  1417 non-null   object
 6   Tags       1417 non-null   object
 7   Features   1417 non-null   object
 8   Price      1417 non-null   object
dtypes: int64(1), object(8)
memory usage: 99.8+ KB


In [6]:
df.head()

Unnamed: 0,gameId,Title,Genre,Developer,Publisher,Franchise,Tags,Features,Price
0,730,Counter-Strike: Global Offensive,"Action, Free to Play","Valve, Hidden Path Entertainment",Valve,Counter-Strike: Global Offensive,"FPS, Shooter, Multiplayer, Competitive, Action...","[Steam Achievements, Full controller support, ...","123,00 TL"
1,570,Dota 2,"Action, Free to Play, Strategy",Valve,Valve,Dota,"Free to Play, MOBA, Multiplayer, Strategy, eSp...","[Steam Trading Cards, Steam Workshop, SteamVR ...",0
2,578080,PUBG: BATTLEGROUNDS,"Action, Adventure, Free to Play, Massively Mul...","KRAFTON, Inc.","KRAFTON, Inc.",PUBG: BATTLEGROUNDS,"Survival, Shooter, Multiplayer, Battle Royale,...","[Online PvP, Stats, Remote Play on Phone, Remo...","109,00 TL"
3,1245620,ELDEN RING,"Action, RPG",FromSoftware Inc.,"FromSoftware Inc., BANDAI NAMCO Entertainment",ELDEN RING,"Souls-like, Relaxing, Dark Fantasy, RPG, Diffi...","[Single-player, Online PvP, Online Co-op, Stea...","499,00 TL"
4,1172470,Apex Legends™,"Action, Adventure, Free to Play",Respawn Entertainment,Electronic Arts,Apex Legends,"Free to Play, Battle Royale, Multiplayer, Shoo...","[Online PvP, Online Co-op, Steam Achievements,...","109,00 TL"


In [7]:
cm = CountVectorizer().fit_transform(df.Tags)

In [8]:
cs = cosine_similarity(cm)

In [9]:
cs

array([[1.        , 0.56273143, 0.68810235, ..., 0.36018014, 0.11677484,
        0.06558258],
       [0.56273143, 1.        , 0.46466019, ..., 0.34669749, 0.10375717,
        0.05827165],
       [0.68810235, 0.46466019, 1.        , ..., 0.39654528, 0.09642365,
        0.10830607],
       ...,
       [0.36018014, 0.34669749, 0.39654528, ..., 1.        , 0.14019993,
        0.14763465],
       [0.11677484, 0.10375717, 0.09642365, ..., 0.14019993, 1.        ,
        0.07658396],
       [0.06558258, 0.05827165, 0.10830607, ..., 0.14763465, 0.07658396,
        1.        ]])

In [20]:
title = 'DayZ'

game_id = df[df.Title == title].index[0]

In [21]:
game_id

30

In [22]:
scores = list(enumerate(cs[game_id]))

In [23]:
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
sorted_scores = sorted_scores[1:]

In [24]:
sorted_scores

[(1283, 0.8471737420873575),
 (535, 0.8257228238447707),
 (12, 0.7706746355884525),
 (1329, 0.7627700713964736),
 (27, 0.7624563678786219),
 (759, 0.7440729491417853),
 (878, 0.7434844114105212),
 (351, 0.7395739969534468),
 (335, 0.7356123579206248),
 (1018, 0.7309879850882872),
 (1365, 0.7265990145507549),
 (996, 0.7253235664820745),
 (115, 0.7191012094028308),
 (1196, 0.7164977208318384),
 (543, 0.7154547587901782),
 (325, 0.7068454221751481),
 (61, 0.7059781184061313),
 (102, 0.6968683416725756),
 (242, 0.6963106238227915),
 (1351, 0.6881023532039755),
 (906, 0.6827887419989188),
 (26, 0.6823230454059214),
 (249, 0.6796590597837963),
 (170, 0.6788335930269978),
 (599, 0.6784005252999683),
 (627, 0.6777389936698862),
 (1295, 0.6777389936698861),
 (94, 0.6746990849399868),
 (1403, 0.6746990849399866),
 (549, 0.6672976811635086),
 (631, 0.666666666666667),
 (1181, 0.6636648415023737),
 (191, 0.6567895774291851),
 (893, 0.6565706694547586),
 (740, 0.6560798513518474),
 (184, 0.65247269

In [25]:
j=0
game_titles=[]
recommendation=[]
print('The 10 most recommended games to', title, 'are:\n')
for item in sorted_scores:
    game_title = df[df.index == item[0]]['Title'].values[0]
    game_titles.append(game_title)
    print(j+1, game_title)
    j+=1
    if j>9:
        break

The 10 most recommended games to DayZ are:

1 Miscreated
2 No More Room in Hell
3 Rust
4 Survarium
5 Unturned
6 Infestation: The New Z
7 S.T.A.L.K.E.R.: Call of Pripyat
8 Beasts of Bermuda
9 Stay Out
10 S.T.A.L.K.E.R.: Clear Sky


In [16]:
df_user_reviews = df_user_reviews.copy()
df_user_reviews['Recomendation'] = np.where(df_user_reviews.Recomendation == "Recommended", 1, -1)

df_games_reviews = pd.merge(df_user_games, df_user_reviews, on=['gameId', 'userId'], how='left')

df_games_reviews['Recomendation'] = df_games_reviews['Recomendation'].fillna(0)
df_games_reviews['Review_Time'] = df_games_reviews['Review_Time'].fillna(0)
df_games_reviews['Comment'] = df_games_reviews['Comment'].fillna('')

df_games_reviews.reset_index(drop=True, inplace=True)

df_games_reviews

Unnamed: 0,userId,gameId,Title,Hours_Played,Recomendation,Review_Time,Comment
0,benedict,200510,XCOM: Enemy Unknown,120,1.0,120.3,Went back to this game as a preparation for Xc...
1,benedict,289070,Sid Meier's Civilization VI,125,0.0,0,
2,benedict,72850,The Elder Scrolls V: Skyrim,83,1.0,69.9,One of the best games ever
3,benedict,47540,Puzzle Quest 2,74,0.0,0,
4,benedict,28050,Deus Ex: Human Revolution,36,0.0,0,
...,...,...,...,...,...,...,...
596699,76561198106701525,291480,Warface,1.5,0.0,0,
596700,76561198106701525,374320,DARK SOULS\u2122 III,1.4,0.0,0,
596701,76561198106701525,444090,Paladins,0.4,0.0,0,
596702,76561198106701525,1333910,Sizeable,0.3,0.0,0,


In [17]:
df2 = df_games_reviews[df_games_reviews.Title.isin(game_titles)]

In [18]:
df2.loc[:, ['userId', 'Title', 'Hours_Played', 'Recomendation']][df2['userId']=="-scape-"]

Unnamed: 0,userId,Title,Hours_Played,Recomendation
63558,-scape-,Half-Life,11.6,0.0
63560,-scape-,Portal,4.0,1.0
63581,-scape-,Half-Life 2,2.6,0.0
63588,-scape-,Portal 2,8.9,0.0


In [19]:
df_wide = pd.pivot_table(df2, index='Title', columns='userId', values='Recomendation', aggfunc=np.sum)
df_wide.shape

(33, 2716)

In [20]:
df_wide = df_wide.fillna(0)
df_wide.iloc[:10, 0:5]

userId,-Dutch-,-Maniac,-__7355608__-,-scape-,011010010110111001101111
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ANNO: Mutationem,0.0,0.0,0.0,0.0,0.0
Alice: Madness Returns,0.0,0.0,0.0,0.0,0.0
Biped,0.0,0.0,0.0,0.0,0.0
Black Mesa,0.0,0.0,0.0,0.0,0.0
Borderlands 2,0.0,0.0,0.0,0.0,0.0
Borderlands: The Pre-Sequel,0.0,0.0,0.0,0.0,0.0
Celeste,0.0,0.0,0.0,0.0,0.0
Charlie's Adventure,0.0,0.0,0.0,0.0,0.0
Crab Game,0.0,0.0,0.0,0.0,0.0
DDraceNetwork,0.0,0.0,0.0,0.0,0.0


In [21]:
# import distance methods
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances

In [22]:
dists = cosine_similarity(df_wide)
dists

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [23]:
dists = pd.DataFrame(dists, columns=df_wide.index)

dists.index = dists.columns
dists.iloc[0:10, 0:10]

Title,ANNO: Mutationem,Alice: Madness Returns,Biped,Black Mesa,Borderlands 2,Borderlands: The Pre-Sequel,Celeste,Charlie's Adventure,Crab Game,DDraceNetwork
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ANNO: Mutationem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alice: Madness Returns,0.0,1.0,0.0,0.0,0.101535,0.0,0.0,0.0,0.0,0.0
Biped,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Black Mesa,0.0,0.0,0.0,1.0,0.079993,0.0,0.031846,0.0,0.043769,0.0
Borderlands 2,0.0,0.101535,0.0,0.079993,1.0,0.152302,0.024626,0.050767,0.050767,0.0
Borderlands: The Pre-Sequel,0.0,0.0,0.0,0.0,0.152302,1.0,0.121268,0.125,0.041667,0.0
Celeste,0.0,0.0,0.0,0.031846,0.024626,0.121268,1.0,0.0,0.040423,0.0
Charlie's Adventure,0.0,0.0,0.0,0.0,0.050767,0.125,0.0,1.0,0.0,0.0
Crab Game,0.0,0.0,0.0,0.043769,0.050767,0.041667,0.040423,0.0,1.0,0.0
DDraceNetwork,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
games_i_like = [title]
dists[games_i_like].head()

Title,Portal 2
Title,Unnamed: 1_level_1
ANNO: Mutationem,0.0
Alice: Madness Returns,0.0
Biped,0.0
Black Mesa,0.106991
Borderlands 2,0.14708


In [25]:
games_summed = np.sum(dists[games_i_like], axis=1)

In [26]:
games_summed = games_summed.sort_values(ascending=False)
games_summed

Title
Portal 2                                1.000000
Half-Life 2                             0.229039
Portal                                  0.223520
Half-Life                               0.161669
Borderlands 2                           0.147080
Black Mesa                              0.106991
Halo: The Master Chief Collection       0.098812
Human: Fall Flat                        0.096511
Celeste                                 0.087833
Tomb Raider                             0.081893
Little Nightmares                       0.078406
Neverwinter Nights: Enhanced Edition    0.064018
Death's Door                            0.064018
It Takes Two                            0.034219
Ori and the Blind Forest                0.032009
Crab Game                               0.030179
Halo Infinite                           0.027298
Borderlands: The Pre-Sequel             0.022634
Saints Row IV                           0.018107
Salt and Sanctuary                      0.000000
Time in Time  

In [27]:
ranked_games = games_summed.index[games_summed.index.isin(games_i_like)==False]
ranked_games = ranked_games.tolist()
ranked_games[:5]

['Half-Life 2', 'Portal', 'Half-Life', 'Borderlands 2', 'Black Mesa']

In [28]:
def get_similar(games, n=None):
    """
    calculates which games are most similar to the inputs. Must not return
    the games that were inputted.
    
    Parameters
    ----------
    games: list
        some games!
    
    Returns
    -------
    ranked_games: list
        rank ordered games
    """
    games = [game for game in games if game in dists.columns]
    games_summed = dists[games].apply(lambda row: np.sum(row), axis=1)
    games_summed = games_summed.sort_values(ascending=False)
    ranked_games = games_summed.index[games_summed.index.isin(games)==False]
    ranked_games = ranked_games.tolist()
    if n is None:
        return ranked_games
    else:
        return ranked_games[:n]

In [29]:
for game in get_similar(title, 10):
    print(game)

ANNO: Mutationem
Little Nightmares
Unravel Two
Trine 4: The Nightmare Prince
Tomb Raider
Time in Time
The Outer Worlds
Supraland
Salt and Sanctuary
Saints Row IV


In [30]:
for i, game in enumerate(get_similar(title, 10)):
    print("%d) %s" % (i+1, game))

1) ANNO: Mutationem
2) Little Nightmares
3) Unravel Two
4) Trine 4: The Nightmare Prince
5) Tomb Raider
6) Time in Time
7) The Outer Worlds
8) Supraland
9) Salt and Sanctuary
10) Saints Row IV


In [31]:
# Alien: Isolation

In [32]:
df[df.Title=="It Takes Two"]

Unnamed: 0,gameId,Title,Genre,Developer,Publisher,Franchise,Tags,Features,Price
52,1426210,It Takes Two,"Action, Adventure",Hazelight,Electronic Arts,EA Originals,"Co-op, Split Screen, Multiplayer, Adventure, P...","[Online Co-op, Shared/Split Screen Co-op, Stea...",It Takes Two Friend's Pass


In [2]:
df_tags = pd.read_csv('../data/processed/data.csv', index_col=0)
df_games = pd.read_csv('../data/raw/steamAPI_ownedGames.csv', index_col=0)

condition = [
    df_games['playtime_forever'].between(0, 5),
    df_games['playtime_forever'].between(5, 15),
    df_games['playtime_forever'].between(15, 60),
    df_games['playtime_forever'].between(60, 720),
    df_games['playtime_forever'].between(720, 7200),
    df_games['playtime_forever'].between(7200, 28800),
    df_games['playtime_forever'].between(28800, 86400),
    df_games['playtime_forever'].between(86400, 172800),
    df_games['playtime_forever'].between(172800, 518400)]

choices = [0, 2, 2.5, 3, 3.5, 4, 4.5, 4.7, 4.9]

df_games['rating'] = np.select(condition, choices, 5)

In [3]:
cm = CountVectorizer().fit_transform(df_tags.tags)

cs = cosine_similarity(cm)

In [30]:
def recommend(title):
    game_id = df_tags[df_tags.title == title].index[0]
    
    scores = list(enumerate(cs[game_id]))
    
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    sorted_scores = sorted_scores[1:]
    
    top=10
    recommendation=[]
    for item in sorted_scores[:top]:
        game_id_title = df_tags.loc[df_tags.index == item[0], ['appid', 'title']].values[0]
        recommendation.append(game_id_title)
    
    df_rec = pd.DataFrame(recommendation, columns=['appid', 'title'])

    df = pd.merge(df_rec, df_games, on=['appid'])

    df_wide = pd.pivot_table(df, index='appid', columns='steamid', values='rating', aggfunc=np.sum)

    df_wide = df_wide.fillna(0)

    dists = cosine_similarity(df_wide)

    dists = pd.DataFrame(dists, columns=df_wide.index)

    dists.index = dists.columns

    n = 10
    games = df[df['title']==title]

    games = [game for game in games if game in dists.columns]
    games_summed = dists[games].apply(lambda row: np.sum(row), axis=1)
    games_summed = games_summed.sort_values(ascending=False)
    ranked_games = games_summed.index[games_summed.index.isin(games)==False]
    ranked_games = ranked_games.to_list()
    if n is None:
        return ranked_games
    else:
        return ranked_games[:n]

In [32]:
recommend('Dota 2')

[9900, 212740, 223630, 235340, 236110, 339280, 360620, 427270, 795580, 1226470]