In [2]:
import pandas as pd
import numpy as np

import pymongo

from dotenv import load_dotenv
import os

In [252]:
download = False
if download:
    load_dotenv()
    mongo_username = os.getenv("mongo_username")
    mongo_password = os.getenv("mongo_password")
    
    client = pymongo.MongoClient(f"mongodb+srv://{mongo_username}:{mongo_password}@cluster0.d4ojg.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
    
    db = client.SteamDB
    
    games_collection = db.steam_games
    user_collection = db.steam_users
    user_games_collection = db.steam_user_games
    user_reviews_collection = db.steam_user_reviews
    
    games_data = list(games_collection.find())
    users_data = list(user_collection.find())
    user_games_data = list(user_games_collection.find())
    user_reviews_data = list(user_reviews_collection.find())
    
    df_games = pd.DataFrame(games_data)
    df_users = pd.DataFrame(users_data)
    df_user_games = pd.DataFrame(user_games_data)
    df_user_reviews = pd.DataFrame(user_reviews_data)
    
    df_games.to_csv('../data/raw/games_data.csv')
    df_users.to_csv('../data/raw/users_data.csv')
    df_user_games.to_csv('../data/raw/user_games_data.csv')
    df_user_reviews.to_csv('../data/raw/user_reviews_data.csv')
else:
    df_games = pd.read_csv('../data/raw/games_data.csv', index_col=0)
    df_users = pd.read_csv('../data/raw/users_data.csv', index_col=0)
    df_user_games = pd.read_csv('../data/raw/user_games_data.csv', index_col=0)
    df_user_reviews = pd.read_csv('../data/raw/user_reviews_data.csv', index_col=0)

In [253]:
df_games.drop('_id', axis=1, inplace=True)
df_users.drop('_id', axis=1, inplace=True)
df_user_games.drop('_id', axis=1, inplace=True)
df_user_reviews.drop('_id', axis=1, inplace=True)

df_games.info()
df_users.info()
df_user_games.info()
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1417 entries, 0 to 1416
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gameId             1417 non-null   int64  
 1   Title              1417 non-null   object 
 2   Genre              1415 non-null   object 
 3   Developer          1417 non-null   object 
 4   Publisher          1413 non-null   object 
 5   Franchise          652 non-null    object 
 6   Release_Date       1411 non-null   object 
 7   Description        1417 non-null   object 
 8   Tags               1417 non-null   object 
 9   Features           1417 non-null   object 
 10  Price              1379 non-null   object 
 11  Recent_Reviews     1292 non-null   object 
 12  All_Reviews        1417 non-null   object 
 13  Meta_Critic_Score  544 non-null    float64
 14  Steam_Awards       281 non-null    object 
dtypes: float64(1), int64(1), object(13)
memory usage: 177.1+ KB
<class 'pand

In [238]:
df_user_reviews = df_user_reviews.copy()
df_user_reviews['Recomendation'] = np.where(df_user_reviews.Recomendation == "Recommended", 1, -1)

df_games_reviews = pd.merge(df_user_games, df_user_reviews, on=['gameId', 'userId'], how='left')

df_games_reviews['Recomendation'] = df_games_reviews['Recomendation'].fillna(0)
df_games_reviews['Review_Time'] = df_games_reviews['Review_Time'].fillna(0)
df_games_reviews['Comment'] = df_games_reviews['Comment'].fillna('')

## df_games_reviews.dropna(inplace=True)
## df_games_reviews.reset_index(drop=True, inplace=True)

df_games_reviews

Unnamed: 0,userId,gameId,Title,Hours_Played,Recomendation,Review_Time,Comment
0,benedict,200510,XCOM: Enemy Unknown,120,1.0,120.3,Went back to this game as a preparation for Xc...
1,benedict,289070,Sid Meier's Civilization VI,125,,0,
2,benedict,72850,The Elder Scrolls V: Skyrim,83,1.0,69.9,One of the best games ever
3,benedict,47540,Puzzle Quest 2,74,,0,
4,benedict,28050,Deus Ex: Human Revolution,36,,0,
...,...,...,...,...,...,...,...
596699,76561198106701525,291480,Warface,1.5,,0,
596700,76561198106701525,374320,DARK SOULS\u2122 III,1.4,,0,
596701,76561198106701525,444090,Paladins,0.4,,0,
596702,76561198106701525,1333910,Sizeable,0.3,,0,


In [254]:
df_test = df_games_reviews[df_games_reviews.Title.isin(['Team Fortress 2'])].sort_values('userId', axis=0)
df_test.sample(10)

df_games_reviews.Title.value_counts()

n = 250
top_n = df_games_reviews.Title.value_counts().index[:n]
df = df_games_reviews[df_games_reviews['Title'].isin(top_n)]
df.head()

df.loc[:, ['userId', 'Title', 'Hours_Played', 'Recomendation']][df['userId']=="011010010110111001101111"]

df_wide = pd.pivot_table(df, index='Title', columns='userId', values='Recomendation', aggfunc=np.sum)
df_wide.shape

df_wide = df_wide.fillna(0)

df_wide.iloc[:, 0:5]

# pd.Series(df_wide.index[:10])

userId,-Dutch-,-Maniac,-__7355608__-,-bak-,-scape-
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7 Days to Die,0.0,0.0,0.0,0.0,0.0
A Story About My Uncle,0.0,0.0,0.0,0.0,0.0
APB Reloaded,0.0,0.0,0.0,0.0,0.0
ARK: Survival Evolved,0.0,0.0,0.0,0.0,0.0
AdVenture Capitalist,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
Warframe,0.0,0.0,0.0,0.0,0.0
Warhammer: Vermintide 2,0.0,0.0,0.0,0.0,0.0
XCOM: Enemy Unknown,0.0,0.0,0.0,0.0,0.0
Z1 Battle Royale,0.0,0.0,0.0,0.0,0.0


## Find Similarities

In [242]:
# import distance methods
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances

In [243]:
dists = cosine_similarity(df_wide)
dists

array([[ 1.        , -0.05661385, -0.04622502, ...,  0.08006408,
         0.0541332 ,  0.03268602],
       [-0.05661385,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.04622502,  0.        ,  1.        , ...,  0.        ,
         0.048795  , -0.05892557],
       ...,
       [ 0.08006408,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.0541332 ,  0.        ,  0.048795  , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.03268602,  0.        , -0.05892557, ...,  0.        ,
         0.        ,  1.        ]])

In [244]:
dists = pd.DataFrame(dists, columns=df_wide.index)

dists.index = dists.columns
dists.iloc[0:10, 0:10]

Title,7 Days to Die,A Story About My Uncle,APB Reloaded,ARK: Survival Evolved,AdVenture Capitalist,Age of Empires II (2013),Aim Lab,Alien Swarm,Amnesia: The Dark Descent,Among Us
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7 Days to Die,1.0,-0.056614,-0.046225,0.079446,0.0,-0.03338903,0.0,0.0,-0.050637,0.013485
A Story About My Uncle,-0.056614,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111803,0.059549
APB Reloaded,-0.046225,0.0,1.0,0.035806,0.05892557,0.06019293,-0.102062,-0.109109,-0.091287,0.048622
ARK: Survival Evolved,0.079446,0.0,0.035806,1.0,0.0,-0.02586303,0.043853,0.0,0.0,0.010446
AdVenture Capitalist,0.0,0.0,0.058926,0.0,1.0,-1.460126e-18,0.0,0.0,0.0,0.034381
Age of Empires II (2013),-0.033389,0.0,0.060193,-0.025863,-1.460126e-18,1.0,0.0,0.078811,0.0,0.01756
Aim Lab,0.0,0.0,-0.102062,0.043853,0.0,0.0,1.0,0.133631,0.111803,0.059549
Alien Swarm,0.0,0.0,-0.109109,0.0,0.0,0.07881104,0.133631,1.0,0.119523,-0.03183
Amnesia: The Dark Descent,-0.050637,0.111803,-0.091287,0.0,0.0,0.0,0.111803,0.119523,1.0,0.053262
Among Us,0.013485,0.059549,0.048622,0.010446,0.03438071,0.01756008,0.059549,-0.03183,0.053262,1.0


In [245]:
games_i_like = ['Team Fortress 2', 'Portal']
dists[games_i_like].head()

Title,Team Fortress 2,Portal
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
7 Days to Die,0.038492,0.0
A Story About My Uncle,-0.014165,0.054554
APB Reloaded,0.034697,0.0
ARK: Survival Evolved,0.049693,0.019139
AdVenture Capitalist,0.024534,0.094491


In [246]:
games_summed = np.sum(dists[games_i_like], axis=1)

In [247]:
games_summed = games_summed.sort_values(ascending=False)
games_summed

Title
Team Fortress 2          1.105095
Portal                   1.105095
Half-Life 2              0.505640
Garry's Mod              0.490049
Half-Life                0.460194
                           ...   
Ring of Elysium         -0.016356
Darkest Dungeon\u00ae   -0.020032
Z1 Battle Royale        -0.027088
Quake Champions         -0.036126
Crusader Kings II       -0.046524
Length: 250, dtype: float64

In [248]:
ranked_games = games_summed.index[games_summed.index.isin(games_i_like)==False]
ranked_games = ranked_games.tolist()
ranked_games[:5]

['Half-Life 2',
 "Garry's Mod",
 'Half-Life',
 'Half-Life 2: Episode One',
 'Left 4 Dead 2']

In [249]:
def get_similar(games, n=None):
    """
    calculates which games are most similar to the inputs. Must not return
    the games that were inputted.
    
    Parameters
    ----------
    games: list
        some games!
    
    Returns
    -------
    ranked_games: list
        rank ordered games
    """
    games = [game for game in games if game in dists.columns]
    games_summed = dists[games].apply(lambda row: np.sum(row), axis=1)
    games_summed = games_summed.sort_values(ascending=False)
    ranked_games = games_summed.index[games_summed.index.isin(games)==False]
    ranked_games = ranked_games.tolist()
    if n is None:
        return ranked_games
    else:
        return ranked_games[:n]

In [250]:
for game in get_similar(["Alien: Isolation"], 10):
    print(game)

7 Days to Die
Project Zomboid
PUBG: BATTLEGROUNDS
Paladins
Papers, Please
Path of Exile
Pavlov VR
Phasmophobia
Plague Inc: Evolved
PlanetSide 2


In [251]:
for i, game in enumerate(get_similar(["Portal 2", "Team Fortress 2", "Garry's Mod"], 10)):
    print("%d) %s" % (i+1, game))

1) Left 4 Dead 2
2) Half-Life 2
3) Terraria
4) Portal
5) PAYDAY 2
6) Among Us
7) Counter-Strike: Global Offensive
8) Half-Life
9) Undertale
10) Don't Starve Together


## Movie Recommendations with Recsys