In [1]:
import pandas as pd
import numpy as np

import pymongo

from dotenv import load_dotenv
import os

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [90]:
download = False
if download:
    load_dotenv()
    mongo_username = os.getenv("mongo_username")
    mongo_password = os.getenv("mongo_password")
    
    client = pymongo.MongoClient(f"mongodb+srv://{mongo_username}:{mongo_password}@cluster0.d4ojg.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
    
    db = client.SteamDB
    
    collection = db.steamdb_games
    
    data = list(collection.find())
    
    df = pd.DataFrame(data)
    
    df.to_csv('../data/raw/steamAPI_gameTags.csv')
else:
    df = pd.read_csv('../data/raw/steamAPI_gameTags.csv', index_col=0)

In [167]:
df = pd.read_csv('../data/raw/steamAPI_gameTags.csv', index_col=0)

In [32]:
df.drop('_id', axis=1, inplace=True)

In [168]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5002 entries, 0 to 5001
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   appid   5002 non-null   int64 
 1   title   4714 non-null   object
 2   tags    5002 non-null   object
dtypes: int64(1), object(2)
memory usage: 156.3+ KB


In [169]:
df.tags = df.tags.apply(lambda x: ', '.join(x.strip('[]\'').split("', '")))

In [35]:
rating = {'Overwhelmingly Positive': 8, 'Very Positive': 4, 'Positive': 2,
          'Mostly Positive': 1, 'Mixed': 0, 'Mostly Negative': -1, 'Negative': -2}

In [116]:
df.Review = df.Review.apply(lambda x: x.strip('[]\'').split("', '")[0]).map(rating)

AttributeError: 'DataFrame' object has no attribute 'Review'

In [170]:
df.dropna(axis=0, inplace=True)

In [176]:
df = df.drop_duplicates(subset=['title'], keep='first')
df = df.drop_duplicates(subset=['appid'], keep='first')

In [172]:
df.title = df.title.apply(lambda x: ''.join(x.split('"')))

In [177]:
df = df.reset_index().drop('index', axis=1)

In [178]:
df

Unnamed: 0,appid,title,tags
0,730,Counter-Strike: Global Offensive,"FPS, Shooter, Multiplayer, Competitive, Action..."
1,218620,PAYDAY 2,"Co-op, Action, FPS, Heist, Looter Shooter, Onl..."
2,578080,PUBG: BATTLEGROUNDS,"Survival, Shooter, Multiplayer, Battle Royale,..."
3,304930,Unturned,"Free to Play, Survival, Zombies, Open World Su..."
4,550,Left 4 Dead 2,"Zombies, Co-op, FPS, Multiplayer, Shooter, Act..."
...,...,...,...
4558,43000,Front Mission Evolved,"Mechs, Action, Shooter, Third-Person Shooter, ..."
4559,423760,Hit Tank PRO,"Casual, Action, Indie, Strategy, Adventure, 2D..."
4560,2780,ARMA: Combat Operations,"Action, Simulation, FPS, Shooter, Tactical, Mi..."
4561,570,Dota 2,"Free to Play, MOBA, Multiplayer, Strategy, eSp..."


In [179]:
df.to_csv('../data/processed/data.csv')

In [100]:
df[df.index==1537].title

1537    Anomaly 2
Name: title, dtype: object

In [175]:
df.loc[df.title=='Dota 2 Player Profiles']

Unnamed: 0,appid,title,tags
1876,419910,Dota 2 Player Profiles,"Gaming, Free to Play, Documentary, Episodic, M..."


In [15]:
df.tags.isnull().sum()

0

In [16]:
cm = CountVectorizer().fit_transform(df.tags)

In [17]:
cs = cosine_similarity(cm)

In [18]:
cs

array([[1.        , 0.62622429, 0.68810235, ..., 0.20942695, 0.19364917,
        0.36514837],
       [0.62622429, 1.        , 0.672214  , ..., 0.31475579, 0.21221867,
        0.34299717],
       [0.68810235, 0.672214  , 1.        , ..., 0.51878585, 0.13325045,
        0.30151134],
       ...,
       [0.20942695, 0.31475579, 0.51878585, ..., 1.        , 0.20277678,
        0.22941573],
       [0.19364917, 0.21221867, 0.13325045, ..., 0.20277678, 1.        ,
        0.23570226],
       [0.36514837, 0.34299717, 0.30151134, ..., 0.22941573, 0.23570226,
        1.        ]])

In [20]:
title = 'DayZ'

game_id = df[df.title == title].index[0]

In [21]:
game_id

142

In [22]:
scores = list(enumerate(cs[game_id]))

In [23]:
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
sorted_scores = sorted_scores[1:]

In [24]:
sorted_scores

[(3054, 0.858545710548214),
 (159, 0.8484848484848484),
 (1476, 0.8484848484848484),
 (947, 0.8471737420873575),
 (78, 0.8257228238447707),
 (2039, 0.7909115788387002),
 (986, 0.7789621985578679),
 (24, 0.7706746355884525),
 (3779, 0.7699905035090181),
 (4341, 0.7693218186208297),
 (180, 0.7627700713964736),
 (3, 0.7624563678786219),
 (2560, 0.7566444492037343),
 (618, 0.755648171598035),
 (1534, 0.7543365091413575),
 (146, 0.7440729491417853),
 (819, 0.7434844114105212),
 (910, 0.7433046224826584),
 (2027, 0.7431505414602935),
 (1071, 0.7356123579206248),
 (2842, 0.7340317845665),
 (1159, 0.7309879850882872),
 (2221, 0.726599014550755),
 (170, 0.7265990145507549),
 (2564, 0.7265990145507549),
 (3438, 0.7259540086406279),
 (834, 0.7253235664820745),
 (385, 0.7237468644557459),
 (2841, 0.7237468644557457),
 (883, 0.7191012094028308),
 (1711, 0.7191012094028308),
 (821, 0.7154547587901783),
 (1450, 0.7154547587901782),
 (220, 0.7068454221751481),
 (3273, 0.7061878636037997),
 (383, 0.705

In [180]:
top=50
recommendation=[]
# print('The 10 most recommended games to', title, 'are:\n')
for item in sorted_scores[:top]:
    game_title = df.loc[df.index == item[0], ['appid', 'title']].values[0]
    recommendation.append(game_title)

In [37]:
df.loc[df.index == 1675, ['appid', 'title']]

Unnamed: 0,appid,title
1675,223220,Giana Sisters: Twisted Dreams


In [184]:
pd.DataFrame(recommendation, columns=['appid', 'title'])

Unnamed: 0,appid,title
0,248470,Doorways: Prelude
1,394360,Hearts of Iron IV
2,39160,Dungeon Siege III
3,243470,Watch_Dogs™
4,1046930,Dota Underlords
5,836630,Black Desert - Limited Package
6,17430,Need for Speed Undercover
7,203160,Tomb Raider
8,253330,Neverending Nightmares
9,642520,Trio
