In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

### Data Preprocessing

#### DF4

In [2]:
# import metadata dataset

df4_cleaned = pd.read_csv('clean_df4.csv', index_col=0)
df4_cleaned.drop(columns=['url', 'release_date', 'reviews_url', 'early_access', 'sentiment'], inplace=True)
df4_cleaned.rename(columns={'id': 'item_id'}, inplace=True)

In [3]:
def sanitize(x):
    if isinstance(x, list):
        # strip spaces and convert all to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        # check if director exists
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ""

In [4]:
df4_cleaned['developer']

0                       ['kotoshiro']
1                ['secret level srl']
2                    ['poolians.com']
3                            ['彼岸领域']
4                                  []
                     ...             
31306          ['nikita "ghost_rus"']
31307                      ['sacada']
31308    ['laush dmitriy sergeevich']
31309               ['xropi,stev3ns']
31310                              []
Name: developer, Length: 31301, dtype: object

In [5]:
# Clean the metadata fields

df4_cleaned.fillna({'genres': '[]'}, inplace=True)
for feature in ['tags', 'genres', 'specs', 'developer', 'publisher']:
    df4_cleaned[feature] = df4_cleaned[feature].apply(literal_eval)
    df4_cleaned[feature] = df4_cleaned[feature].apply(sanitize)


In [6]:
df4_cleaned['tags']

0            [strategy, action, indie, casual, simulation]
1        [freetoplay, strategy, indie, rpg, cardgame, t...
2        [freetoplay, simulation, sports, casual, indie...
3                              [action, adventure, casual]
4                          [action, indie, casual, sports]
                               ...                        
31306                [strategy, indie, casual, simulation]
31307                            [strategy, indie, casual]
31308                          [indie, simulation, racing]
31309    [indie, casual, puzzle, singleplayer, atmosphe...
31310    [earlyaccess, adventure, indie, action, simula...
Name: tags, Length: 31301, dtype: object

In [7]:
# Create soup

def create_soup(x):
    return  ' '.join(x['genres']) + ' ' + ' '.join(x['tags']) + ' ' + ' '.join(x['specs']) + ' '+ ' '.join(x['publisher']) + ' ' +' '.join(x['developer'])

df4_cleaned['soup'] = df4_cleaned.apply(create_soup, axis=1)

#### DF2

In [8]:
df2_cleaned = pd.read_csv('clean_df2.csv', index_col=0)
df2_cleaned.drop(columns=['user_url'], inplace=True)

In [9]:
df2_df4_merge = pd.merge(df2_cleaned, df4_cleaned, left_on='item_id', right_on='item_id', how='inner')

In [10]:
df2_df4_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4288227 entries, 0 to 4288226
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   steam_id          int64  
 3   item_id           float64
 4   item_name         object 
 5   playtime_forever  float64
 6   playtime_2weeks   float64
 7   publisher         object 
 8   genres            object 
 9   title             object 
 10  tags              object 
 11  specs             object 
 12  price             float64
 13  developer         object 
 14  soup              object 
dtypes: float64(4), int64(2), object(9)
memory usage: 490.7+ MB


In [11]:
df2_cleaned.drop(columns=['steam_id'], inplace=True)

#### Merge DF

In [12]:
df2_df4_merge = pd.merge(df2_cleaned, df4_cleaned, left_on='item_id', right_on='item_id', how='inner')

In [13]:
# merge item_name and title -> title

df2_df4_merge["item_name_cleaned"] = df2_df4_merge['item_name'].fillna(df2_df4_merge['title'])

df2_df4_merge = df2_df4_merge.dropna(subset=['item_name', 'title'], how='all')

df2_df4_merge.drop(columns=['item_name', 'title'], inplace=True)
df2_df4_merge.rename(columns={'item_name_cleaned': 'title'}, inplace=True)

In [14]:
df4_cleaned['tags']

0            [strategy, action, indie, casual, simulation]
1        [freetoplay, strategy, indie, rpg, cardgame, t...
2        [freetoplay, simulation, sports, casual, indie...
3                              [action, adventure, casual]
4                          [action, indie, casual, sports]
                               ...                        
31306                [strategy, indie, casual, simulation]
31307                            [strategy, indie, casual]
31308                          [indie, simulation, racing]
31309    [indie, casual, puzzle, singleplayer, atmosphe...
31310    [earlyaccess, adventure, indie, action, simula...
Name: tags, Length: 31301, dtype: object

In [15]:
df2_df4_merge['developer']

0                                            [valve]
1                                            [valve]
2                                            [valve]
3                                            [valve]
4                                  [gearboxsoftware]
                             ...                    
4288222                             [reperiostudios]
4288223                                           []
4288224                          [coagucoindustries]
4288225                              [tamationgames]
4288226    [rocksteadystudios,feralinteractive(mac)]
Name: developer, Length: 4288227, dtype: object

In [16]:
df2_df4_merge.head()

Unnamed: 0,user_id,items_count,item_id,playtime_forever,playtime_2weeks,publisher,genres,tags,specs,price,developer,soup,title
0,76561197970982479,277,10.0,6.0,0.0,[valve],[action],"[action, fps, multiplayer, shooter, classic, t...","[multi-player, valveanti-cheatenabled]",9.99,[valve],action action fps multiplayer shooter classic ...,Counter-Strike
1,76561197970982479,277,20.0,0.0,0.0,[valve],[action],"[action, fps, multiplayer, classic, shooter, c...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action action fps multiplayer classic shooter ...,Team Fortress Classic
2,76561197970982479,277,30.0,7.0,0.0,[valve],[action],"[fps, worldwarii, multiplayer, action, shooter...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action fps worldwarii multiplayer action shoot...,Day of Defeat
3,76561197970982479,277,40.0,0.0,0.0,[valve],[action],"[action, fps, multiplayer, classic, shooter, f...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action action fps multiplayer classic shooter ...,Deathmatch Classic
4,76561197970982479,277,50.0,0.0,0.0,[valve],[action],"[fps, action, sci-fi, singleplayer, classic, s...","[single-player, multi-player, valveanti-cheate...",4.99,[gearboxsoftware],action fps action sci-fi singleplayer classic ...,Half-Life: Opposing Force


In [17]:
df2_df4_merge.dropna(subset=['soup', 'item_id'], inplace=True)

In [18]:
df2_df4_merge.shape

(4271421, 13)

In [19]:
len(df2_df4_merge)

4271421

In [20]:
df2_df4_merge.head()

Unnamed: 0,user_id,items_count,item_id,playtime_forever,playtime_2weeks,publisher,genres,tags,specs,price,developer,soup,title
0,76561197970982479,277,10.0,6.0,0.0,[valve],[action],"[action, fps, multiplayer, shooter, classic, t...","[multi-player, valveanti-cheatenabled]",9.99,[valve],action action fps multiplayer shooter classic ...,Counter-Strike
1,76561197970982479,277,20.0,0.0,0.0,[valve],[action],"[action, fps, multiplayer, classic, shooter, c...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action action fps multiplayer classic shooter ...,Team Fortress Classic
2,76561197970982479,277,30.0,7.0,0.0,[valve],[action],"[fps, worldwarii, multiplayer, action, shooter...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action fps worldwarii multiplayer action shoot...,Day of Defeat
3,76561197970982479,277,40.0,0.0,0.0,[valve],[action],"[action, fps, multiplayer, classic, shooter, f...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action action fps multiplayer classic shooter ...,Deathmatch Classic
4,76561197970982479,277,50.0,0.0,0.0,[valve],[action],"[fps, action, sci-fi, singleplayer, classic, s...","[single-player, multi-player, valveanti-cheate...",4.99,[gearboxsoftware],action fps action sci-fi singleplayer classic ...,Half-Life: Opposing Force


In [21]:
df2_df4_merge.drop_duplicates(subset=['user_id', 'item_id'], keep='first', inplace=True)

In [22]:
df2_df4_merge.duplicated(subset=['user_id', 'item_id']).sum()

np.int64(0)

In [23]:
df2_df4_merge.shape

(4222232, 13)

In [24]:
items_df = df2_df4_merge.drop_duplicates(subset=['item_id'], keep='first').reset_index(drop=True)[['item_id', 'title', 'soup']]

## Data Featuring

#### Make implicit rating

In [25]:
df2_df4_merge['ratings']  = df2_df4_merge['playtime_forever'].apply(lambda x: np.log1p(x))

In [26]:
df2_df4_merge['ratings']

0          1.945910
1          0.000000
2          2.079442
3          0.000000
4          0.000000
             ...   
4288221    3.784190
4288222    0.000000
4288223    0.000000
4288224    1.386294
4288225    1.609438
Name: ratings, Length: 4222232, dtype: float64

In [27]:
df2_df4_merge['playtime_2weeks'].max()

np.float64(19967.0)

Doesn't make sense since 19967 minutes = 332 hours = 13.8 days -> Play 2 weeks consecutively

In [28]:
df2_df4_merge = df2_df4_merge.loc[df2_df4_merge['playtime_2weeks'] < (14*3*60)] # maximum average 3 hours a day in 2 weeks

In [29]:
# Use playtime data to create ratings

df2_df4_merge['ratings'] = df2_df4_merge['playtime_forever'].apply(lambda x: np.log1p(x)) + df2_df4_merge['playtime_2weeks'].apply(lambda x: np.log1p(x))

In [30]:
df2_df4_merge['ratings'] # The higher rating, the more user likes the game

0          1.945910
1          0.000000
2          2.079442
3          0.000000
4          0.000000
             ...   
4288221    7.568379
4288222    0.000000
4288223    0.000000
4288224    2.772589
4288225    3.218876
Name: ratings, Length: 4219566, dtype: float64

## Model Building

### 1, Game Similarity Calculation

Using Cosine Similarity Matrix

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
def content_recommender(title, cosine_sim, df):
    # get the index of the movie that matches the title
    idx = df.index[df['title']==title].tolist()[0]
   
    # get the pairwsie similarity scores of all movies with input movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the scores of the 10 most similar movies (ignore the first movie).
    sim_scores = sim_scores[1:11]

    # get the movie indices
    game_indices = [i[0] for i in sim_scores]

    df['sim'] = cosine_sim[idx]

    # return the top 10 most similar movies
    return df.iloc[game_indices, :][['title', 'sim']]

In [33]:
# Feature Extraction using soup

tfidf = TfidfVectorizer(stop_words='english')
best_tfidf = tfidf.fit_transform(items_df['soup'])
best_matrix = best_tfidf

In [34]:
best_matrix.shape

(8990, 7009)

In [35]:
best_cosine_sim = cosine_similarity(best_matrix, best_matrix)

In [36]:
items_df['title']

0                                     Counter-Strike
1                              Team Fortress Classic
2                                      Day of Defeat
3                                 Deathmatch Classic
4                          Half-Life: Opposing Force
                            ...                     
8985                                    Mimpi Dreams
8986    NOBUNAGA'S AMBITION: Souzou SengokuRisshiden
8987                                      ChaosTower
8988                   Aveyond 4: Shadow Of The Mist
8989                                   Arachnophobia
Name: title, Length: 8990, dtype: object

In [37]:
game_title = "Counter-Strike"

In [38]:
content_recommender(game_title, best_cosine_sim, items_df)

Unnamed: 0,title,sim
231,Counter-Strike: Condition Zero,0.729891
9,Counter-Strike: Source,0.702722
2,Day of Defeat,0.661861
139,Counter-Strike: Global Offensive,0.644215
21,Half-Life Deathmatch: Source,0.623832
3,Deathmatch Classic,0.61996
1,Team Fortress Classic,0.609159
19,Half-Life 2: Deathmatch,0.586521
8,Day of Defeat: Source,0.581412
35,Left 4 Dead,0.552508


### 2, Game Recommendation

#### a, Item-item: Assume the top-played games of an user as preference, then use similarity calculation to suggest games

In [39]:
def get_game_id(title, df):
    game_id = df.loc[df['title'] == title, 'id']
    if game_id.empty:
        return None
    return game_id.values[0]

In [40]:
def k_neighbors(game_id, df, n=10):
    if game_id not in df['item_id'].values:
        return None
    
    return_df = content_recommender(df.loc[df['item_id'] == game_id, 'title'].values[0], best_cosine_sim, df)

    return return_df.head(n)


In [41]:
def user_game_recommendation(user_id, n=10):
    if user_id not in df2_df4_merge['user_id'].values:
        return None
    
    recommendations = []
    
    games = df2_df4_merge.loc[df2_df4_merge['user_id'] == user_id].sort_values(['playtime_2weeks', 'playtime_forever'], ascending=False)
    if len(games) > 3:
        games = games.head(3)
        i = n // 3
    else:
        i = n // len(games)
    games = games['item_id'].values
    for game in games:
        recs_game = k_neighbors(game, items_df, i)
        recommendations.append(recs_game)

    return pd.concat(recommendations).drop_duplicates().reset_index(drop=True)

In [42]:
items_df.loc[items_df['title'] == "Mimpi Dreams"]

Unnamed: 0,item_id,title,soup,sim
8985,444770.0,Mimpi Dreams,adventure indie indie adventure cute puzzle-pl...,0.001762


In [43]:
df2_df4_merge['user_id']

0          76561197970982479
1          76561197970982479
2          76561197970982479
3          76561197970982479
4          76561197970982479
                 ...        
4288221    76561198329548331
4288222    76561198329548331
4288223    76561198329548331
4288224    76561198329548331
4288225    76561198329548331
Name: user_id, Length: 4219566, dtype: object

In [44]:
user_game_recommendation("76561197970982479")

Unnamed: 0,title,sim
0,Counter-Strike: Source,0.76611
1,Counter-Strike: Condition Zero,0.715153
2,Day of Defeat,0.682087
3,Red Orchestra: Ostfront 41-45,0.862472
4,Killing Floor,0.669118
5,Killing Floor 2,0.661986
6,Sid Meier's Civilization: Beyond Earth,0.795172
7,Sid Meier's Civilization IV,0.56319
8,Geometry Wars 3: Dimensions Evolved,0.532996


These results are so close to the top-played game: Counter-Strike

#### b, Combine with user ratings (made from playtime data)

In [45]:
df2_df4_merge.head()

Unnamed: 0,user_id,items_count,item_id,playtime_forever,playtime_2weeks,publisher,genres,tags,specs,price,developer,soup,title,ratings
0,76561197970982479,277,10.0,6.0,0.0,[valve],[action],"[action, fps, multiplayer, shooter, classic, t...","[multi-player, valveanti-cheatenabled]",9.99,[valve],action action fps multiplayer shooter classic ...,Counter-Strike,1.94591
1,76561197970982479,277,20.0,0.0,0.0,[valve],[action],"[action, fps, multiplayer, classic, shooter, c...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action action fps multiplayer classic shooter ...,Team Fortress Classic,0.0
2,76561197970982479,277,30.0,7.0,0.0,[valve],[action],"[fps, worldwarii, multiplayer, action, shooter...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action fps worldwarii multiplayer action shoot...,Day of Defeat,2.079442
3,76561197970982479,277,40.0,0.0,0.0,[valve],[action],"[action, fps, multiplayer, classic, shooter, f...","[multi-player, valveanti-cheatenabled]",4.99,[valve],action action fps multiplayer classic shooter ...,Deathmatch Classic,0.0
4,76561197970982479,277,50.0,0.0,0.0,[valve],[action],"[fps, action, sci-fi, singleplayer, classic, s...","[single-player, multi-player, valveanti-cheate...",4.99,[gearboxsoftware],action fps action sci-fi singleplayer classic ...,Half-Life: Opposing Force,0.0


In [46]:
items_df.head()

Unnamed: 0,item_id,title,soup,sim
0,10.0,Counter-Strike,action action fps multiplayer shooter classic ...,0.077537
1,20.0,Team Fortress Classic,action action fps multiplayer classic shooter ...,0.077093
2,30.0,Day of Defeat,action fps worldwarii multiplayer action shoot...,0.128318
3,40.0,Deathmatch Classic,action action fps multiplayer classic shooter ...,0.043219
4,50.0,Half-Life: Opposing Force,action fps action sci-fi singleplayer classic ...,0.062242


In [47]:
item_list = items_df["item_id"].astype(float).tolist()
item_list

[10.0,
 20.0,
 30.0,
 40.0,
 50.0,
 60.0,
 70.0,
 130.0,
 300.0,
 240.0,
 3830.0,
 2630.0,
 3900.0,
 3920.0,
 6400.0,
 6910.0,
 7670.0,
 409710.0,
 220.0,
 320.0,
 340.0,
 360.0,
 380.0,
 400.0,
 420.0,
 11450.0,
 7940.0,
 4700.0,
 12900.0,
 13250.0,
 16100.0,
 15700.0,
 15710.0,
 17330.0,
 22000.0,
 500.0,
 4560.0,
 17460.0,
 10500.0,
 24740.0,
 22200.0,
 26800.0,
 1250.0,
 35420.0,
 3590.0,
 8880.0,
 8890.0,
 35700.0,
 10140.0,
 35140.0,
 29180.0,
 15520.0,
 32370.0,
 37700.0,
 6020.0,
 550.0,
 8980.0,
 41500.0,
 20900.0,
 10180.0,
 17450.0,
 3170.0,
 25900.0,
 24980.0,
 8850.0,
 409720.0,
 46000.0,
 33230.0,
 20820.0,
 47700.0,
 24960.0,
 8190.0,
 49600.0,
 21090.0,
 33930.0,
 17410.0,
 50130.0,
 62100.0,
 31280.0,
 57300.0,
 8930.0,
 41000.0,
 41050.0,
 41060.0,
 227780.0,
 18040.0,
 22380.0,
 42700.0,
 62000.0,
 40800.0,
 9500.0,
 18700.0,
 58200.0,
 70400.0,
 41300.0,
 41800.0,
 42500.0,
 49900.0,
 55000.0,
 63200.0,
 620.0,
 42910.0,
 15500.0,
 18500.0,
 26500.0,
 35460.0,
 3870

In [48]:
item_order = {item: order for item, order in enumerate(item_list)}
item_order

{0: 10.0,
 1: 20.0,
 2: 30.0,
 3: 40.0,
 4: 50.0,
 5: 60.0,
 6: 70.0,
 7: 130.0,
 8: 300.0,
 9: 240.0,
 10: 3830.0,
 11: 2630.0,
 12: 3900.0,
 13: 3920.0,
 14: 6400.0,
 15: 6910.0,
 16: 7670.0,
 17: 409710.0,
 18: 220.0,
 19: 320.0,
 20: 340.0,
 21: 360.0,
 22: 380.0,
 23: 400.0,
 24: 420.0,
 25: 11450.0,
 26: 7940.0,
 27: 4700.0,
 28: 12900.0,
 29: 13250.0,
 30: 16100.0,
 31: 15700.0,
 32: 15710.0,
 33: 17330.0,
 34: 22000.0,
 35: 500.0,
 36: 4560.0,
 37: 17460.0,
 38: 10500.0,
 39: 24740.0,
 40: 22200.0,
 41: 26800.0,
 42: 1250.0,
 43: 35420.0,
 44: 3590.0,
 45: 8880.0,
 46: 8890.0,
 47: 35700.0,
 48: 10140.0,
 49: 35140.0,
 50: 29180.0,
 51: 15520.0,
 52: 32370.0,
 53: 37700.0,
 54: 6020.0,
 55: 550.0,
 56: 8980.0,
 57: 41500.0,
 58: 20900.0,
 59: 10180.0,
 60: 17450.0,
 61: 3170.0,
 62: 25900.0,
 63: 24980.0,
 64: 8850.0,
 65: 409720.0,
 66: 46000.0,
 67: 33230.0,
 68: 20820.0,
 69: 47700.0,
 70: 24960.0,
 71: 8190.0,
 72: 49600.0,
 73: 21090.0,
 74: 33930.0,
 75: 17410.0,
 76: 501

In [49]:
user_id = '76561197970982479'

In [50]:
item_list = items_df["item_id"].astype(str).tolist()
item_order = {item: order for order, item in enumerate(item_list)}

user_ratings = df2_df4_merge[['user_id', 'item_id', 'ratings']]
user_ratings = user_ratings.loc[user_ratings['user_id'] == user_id, :]
user_ratings['item_id'] = user_ratings['item_id'].astype(str)

In [51]:
# Make rt_matrix

user_ratings = user_ratings.groupby("item_id", as_index=False)["ratings"].sum()
user_ratings


Unnamed: 0,item_id,ratings
0,10.0,1.945910
1,10140.0,3.258097
2,10180.0,7.542744
3,104700.0,3.806662
4,10500.0,5.231109
...,...,...
226,9480.0,0.000000
227,9500.0,1.945910
228,96100.0,2.302585
229,98200.0,2.302585


In [52]:
matrix_order = user_ratings['item_id'].map(item_order).to_numpy()
rt_matrix = np.zeros(len(item_list), dtype=np.float32)
rt_matrix[matrix_order] = user_ratings['ratings']

In [53]:
rt_matrix

array([1.9459101, 0.       , 2.0794415, ..., 0.       , 0.       ,
       0.       ], shape=(8990,), dtype=float32)

In [54]:
# Make function for convenience

def make_user_ratings(df, user_id):
    user_ratings = df[['user_id', 'item_id', 'ratings']]
    user_ratings = user_ratings.loc[user_ratings['user_id'] == user_id, :]
    user_ratings['item_id'] = user_ratings['item_id'].astype(str)

    # Make orders to map
    user_ratings = user_ratings.groupby("item_id", as_index=False)["ratings"].sum()
    matrix_order = user_ratings['item_id'].map(item_order).to_numpy()

    # Make rt_matrix
    rt_matrix = np.zeros(len(item_list), dtype=np.float32)
    rt_matrix[matrix_order] = user_ratings['ratings']

    return rt_matrix
    

In [55]:
make_user_ratings(df2_df4_merge, user_id)

array([1.9459101, 0.       , 2.0794415, ..., 0.       , 0.       ,
       0.       ], shape=(8990,), dtype=float32)

In [56]:
# adjust cosine = user-item rt_matrix @ item-item cosine similarity

def adjust_cosine(df, user_id, raw_cosine):

    rt_matrix = make_user_ratings(df, user_id)
    adjust_sims = rt_matrix @ raw_cosine

    return adjust_sims

In [57]:
cosine = adjust_cosine(df2_df4_merge, user_id, best_cosine_sim)

In [58]:
# exclude_played_games=True: do not include games which this user played in the recommendation list

def make_prediction(df, user_id, raw_cosine, exclude_played_games=True, k=10):

    cosine = adjust_cosine(df, user_id, raw_cosine)

    sorted_items = pd.DataFrame({"item_id": items_df['item_id'].values, "sims": cosine})
    sorted_items = sorted_items.sort_values(by=['sims'], ascending=False)

    played_game_ids = (df.loc[df['user_id'] == user_id, 'item_id'].unique())
    if exclude_played_games is True:
        sorted_new_games = sorted_items[~sorted_items['item_id'].isin(played_game_ids)]
        
    else:
        sorted_new_games = sorted_items

    top_k_item = sorted_new_games['item_id'].head(k).astype(np.float64)
    recommended_games = items_df.set_index("item_id").loc[top_k_item].reset_index()

    return recommended_games[['title']]


In [59]:
make_prediction(df2_df4_merge, user_id, best_cosine_sim, exclude_played_games=True, k=10)

Unnamed: 0,title
0,Serious Sam 2
1,Call of Duty: Black Ops II
2,Tom Clancy's Rainbow Six: Vegas 2
3,Sanctum 2
4,Sanctum
5,Counter-Strike: Condition Zero
6,Tom Clancy's Splinter Cell Blacklist
7,Arma 3
8,Tom Clancy's The Division
9,Aliens vs. Predator


The results are close to Counter-Strike, while this user didn't play them (e.g: this (user, item) pair is not in the dataset)

## Model Evaluation/Validation

### a, Check truly recommended games' occurrences

In [60]:
val_ds = pd.read_csv("cleaned_merged_all_data.csv")

In [61]:
val_ds.loc[val_ds['user_id'] == user_id]

Unnamed: 0,user_id,review_text,posted,item_id,recommend,item_name,playtime_forever,playtime_2weeks,publisher,genres,title,tags,specs,price,developer
163,76561197970982479,It's unique and worth a playthrough.,2011-07-15,22200.0,True,Zeno Clash,271,0,ACE Team,"['Action', 'Indie']",Zeno Clash,"['Action', 'Indie', 'Surreal', ""Beat 'em up"", ...","['Single-player', 'Steam Achievements', 'Steam...",9.99,ACE Team
239,76561197970982479,Simple yet with great replayability. In my opi...,2011-11-05,1250.0,True,Killing Floor,10006,0,Tripwire Interactive,['Action'],Killing Floor,"['FPS', 'Zombies', 'Co-op', 'Survival', 'Actio...","['Single-player', 'Multi-player', 'Co-op', 'Cr...",19.99,Tripwire Interactive


In [62]:
make_prediction(df2_df4_merge, user_id, best_cosine_sim, exclude_played_games=False, k=20)

Unnamed: 0,title
0,Serious Sam 2
1,Call of Duty: Black Ops II
2,Portal 2
3,Borderlands
4,Left 4 Dead 2
5,Left 4 Dead
6,Tom Clancy's Rainbow Six: Vegas 2
7,Sanctum 2
8,Sanctum
9,RAGE


In [63]:
user_game_recommendation("76561197970982479")

Unnamed: 0,title,sim
0,Counter-Strike: Source,0.76611
1,Counter-Strike: Condition Zero,0.715153
2,Day of Defeat,0.682087
3,Red Orchestra: Ostfront 41-45,0.862472
4,Killing Floor,0.669118
5,Killing Floor 2,0.661986
6,Sid Meier's Civilization: Beyond Earth,0.795172
7,Sid Meier's Civilization IV,0.56319
8,Geometry Wars 3: Dimensions Evolved,0.532996


So Killing Floor is in top 10 (top 4 when using games' metadata only and top 6 when using games' metadata + playtime), but Zeno Clash is not in top 20. The result may be he just played this game 271 minutes, but still clicked on "recommended".

In [64]:
df2_df4_merge[(df2_df4_merge['title'] == 'Zeno Clash') & (df2_df4_merge['user_id'] == user_id)]

Unnamed: 0,user_id,items_count,item_id,playtime_forever,playtime_2weeks,publisher,genres,tags,specs,price,developer,soup,title,ratings
40,76561197970982479,277,22200.0,271.0,0.0,[aceteam],"[action, indie]","[action, indie, surreal, beat'emup, fps, fight...","[single-player, steamachievements, steamtradin...",9.99,[aceteam],action indie action indie surreal beat'emup fp...,Zeno Clash,5.605802


### b, Compare the metadata of truly recommended games and predicted games

In [65]:
prediction = make_prediction(df2_df4_merge, user_id, best_cosine_sim, exclude_played_games=True, k=10)

In [66]:
prediction_games = prediction['title'].tolist()
validation = df2_df4_merge.loc[df2_df4_merge['title'].isin(prediction_games), ['title', 'genres', 'tags', 'specs']]
validation = validation.drop_duplicates(subset=['title'])

In [67]:
validation

Unnamed: 0,title,genres,tags,specs
232,Counter-Strike: Condition Zero,[action],"[action, fps, shooter, multiplayer, singleplay...","[single-player, multi-player, valveanti-cheate..."
342,Sanctum,"[action, casual, indie, strategy]","[towerdefense, strategy, fps, action, co-op, i...","[single-player, multi-player, co-op, cross-pla..."
523,Sanctum 2,"[action, indie, strategy]","[towerdefense, fps, co-op, strategy, action, i...","[single-player, multi-player, co-op, steamachi..."
1656,Call of Duty: Black Ops II,[action],"[action, multiplayer, fps, shooter, first-pers...","[single-player, multi-player, co-op, steamachi..."
1898,Aliens vs. Predator,[action],"[action, fps, aliens, sci-fi, multiplayer, hor...","[single-player, multi-player, co-op, steamachi..."
1974,Serious Sam 2,"[action, indie]","[action, fps, co-op, comedy, multiplayer, firs...","[single-player, multi-player, co-op, partialco..."
2116,Arma 3,"[action, simulation, strategy]","[simulation, military, multiplayer, realistic,...","[single-player, multi-player, onlinemulti-play..."
2125,Tom Clancy's The Division,"[action, adventure, rpg]","[openworld, third-personshooter, multiplayer, ...","[single-player, multi-player, co-op, steamtrad..."
2244,Tom Clancy's Rainbow Six: Vegas 2,[action],"[action, tactical, fps, co-op, onlineco-op, mu...","[single-player, multi-player, co-op]"
4178,Tom Clancy's Splinter Cell Blacklist,"[action, adventure]","[stealth, action, co-op, thirdperson, multipla...","[single-player, multi-player, co-op]"


In [68]:
recommended_game = 'Killing Floor'
val_row = val_ds.loc[val_ds['title'] == recommended_game, ['title', 'genres', 'tags', 'specs']].iloc[[0]]
for feature in ['tags', 'genres', 'specs']:
    val_row[feature] = val_row[feature].apply(literal_eval)
    val_row[feature] = val_row[feature].apply(sanitize)

In [69]:
def check_metadata_occ(validation_df, val_row):
    val_genres = val_row['genres'].tolist()[0]
    val_tags = val_row['tags'].tolist()[0]
    val_specs = val_row['specs'].tolist()[0]

    def check(details, name):
        count = 0
        sum = 0
        lst = validation_df[name].tolist()
        for detail in details:
            count = 0
            for i in range(len(lst)):
                if detail in lst[i]:
                    count+=1
                
            if count > 5:
                sum+=1
        return sum
        
    sum_genres = check(val_genres, 'genres')
    genres_percentage = sum_genres / len(val_genres)

    sum_tags = check(val_tags, 'tags')
    tags_percentage = sum_tags / len(val_tags)

    sum_specs = check(val_specs, 'specs')
    specs_percentage = sum_specs / len(val_specs)

    print(f'genres: {genres_percentage * 100}%')
    print(f'tags: {tags_percentage * 100}%')
    print(f'specs: {specs_percentage * 100}%')
    return 

In [70]:
check_metadata_occ(validation, val_row)

genres: 100.0%
tags: 50.0%
specs: 30.0%


So, for every single genres/tags/specs of the Killing Floor, I calculate how many times they appear in at least 5 out of 10 predicted games. The results are so good: 100% for genres, 50% and 50% for tags and specs. It proves that the truly recommended game is relevant to predicted games

Another example

In [71]:
df2_df4_merge['user_id'].unique()

array(['76561197970982479', 'js41637', 'evcentric', ...,
       '76561198323066619', '76561198326700687', '76561198329548331'],
      shape=(68700,), dtype=object)

In [72]:
prediction = make_prediction(df2_df4_merge, "js41637", best_cosine_sim, exclude_played_games=True, k=10)

In [73]:
prediction_games = prediction['title'].tolist()
validation = df2_df4_merge.loc[df2_df4_merge['title'].isin(prediction_games), ['title', 'genres', 'tags', 'specs']]
validation = validation.drop_duplicates(subset=['title'])
validation

Unnamed: 0,title,genres,tags,specs
1084,FortressCraft Evolved,"[adventure, casual, indie, rpg, simulation, st...","[sandbox, crafting, building, survival, indie,...","[single-player, multi-player, onlinemulti-play..."
1090,Starbound,"[action, adventure, casual, indie, rpg]","[sandbox, adventure, survival, indie, crafting...","[single-player, multi-player, onlinemulti-play..."
1770,Dead Rising 3,[action],"[zombies, action, openworld, co-op, gore, come...","[single-player, multi-player, co-op, steamachi..."
1797,The Escapists,"[action, adventure, indie, simulation, strategy]","[pixelgraphics, strategy, crafting, singleplay...","[single-player, steamachievements, steamtradin..."
1974,Serious Sam 2,"[action, indie]","[action, fps, co-op, comedy, multiplayer, firs...","[single-player, multi-player, co-op, partialco..."
3893,Labyronia RPG,"[action, adventure, indie, rpg]","[rpgmaker, rpg, adventure, anime, action, indi...","[single-player, steamachievements, fullcontrol..."
9192,Deadfall Adventures,"[action, adventure]","[adventure, action, fps, shooter, first-person...","[single-player, multi-player, co-op, steamachi..."
16714,Secret Of Magia,"[action, adventure, indie, rpg]","[rpgmaker, anime, rpg, indie, adventure, actio...","[single-player, steamachievements, fullcontrol..."
55811,Selknam Defense,"[action, indie, strategy]","[strategy, towerdefense, indie, action, advent...","[single-player, steamachievements, fullcontrol..."
78447,A Walk in the Dark,"[action, indie]","[indie, platformer, action, difficult, 2d, gre...","[single-player, steamachievements, fullcontrol..."


In [74]:
val_ds.loc[val_ds['user_id'] == 'js41637', :]

Unnamed: 0,user_id,review_text,posted,item_id,recommend,item_name,playtime_forever,playtime_2weeks,publisher,genres,title,tags,specs,price,developer
2054,js41637,For a simple (it's actually not all that simpl...,2013-09-08,227300.0,True,Euro Truck Simulator 2,551,0,SCS Software,"['Indie', 'Simulation']",Euro Truck Simulator 2,"['Simulation', 'Driving', 'Open World', 'Reali...","['Single-player', 'Steam Achievements', 'Steam...",19.99,SCS Software
2747,js41637,Very fun little game to play when your bored o...,2013-11-29,239030.0,True,"Papers, Please",349,0,3909,"['Adventure', 'Indie']","Papers, Please","['Indie', 'Political', 'Simulation', 'Point & ...","['Single-player', 'Steam Achievements', 'Steam...",9.99,3909


In [75]:
recommended_game = 'Euro Truck Simulator 2'
val_row = val_ds.loc[val_ds['title'] == recommended_game, ['title', 'genres', 'tags', 'specs']].iloc[[0]]
for feature in ['tags', 'genres', 'specs']:
    val_row[feature] = val_row[feature].apply(literal_eval)
    val_row[feature] = val_row[feature].apply(sanitize)

check_metadata_occ(validation, val_row)

genres: 50.0%
tags: 25.0%
specs: 50.0%
