Inspired by Andrew Ng's lectures

### Import Libraries

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm


### Load data

In [7]:
# Load the flat ratings table: one row per vote, with gameid / user / rating columns.
df = pd.read_csv('csv/ratings_reduced.csv')
# Report how many distinct games and users appear, and the total number of votes (rows).
print(f"In this dataset there are {len(df.gameid.unique())} games, {len(df.user.unique())} users and {len(df)} total votes")
# Last expression of the cell: show the first two rows as a preview.
df.head(2)

In this dataset there are 554 games, 148308 users and 4946868 total votes


Unnamed: 0,gameid,user,rating
0,188,-Johnny-,5.0
1,188,-Morphling-,7.0


In [8]:
# Lookup table mapping each gameid to its title; only these two columns
# of the games file are kept.
dfnames = pd.read_csv('csv/1000games_complete.csv')[['gameid', 'title']]
# Preview the first two rows.
dfnames.head(2)

Unnamed: 0,gameid,title
0,224517,Brass: Birmingham
1,161936,Pandemic Legacy: Season 1


## Collaborative Filtering

### Preparation
First, let's create a matrix with one row per game and one column per user.

In [9]:
def transform(df):
    """
    Convert the 'flat' ratings table into game x user matrices.

    Parameters
    ----------
    df : pandas.DataFrame
        Flat table with 'gameid', 'user' and 'rating' columns, at most one
        row per (gameid, user) pair (``DataFrame.pivot`` raises ValueError
        on duplicates).

    Returns
    -------
    Y : ndarray, shape (#games, #users)
        Ratings; cells the user did not rate are filled with 0.
    Ymean : ndarray, shape (#games, 1)
        Average of the *given* ratings of every game (0 for a game that has
        no rating at all).
    Ynorm : ndarray, shape (#games, #users)
        Mean-normalised ratings ``Y - Ymean`` where a rating exists,
        0 elsewhere.
    R : ndarray of int, shape (#games, #users)
        1 if the user rated the game, 0 otherwise.

    Rows follow the sorted gameids and columns the sorted users, as produced
    by ``DataFrame.pivot``.
    """
    pivot_df = df.pivot(index='gameid', columns='user', values='rating')
    ratings = pivot_df.to_numpy(dtype=float)

    # "Rated" is decided by the presence of a value, never by the value
    # itself: a genuine rating of 0 must still count as rated.
    rated = ~np.isnan(ratings)
    R = rated.astype(int)
    Y = np.where(rated, ratings, 0.0)

    # Mean normalization over rated cells only.  Unrated cells are 0 in Y, so
    # a plain row sum divided by the number of rated cells gives the mean.
    rated_counts = rated.sum(axis=1, keepdims=True)
    rating_sums = Y.sum(axis=1, keepdims=True)
    Ymean = np.divide(
        rating_sums,
        rated_counts,
        out=np.zeros_like(rating_sums),
        where=rated_counts > 0,  # avoid 0/0 for a game without any rating
    )

    # Subtract the game mean only where a rating exists; the rest stays 0.
    Ynorm = np.where(rated, Y - Ymean, 0.0)
    return Y, Ymean, Ynorm, R

In [10]:
# Create the matrices: ratings (Y), per-game means (Ymean),
# mean-normalised ratings (Ynorm) and the rated/not-rated indicator (R).
Y, Ymean, Ynorm, R = transform(df)

In [11]:
# Average rating of the first 5 games (the first 5 rows of the matrix),
# not of the 5 best-rated games.
Ymean[:5]

array([[7.37714084],
       [8.14358174],
       [7.13519132],
       [8.01598796],
       [8.04946373]])

### Cost Function


In [12]:
def cost_func(X, W, b, Y, R, lambda_):
    """
    Regularised squared-error cost of the collaborative filtering model.

    The predicted rating matrix is ``X @ W^T + b``.  Only entries flagged in
    ``R`` contribute to the error term; the regularisation term penalises
    ``X`` and ``W`` but not ``b``.

    X       : matrix of game features, one row per game
    W       : matrix of user parameters, one row per user
    b       : vector of user biases (created as shape (1, #users) in this notebook)
    Y       : matrix of user ratings of games (#games x #users)
    R       : 1/0 matrix, 1 if the game was rated by the user, 0 otherwise
    lambda_ : regularization parameter

    Returns the scalar cost.
    """
    predicted = tf.linalg.matmul(X, tf.transpose(W)) + b
    # Multiplying by R zeroes the error of every (game, user) pair without a rating.
    masked_error = (predicted - Y) * R

    fit_term = 0.5 * tf.reduce_sum(masked_error**2)
    penalty_term = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return fit_term + penalty_term

### Initializing the parameters 
W, X, b

In [13]:
# Rows of Y are games and columns are users, so Y.shape gives both counts.
num_games, num_users = Y.shape  # number of games, number of users
print(f"# of games: {num_games}, # of users: {num_users}")
# Length of the feature vector learned for every game and every user.
num_features = 100

# Set Initial Parameters (W, X, b), use tf.Variable so GradientTape tracks them.
# The seed is set right before the draws so the starting point is repeatable.
tf.random.set_seed(2604) # for consistent results
# W: one row of parameters per user; X: one row of features per game;
# b: a single row with one bias per user (added to every game row in cost_func).
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_games,  num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer (Adam with a learning rate of 0.1)
optimizer = keras.optimizers.Adam(learning_rate=0.1)

# of games: 554, # of users: 148308


### Training the Model
Let's train a *collaborative filtering model* to learn **X, W, b** using TensorFlow.

In [17]:
# Attention! My Laptop got hot...
# Every iteration is a full-batch step over the whole game x user matrix.
iterations = 1000
lambda_ = 1
# The loop variable is called `step`, not `iter`: a top-level `iter` would
# shadow the builtin iter() for every later cell of the notebook.
for step in tqdm(range(iterations), total=iterations):
    # Use TensorFlow's GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost).
        # The model is fitted on the mean-normalised ratings (Ynorm).
        cost_value = cost_func(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X, W, b])

    # Run one optimizer step by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # to see the process: report the loss every 10th iteration
    if step % 10 == 0:
        print(f"Training loss at iteration {step}: {cost_value:,.0f}")

  0%|          | 1/1000 [00:16<4:29:52, 16.21s/it]

Training loss at iteration 0: 6,567,081


  1%|          | 11/1000 [03:32<5:09:55, 18.80s/it]

Training loss at iteration 10: 5,144,078


  2%|▏         | 21/1000 [06:14<4:29:50, 16.54s/it]

Training loss at iteration 20: 4,113,807


  3%|▎         | 31/1000 [08:59<4:12:15, 15.62s/it]

Training loss at iteration 30: 3,326,404


  4%|▍         | 41/1000 [11:42<4:19:47, 16.25s/it]

Training loss at iteration 40: 2,727,668


  5%|▌         | 51/1000 [14:37<4:37:42, 17.56s/it]

Training loss at iteration 50: 2,266,972


  6%|▌         | 61/1000 [17:19<4:23:16, 16.82s/it]

Training loss at iteration 60: 1,907,028


  7%|▋         | 71/1000 [19:56<4:02:47, 15.68s/it]

Training loss at iteration 70: 1,622,655


  8%|▊         | 81/1000 [22:46<4:17:43, 16.83s/it]

Training loss at iteration 80: 1,395,846


  9%|▉         | 91/1000 [25:21<4:00:02, 15.84s/it]

Training loss at iteration 90: 1,213,086


 10%|█         | 101/1000 [27:57<3:48:53, 15.28s/it]

Training loss at iteration 100: 1,064,159


 11%|█         | 111/1000 [30:44<4:16:12, 17.29s/it]

Training loss at iteration 110: 941,447


 12%|█▏        | 121/1000 [33:33<4:03:29, 16.62s/it]

Training loss at iteration 120: 839,333


 13%|█▎        | 131/1000 [36:21<4:01:59, 16.71s/it]

Training loss at iteration 130: 753,668


 14%|█▍        | 141/1000 [39:18<3:59:50, 16.75s/it]

Training loss at iteration 140: 681,352


 15%|█▌        | 151/1000 [42:21<3:57:05, 16.76s/it]

Training loss at iteration 150: 620,013


 16%|█▌        | 161/1000 [45:23<4:02:32, 17.35s/it]

Training loss at iteration 160: 567,797


 17%|█▋        | 171/1000 [48:21<3:58:09, 17.24s/it]

Training loss at iteration 170: 523,224


 18%|█▊        | 181/1000 [51:12<3:38:50, 16.03s/it]

Training loss at iteration 180: 485,088


 19%|█▉        | 191/1000 [53:55<3:49:59, 17.06s/it]

Training loss at iteration 190: 452,395


 20%|██        | 201/1000 [56:44<3:27:49, 15.61s/it]

Training loss at iteration 200: 424,311


 21%|██        | 211/1000 [59:46<4:02:08, 18.41s/it]

Training loss at iteration 210: 400,131


 22%|██▏       | 221/1000 [1:02:46<3:49:06, 17.65s/it]

Training loss at iteration 220: 379,254


 23%|██▎       | 231/1000 [1:05:44<3:47:48, 17.77s/it]

Training loss at iteration 230: 361,165


 24%|██▍       | 241/1000 [1:08:34<3:35:21, 17.02s/it]

Training loss at iteration 240: 345,431


 25%|██▌       | 251/1000 [1:11:34<3:47:29, 18.22s/it]

Training loss at iteration 250: 331,683


 26%|██▌       | 261/1000 [1:14:23<3:18:34, 16.12s/it]

Training loss at iteration 260: 319,613


 27%|██▋       | 271/1000 [1:17:22<3:35:27, 17.73s/it]

Training loss at iteration 270: 308,963


 28%|██▊       | 281/1000 [1:20:17<3:35:07, 17.95s/it]

Training loss at iteration 280: 299,522


 29%|██▉       | 291/1000 [1:23:08<3:24:19, 17.29s/it]

Training loss at iteration 290: 291,110


 30%|███       | 301/1000 [1:26:11<3:33:20, 18.31s/it]

Training loss at iteration 300: 283,582


 31%|███       | 311/1000 [1:29:05<3:10:18, 16.57s/it]

Training loss at iteration 310: 276,815


 32%|███▏      | 321/1000 [1:31:58<3:19:16, 17.61s/it]

Training loss at iteration 320: 270,707


 33%|███▎      | 331/1000 [1:34:57<3:15:14, 17.51s/it]

Training loss at iteration 330: 265,173


 34%|███▍      | 341/1000 [1:37:49<3:08:45, 17.19s/it]

Training loss at iteration 340: 260,140


 35%|███▌      | 351/1000 [1:40:45<3:14:39, 18.00s/it]

Training loss at iteration 350: 255,547


 36%|███▌      | 361/1000 [1:43:54<3:24:27, 19.20s/it]

Training loss at iteration 360: 251,342


 37%|███▋      | 371/1000 [1:46:51<3:12:29, 18.36s/it]

Training loss at iteration 370: 247,479


 38%|███▊      | 381/1000 [1:49:35<2:53:46, 16.84s/it]

Training loss at iteration 380: 243,921


 39%|███▉      | 391/1000 [1:52:33<3:02:27, 17.98s/it]

Training loss at iteration 390: 240,634


 40%|████      | 401/1000 [1:55:35<3:07:13, 18.75s/it]

Training loss at iteration 400: 237,590


 41%|████      | 411/1000 [1:58:34<2:58:31, 18.19s/it]

Training loss at iteration 410: 234,765


 42%|████▏     | 421/1000 [2:01:39<2:59:39, 18.62s/it]

Training loss at iteration 420: 232,136


 43%|████▎     | 431/1000 [2:04:33<2:38:04, 16.67s/it]

Training loss at iteration 430: 229,687


 44%|████▍     | 441/1000 [2:07:28<2:47:40, 18.00s/it]

Training loss at iteration 440: 227,395


 45%|████▌     | 451/1000 [2:10:25<2:46:29, 18.20s/it]

Training loss at iteration 450: 225,251


 46%|████▌     | 461/1000 [2:13:30<2:47:53, 18.69s/it]

Training loss at iteration 460: 223,241


 47%|████▋     | 471/1000 [2:16:23<2:40:58, 18.26s/it]

Training loss at iteration 470: 221,353


 48%|████▊     | 481/1000 [2:19:31<2:47:27, 19.36s/it]

Training loss at iteration 480: 219,574


 49%|████▉     | 491/1000 [2:22:27<2:26:56, 17.32s/it]

Training loss at iteration 490: 217,900


 50%|█████     | 501/1000 [2:25:21<2:28:46, 17.89s/it]

Training loss at iteration 500: 216,324


 51%|█████     | 511/1000 [2:28:16<2:23:03, 17.55s/it]

Training loss at iteration 510: 214,840


 52%|█████▏    | 521/1000 [2:31:07<2:18:17, 17.32s/it]

Training loss at iteration 520: 213,424


 53%|█████▎    | 531/1000 [2:34:05<2:18:21, 17.70s/it]

Training loss at iteration 530: 212,088


 54%|█████▍    | 541/1000 [2:36:55<2:07:09, 16.62s/it]

Training loss at iteration 540: 210,816


 55%|█████▌    | 551/1000 [2:39:51<2:16:44, 18.27s/it]

Training loss at iteration 550: 209,613


 56%|█████▌    | 561/1000 [2:43:12<2:27:55, 20.22s/it]

Training loss at iteration 560: 208,456


 57%|█████▋    | 571/1000 [2:46:16<2:09:45, 18.15s/it]

Training loss at iteration 570: 207,373


 58%|█████▊    | 581/1000 [2:49:19<2:09:49, 18.59s/it]

Training loss at iteration 580: 206,333


 59%|█████▉    | 591/1000 [2:52:20<2:07:32, 18.71s/it]

Training loss at iteration 590: 205,349


 60%|██████    | 601/1000 [2:55:31<2:04:26, 18.71s/it]

Training loss at iteration 600: 204,416


 61%|██████    | 611/1000 [2:58:36<1:59:11, 18.39s/it]

Training loss at iteration 610: 203,529


 62%|██████▏   | 621/1000 [3:01:42<1:54:25, 18.11s/it]

Training loss at iteration 620: 202,656


 63%|██████▎   | 631/1000 [3:04:41<1:50:16, 17.93s/it]

Training loss at iteration 630: 201,826


 64%|██████▍   | 641/1000 [3:07:31<1:42:19, 17.10s/it]

Training loss at iteration 640: 201,045


 65%|██████▌   | 651/1000 [3:10:27<1:46:35, 18.33s/it]

Training loss at iteration 650: 200,297


 66%|██████▌   | 661/1000 [3:13:32<1:42:53, 18.21s/it]

Training loss at iteration 660: 199,572


 67%|██████▋   | 671/1000 [3:16:18<1:27:13, 15.91s/it]

Training loss at iteration 670: 198,888


 68%|██████▊   | 681/1000 [3:19:23<1:36:45, 18.20s/it]

Training loss at iteration 680: 198,235


 69%|██████▉   | 691/1000 [3:22:28<1:35:59, 18.64s/it]

Training loss at iteration 690: 197,594


 70%|███████   | 701/1000 [3:25:26<1:26:04, 17.27s/it]

Training loss at iteration 700: 196,981


 71%|███████   | 711/1000 [3:28:23<1:28:25, 18.36s/it]

Training loss at iteration 710: 196,428


 72%|███████▏  | 721/1000 [3:31:23<1:24:52, 18.25s/it]

Training loss at iteration 720: 195,839


 73%|███████▎  | 731/1000 [3:34:26<1:20:45, 18.01s/it]

Training loss at iteration 730: 195,294


 74%|███████▍  | 741/1000 [3:37:22<1:16:14, 17.66s/it]

Training loss at iteration 740: 194,777


 75%|███████▌  | 751/1000 [3:40:20<1:15:13, 18.13s/it]

Training loss at iteration 750: 194,255


 76%|███████▌  | 761/1000 [3:43:31<1:13:45, 18.52s/it]

Training loss at iteration 760: 193,777


 77%|███████▋  | 771/1000 [3:46:50<1:15:56, 19.90s/it]

Training loss at iteration 770: 193,343


 78%|███████▊  | 781/1000 [3:49:54<1:08:59, 18.90s/it]

Training loss at iteration 780: 192,918


 79%|███████▉  | 791/1000 [3:53:03<1:06:01, 18.95s/it]

Training loss at iteration 790: 192,469


 80%|████████  | 801/1000 [3:56:12<1:02:36, 18.88s/it]

Training loss at iteration 800: 192,036


 81%|████████  | 811/1000 [3:59:14<57:09, 18.14s/it]  

Training loss at iteration 810: 191,645


 82%|████████▏ | 821/1000 [4:01:55<45:16, 15.18s/it]

Training loss at iteration 820: 191,261


 83%|████████▎ | 831/1000 [4:04:44<46:12, 16.41s/it]

Training loss at iteration 830: 190,917


 84%|████████▍ | 841/1000 [4:07:44<48:44, 18.39s/it]

Training loss at iteration 840: 190,539


 85%|████████▌ | 851/1000 [4:10:40<42:42, 17.20s/it]

Training loss at iteration 850: 190,193


 86%|████████▌ | 861/1000 [4:13:41<43:21, 18.72s/it]

Training loss at iteration 860: 189,835


 87%|████████▋ | 871/1000 [4:16:41<39:39, 18.45s/it]

Training loss at iteration 870: 189,540


 88%|████████▊ | 881/1000 [4:19:58<37:51, 19.09s/it]

Training loss at iteration 880: 189,240


 89%|████████▉ | 891/1000 [4:22:46<30:06, 16.58s/it]

Training loss at iteration 890: 188,919


 90%|█████████ | 901/1000 [4:25:42<29:12, 17.70s/it]

Training loss at iteration 900: 188,651


 91%|█████████ | 911/1000 [4:28:29<25:59, 17.53s/it]

Training loss at iteration 910: 188,317


 92%|█████████▏| 921/1000 [4:31:26<23:47, 18.07s/it]

Training loss at iteration 920: 188,065


 93%|█████████▎| 931/1000 [4:34:20<19:34, 17.02s/it]

Training loss at iteration 930: 187,835


 94%|█████████▍| 941/1000 [4:37:17<17:49, 18.12s/it]

Training loss at iteration 940: 187,561


 95%|█████████▌| 951/1000 [4:40:28<16:06, 19.72s/it]

Training loss at iteration 950: 187,331


 96%|█████████▌| 961/1000 [4:43:33<11:43, 18.04s/it]

Training loss at iteration 960: 187,070


 97%|█████████▋| 971/1000 [4:46:36<08:48, 18.24s/it]

Training loss at iteration 970: 186,859


 98%|█████████▊| 981/1000 [4:49:30<05:16, 16.65s/it]

Training loss at iteration 980: 186,586


 99%|█████████▉| 991/1000 [4:52:28<02:42, 18.09s/it]

Training loss at iteration 990: 186,428


100%|██████████| 1000/1000 [4:55:14<00:00, 17.71s/it]


### Make predictions
Now after learning **X, W, b**, we are able to make predictions

In [18]:
# Predicted (mean-normalised) rating of every game by every user, computed
# from the trained X, W and b exactly as inside the cost function.
p = X.numpy() @ W.numpy().T + b.numpy()
# Add the per-game mean back to return to the original rating scale.
pm = p + Ymean  # prediction matrix
pm.shape

(554, 148308)

#### Sandbox
This section is just for practice: predictions for one particular user, together with the list of games that user has already rated.

In [19]:
def make_recommendations(prediction_matrix, df, user, number_of_recommendations=9, names=None):
    """
    Recommendations for a given user.

    Prints the titles of the games the user has already rated (ordered by
    predicted rating, highest first) and returns the best-predicted games the
    user has not rated yet.

    Parameters
    ----------
    prediction_matrix : ndarray, shape (#games, #users)
        Predicted ratings laid out like ``df.pivot(index='gameid',
        columns='user', values='rating')``.
    df : pandas.DataFrame
        Flat ratings table with 'gameid', 'user' and 'rating' columns.  The
        rated / not-rated split is derived from this table, not from any
        notebook-level variable.
    user : str
        User name as it appears in ``df['user']``.
    number_of_recommendations : int, default 9
        Maximum number of not-yet-rated games to return.
    names : pandas.DataFrame, optional
        Table with 'gameid' and 'title' columns.  Defaults to the
        notebook-level ``dfnames``.

    Returns
    -------
    pandas.DataFrame
        Columns 'rating' (predicted), 'gameid' and 'title', best prediction
        first.  The index is the row label of the matching game in ``names``.

    Raises
    ------
    KeyError
        If ``user`` has no rating in ``df``.
    ValueError
        If ``prediction_matrix`` does not have the shape of the pivoted ``df``.
    """
    if names is None:
        names = dfnames  # notebook-level gameid -> title lookup table

    pivot_df = df.pivot(index='gameid', columns='user', values='rating')
    scores = np.asarray(prediction_matrix)
    if scores.shape != pivot_df.shape:
        raise ValueError(
            f"prediction_matrix has shape {scores.shape}, expected {pivot_df.shape}"
        )
    if user not in pivot_df.columns:
        raise KeyError(user)

    # Only this user's column is needed: predicted ratings plus a flag
    # telling which games the user actually rated.
    user_position = pivot_df.columns.get_loc(user)
    predicted = pd.Series(scores[:, user_position], index=pivot_df.index, name='rating')
    was_rated = pivot_df[user].notna().to_numpy()

    top_not_rated = (
        predicted[~was_rated]
        .sort_values(ascending=False)
        .iloc[:number_of_recommendations]
    )
    rated = predicted[was_rated].sort_values(ascending=False)

    # combine with names (a left merge keeps games that have no title entry)
    user_df_notrated = pd.merge(top_not_rated.to_frame(), names, left_index=True, right_on='gameid', how='left')
    user_df_notrated = user_df_notrated[['rating', 'gameid', 'title']]

    # add rated games
    user_df_rated = pd.merge(rated.to_frame(), names, left_index=True, right_on='gameid', how='left')
    user_df_rated = user_df_rated[['rating', 'gameid', 'title']]

    print("Rated games:")
    print(user_df_rated.title)
    print("And now recommendations:")
    return user_df_notrated

In [20]:
# Example for user 'axci': the call prints the games this user already rated,
# and the returned frame (displayed below) holds the recommendations.
axci = make_recommendations(pm, df, 'axci')
axci

Rated games:
2                                         Gloomhaven
90                                        Frosthaven
6                       Gloomhaven: Jaws of the Lion
0                                  Brass: Birmingham
193                                               Go
12     Through the Ages: A New Story of Civilization
10                                     Spirit Island
1                          Pandemic Legacy: Season 1
33                                           Barrage
30                                          Everdell
32                     Viticulture Essential Edition
14                               Great Western Trail
5                                  Terraforming Mars
7                                     Dune: Imperium
28                               Lost Ruins of Arnak
119                                        Codenames
69                                              Azul
128                                    Cartographers
97                               

Unnamed: 0,rating,gameid,title
631,10.115838,242705,Aeon Trespass: Odyssey
272,9.497493,251661,Oathsworn: Into the Deepwood
49,9.216685,314040,Pandemic Legacy: Season 0
88,9.203654,180263,The 7th Continent
389,9.201968,240980,Blood on the Clocktower
407,9.125516,170771,Sword & Sorcery
36,9.1139,175914,Food Chain Magnate
136,9.112481,271055,Dwellings of Eldervale
456,9.095228,169427,Middara: Unintentional Malum – Act 1


#### Function

In [21]:
# function that will create a flat dataframe with user - recommended gameid
def create_recommended_games(prediction_matrix, df, number_of_recommendations=10,
                             output_path='csv/rec_games.csv'):
    """
    Build the top recommendations for every user.

    Parameters
    ----------
    prediction_matrix : ndarray, shape (#games, #users)
        Our trained matrix of predicted ratings, laid out like
        ``df.pivot(index='gameid', columns='user', values='rating')``.
    df : pandas.DataFrame
        Dataframe from the csv file ratings_reduced.csv (gameid-user-rating).
    number_of_recommendations : int, default 10
        Maximum number of not-yet-rated games kept per user.
    output_path : str or None, default 'csv/rec_games.csv'
        Where the gameid/user pairs are saved as CSV.  Pass None to skip
        writing the file.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'gameid' with columns 'prediction_rating', 'rated_bool'
        (always 0: only unrated games are kept) and 'user'.  Users appear in
        column order of the pivot table; within a user the best prediction
        comes first.

    Raises
    ------
    ValueError
        If ``prediction_matrix`` does not have the shape of the pivoted ``df``.
    """
    pivot_df = df.pivot(index='gameid', columns='user', values='rating')  # make a pivot table from the flat table
    already_rated = pivot_df.notna().to_numpy()  # True if a user rated a game
    scores = np.asarray(prediction_matrix, dtype=float)
    if scores.shape != already_rated.shape:
        raise ValueError(
            f"prediction_matrix has shape {scores.shape}, expected {already_rated.shape}"
        )

    game_ids = pivot_df.index.to_numpy()
    users = pivot_df.columns.to_numpy()
    n_users = len(users)

    # Collect plain arrays per user and build the frame once at the end;
    # concatenating DataFrames inside the loop is quadratic in the number of users.
    top_games, top_scores, top_counts = [], [], []
    for position in range(n_users):
        candidates = np.flatnonzero(~already_rated[:, position])  # games not rated by this user
        ranking = np.argsort(-scores[candidates, position], kind='stable')  # best prediction first
        best = candidates[ranking[:number_of_recommendations]]
        top_games.append(game_ids[best])
        top_scores.append(scores[best, position])
        top_counts.append(len(best))
        if (position + 1) % 10000 == 0:
            print(f"{position + 1} out of {n_users} added")

    if top_games:
        predictions = pd.DataFrame(
            {
                'prediction_rating': np.concatenate(top_scores),
                'rated_bool': 0,
                'user': np.repeat(users, top_counts),
            },
            index=pd.Index(np.concatenate(top_games), name='gameid'),
        )
    else:
        # no users at all: keep the documented columns and index name
        predictions = pd.DataFrame(columns=['prediction_rating', 'rated_bool', 'user'],
                                   index=pd.Index([], name='gameid'))

    if output_path is not None:
        predictions['user'].to_csv(output_path)  # save to csv
    return predictions

In [22]:
# Run the function to make predictions for every user.
# Progress is printed every 10000 users, and the function itself also
# writes the result to 'csv/rec_games.csv'.
predictions = create_recommended_games(pm, df)


10000 out of 148308 added
20000 out of 148308 added
30000 out of 148308 added
40000 out of 148308 added
50000 out of 148308 added
60000 out of 148308 added
70000 out of 148308 added
80000 out of 148308 added
90000 out of 148308 added
100000 out of 148308 added
110000 out of 148308 added
120000 out of 148308 added
130000 out of 148308 added
140000 out of 148308 added


In [23]:
# Save the gameid index and the 'user' column to csv. This repeats the write
# that create_recommended_games already performs on the same path.
predictions['user'].to_csv('csv/rec_games.csv')

In [5]:
# Read the saved recommendations back from disk to check the file's
# row count and layout (gameid, user).
pr_user = pd.read_csv('csv/rec_games.csv')
print(len(pr_user))
pr_user.head(2)
# Optional: write only the first 1000 rows to a smaller file for quick tests.
#pr_user[:1000].to_csv('csv/rec_games_reduced.csv', index=False)  # just for testing

1483072


Unnamed: 0,gameid,user
0,295770,-=Yod@=-
1,251661,-=Yod@=-


In [93]:
# The final output, 'rec_games.csv', will be loaded into the User model of my Django project.