# Game Prediction

# 1. Data preprocessing

1.1 libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from numpy import hstack
from functools import reduce
from numpy import array
from itertools import chain
from sklearn.naive_bayes import GaussianNB
# import turicreate as tc

1.2 Reading train and test Data

In [2]:
# Load the training split; the rendered table shows the columns id,
# historical_games (space-separated game ids) and next_game.
df=pd.read_csv('train.csv')
df

Unnamed: 0,id,historical_games,next_game
0,2,3 12 262 6094 283 50 1070 233,131
1,4,294 241 1 150 12,101
2,7,85 139 144 57 2013,330
3,10,7 114 10 5 31 6504,19
4,18,5 221 3 712 159 4 810 94 746 6170 136 17 1160 ...,247
...,...,...,...
30583,91402,4 56 8 1078 220 1573 484 838 133 129 51 519 10...,191
30584,91412,24 2186 112 4 145 8 68 1638,341
30585,91415,3 164 6277 2648 846,36
30586,91418,259 2839 986 49 354 367 425 106 614 1089 2886 ...,12


In [3]:
# Load the test split; per the rendered table it has id and historical_games
# but no next_game column.
df_test=pd.read_csv('test.csv')
df_test

Unnamed: 0,id,historical_games
0,5,46 37 3 118 3135 683 213 298
1,13,1528 537 287 2023 93 417 4442 1313 245 6234 94...
2,20,207 5 19 332 111
3,25,1412 18 18 3050 728 5 616
4,32,16 64 1 294 449
...,...,...
13068,91410,1 2703 16 11 243
13069,91411,47 173 212 801 23 197
13070,91419,570 349 68 170 229 3201 350 394 1510 25 25
13071,91421,101 41 63 1767 1313 77


1.3 Setting The Parameters

In [4]:
def params(df, maxlen=29):
    """Encode the ``historical_games`` column as a fixed-width padded matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``id`` column and a ``historical_games`` column whose
        values are space-separated game ids.
    maxlen : int, default 29
        Number of columns of the returned matrix, forwarded to
        ``pad_sequences``. Shorter histories are padded on the left
        (``padding='pre'``).

    Returns
    -------
    games : numpy.ndarray
        Unique game-id tokens found in ``historical_games``, as returned by
        ``np.unique`` over the split strings.
    users : pandas.Series
        The ``id`` column of ``df``.
    X : numpy.ndarray
        Output of ``pad_sequences`` with one row per row of ``df``.
    """
    users = df['id']
    # Iterate over the values, not over labels 0..n-1, so a frame whose index
    # is not a default RangeIndex (filtered / concatenated) does not KeyError.
    sequences = [history.split(" ") for history in df['historical_games'].tolist()]
    games = np.unique(list(chain.from_iterable(sequences)))
    X = pad_sequences(sequences, padding='pre', maxlen=maxlen)

    return games, users, X

# 2. RNN (Recurrent Neural Network)

2.1 libraries

In [5]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt

In [7]:
# from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.layers import Attention
from keras.backend import clear_session

In [8]:
# Reset the Keras backend session before building a new model.
clear_session()

In [9]:
# Encode the train and test histories with params(); the training target is
# the next_game column of the training frame.
games, ids_train, X_train = params(df)
test_games, ids_test, X_test = params(df_test)
Y_train = df['next_game']

In [10]:
# Add a trailing feature axis so every sample has shape (timesteps, 1).
# Both dimensions are read from the arrays instead of hard-coding the number
# of test rows (13073) and the sequence length (29); re-running the cell on
# already reshaped arrays is a no-op.
X_train = array(X_train)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = array(X_test)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [13]:
def RNN_LSTM_model(X_train, Y_train, shape, units, n_classes=7737,
                   epochs=10, batch_size=64, dropout=0.2):
    """Build, compile and fit a bidirectional-LSTM classifier.

    The network is ``Bidirectional(LSTM(units))`` -> ``Dropout`` ->
    ``Dense(n_classes, softmax)``, compiled with the Adam optimizer and
    sparse categorical cross-entropy.

    Parameters
    ----------
    X_train : array-like
        Training sequences; each sample must match ``shape``.
    Y_train : array-like
        Training targets, passed unchanged to ``model.fit``.
        # assumes integer class ids smaller than n_classes - TODO confirm
    shape : tuple
        ``input_shape`` of the first layer, e.g. ``(timesteps, 1)``.
    units : int
        Number of LSTM units per direction.
    n_classes : int, default 7737
        Size of the softmax output layer (previously hard-coded).
    epochs : int, default 10
    batch_size : int, default 64
    dropout : float, default 0.2
        Rate of the dropout layer placed before the output layer.

    Returns
    -------
    keras.models.Sequential
        The fitted model.
    """
    # NOTE(review): the caller feeds raw game ids as a single numeric feature;
    # an Embedding layer may be a better fit - confirm with the author.
    model = keras.models.Sequential()
    model.add(Bidirectional(LSTM(units), input_shape=shape))
    model.add(Dropout(dropout))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss="sparse_categorical_crossentropy")
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

    return model

In [14]:
# Train the bidirectional LSTM with 128 units on inputs of shape (timesteps, 1).
model = RNN_LSTM_model(X_train, Y_train, (X_train.shape[1], 1), 128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Score every test sequence with the fitted model.
prediction = model.predict(X_test)

In [19]:
# Five highest-scoring class indices per test row, best first.
# A stable argsort on the negated scores replaces the earlier
# "sort, then list.index(value)" lookup, which returned the same index several
# times whenever two classes had exactly the same score and rescanned each
# row in pure Python.
top_k = 5
results = [
    np.argsort(-np.asarray(row), kind='stable')[:top_k].tolist()
    for row in prediction
]

In [20]:
# One space-separated string of predicted indices per test row.
final_results = [' '.join(str(idx) for idx in row) for row in results]

In [21]:
# Wrap the test ids returned by params() in a one-column frame so they can be
# concatenated with the prediction strings.
ids_test = pd.DataFrame(ids_test , columns = ['id'])
ids_test.head()

Unnamed: 0,id
0,5
1,13
2,20
3,25
4,32


In [22]:
# Put ids and prediction strings side by side and save them without the index column.
rnn_predictions = pd.DataFrame(final_results, columns=['next_games'])
rnn_submission = pd.concat([ids_test, rnn_predictions], axis=1)
rnn_submission.to_csv(r'RNN_prediction.csv', index=False)

# 3. Recommender system

3.1 Data Preprocessing

In [9]:
def cf_params(df, maxlen=30):
    """Encode each user's history followed by ``next_game`` as a padded matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id``, ``historical_games`` (space-separated game ids) and
        ``next_game`` columns.
    maxlen : int, default 30
        Number of columns of the returned matrix, forwarded to
        ``pad_sequences``. Shorter sequences are padded on the left
        (``padding='pre'``).

    Returns
    -------
    games : numpy.ndarray
        Unique game-id tokens over histories and next games, as returned by
        ``np.unique``.
    users : pandas.Series
        The ``id`` column of ``df``.
    X : numpy.ndarray
        Output of ``pad_sequences`` with one row per row of ``df``.
    """
    users = df['id']
    # Pair the columns by position rather than by label so a frame whose index
    # is not a default RangeIndex does not raise KeyError.
    sequences = [
        (history + " " + str(next_game)).split(" ")
        for history, next_game in zip(df['historical_games'].tolist(),
                                      df['next_game'].tolist())
    ]
    games = np.unique(list(chain.from_iterable(sequences)))
    X = pad_sequences(sequences, padding='pre', maxlen=maxlen)

    return games, users, X

In [10]:
def cf_data_preprocessing(df, df_test):
    """Merge the train and test splits into one vocabulary / id list / matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame, encoded with ``cf_params`` (history + next_game).
    df_test : pandas.DataFrame
        Test frame, encoded with ``params`` (history only).

    Returns
    -------
    total_games : numpy.ndarray
        Unique game tokens over both splits.
    ids : numpy.ndarray
        Train ids followed by test ids.
    users : numpy.ndarray
        Train rows followed by test rows of the padded game matrices.
    """
    games, ids, users = cf_params(df)
    test_games, ids_test, test_users = params(df_test)
    total_games = np.unique(np.concatenate((games, test_games)))
    ids = np.concatenate((ids, ids_test))

    # cf_params pads to 30 columns while params pads to 29, so stacking the
    # two matrices directly raises a shape-mismatch ValueError. Left-pad the
    # narrower one with zeros (the padding value) to a common width first.
    width = max(users.shape[1], test_users.shape[1])
    users, test_users = (
        np.pad(block, ((0, 0), (width - block.shape[1], 0)), mode='constant')
        for block in (users, test_users)
    )
    users = np.concatenate((users, test_users))

    return total_games, ids, users


In [11]:
def data_prepare(games, users):
    """Build a binary game-by-user interaction matrix.

    Parameters
    ----------
    games : sequence of str
        Game-id tokens; the position of a token is its row in the result.
    users : sequence of sequences of int
        One row of game ids per user; the value 0 is treated as padding and
        skipped.

    Returns
    -------
    games_dict : dict
        Maps row index -> game token.
    data : numpy.ndarray
        Matrix of shape ``(len(games), len(users))`` holding 1 where the user
        has the game and 0 elsewhere.

    Raises
    ------
    KeyError
        If a user row contains a game that is not in ``games``.
    """
    row_of_game = {game: row for row, game in enumerate(games)}

    # Allocate the matrix in one step instead of materialising a Python list
    # of lists first; this also keeps the 2-D shape when there are no users.
    data = np.zeros((len(users), len(games)), dtype=int)
    for user_row, history in zip(data, users):
        for game in history:
            if game != 0:
                # The vocabulary holds string tokens, the histories hold ints.
                user_row[row_of_game[str(game)]] = 1

    games_dict = dict(enumerate(games))

    return games_dict, np.transpose(data)

3.2 ALS implicit Algorithm

In [19]:
import implicit
from scipy.sparse import coo_matrix

In [None]:
# Combine train (history + next_game) and test histories, then build the
# binary matrix; data_prepare returns it transposed, i.e. games x users.
games, ids, users = cf_data_preprocessing(df, df_test)
games_dict ,data = data_prepare(games, users)

In [16]:
# Label the columns with the user ids. Note that this rebinds `data` from an
# ndarray to a DataFrame.
data = pd.DataFrame(data, columns=ids)
data.head()

Unnamed: 0,2,4,7,10,18,21,22,24,26,28,29,30,33,41,42,45,50,52,54,55,60,61,63,64,70,74,87,96,99,100,108,110,112,116,121,128,142,143,144,145,...,91201,91205,91206,91230,91232,91239,91241,91248,91250,91277,91280,91283,91287,91288,91289,91290,91303,91313,91317,91321,91329,91330,91337,91344,91350,91358,91359,91360,91368,91373,91379,91380,91387,91388,91391,91410,91411,91419,91421,91423
0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
# ALS model from the implicit library: 128 factors, regularization 0.05, 15 iterations.
model = implicit.als.AlternatingLeastSquares(factors=128, regularization=0.05, iterations=15)

In [36]:
# Fit on the sparse form of `data`, whose rows are games and columns are users.
# NOTE(review): the orientation fit() expects (item-user vs user-item) depends
# on the installed implicit version - confirm.
model.fit(coo_matrix(data))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [49]:
# Re-encode the test histories and build their games x users matrix.
# Assumes `games` is still the combined vocabulary returned by
# cf_data_preprocessing - verify the cell execution order.
test_games, ids_test, test_users = params(df_test)
test_games_dict ,test_data = data_prepare(games, test_users)
test_data = pd.DataFrame(test_data, columns=ids_test)

In [50]:
# Top-5 recommendations for every row of the test users x games matrix
# (the transpose of test_data), converted to a list of lists.
# NOTE(review): the model was fitted on train + test users, so row i of this
# test-only matrix may not line up with the i-th fitted user factor - confirm
# how recommend_all maps rows to users for the installed implicit version.
results = model.recommend_all(coo_matrix(test_data).T.tocsr(),N=5,show_progress=True)
results = pd.DataFrame(results).values.tolist()

HBox(children=(FloatProgress(value=0.0, max=13073.0), HTML(value='')))




In [51]:
# Translate each recommended matrix row index back to its game id, in place.
for recs in results:
    for pos in range(5):
        recs[pos] = games_dict[recs[pos]]

In [53]:
# One space-separated string of recommended game ids per test user.
final_results = [' '.join(map(str, recs)) for recs in results]

In [56]:
# Run params() on the test set again to get the ids, then wrap them in a
# one-column frame for the submission file.
test_games, ids_test, test_users = params(df_test)

ids_test = pd.DataFrame(ids_test , columns = ['id'])
ids_test.head()

Unnamed: 0,id
0,5
1,13
2,20
3,25
4,32


In [57]:
# Put ids and recommendation strings side by side and save them without the index column.
als_predictions = pd.DataFrame(final_results, columns=['next_games'])
als_submission = pd.concat([ids_test, als_predictions], axis=1)
als_submission.to_csv(r'prediction.csv', index=False)

# 3.3 Cosine similarity

In [81]:
import turicreate as tc

In [53]:
# Reload both splits so this section starts from the raw files.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Append each user's next_game to the history string, then drop the label column.
df_train['historical_games'] = [
    history + ' ' + str(next_game)
    for history, next_game in zip(df_train['historical_games'], df_train['next_game'])
]
df_train = df_train.drop(['next_game'], axis=1)
df = pd.concat([df_train, df_test])

# Long format: one (user_id, game_id) row per game in a user's history.
# iloc[:, 0:2] picks the first two columns of the combined frame.
pairs = []
for user, history in df.iloc[:, 0:2].values:
    for game in history.split():
        pairs.append([user, game])
df = pd.DataFrame(pairs, columns=['user_id', 'game_id'])
df['downloaded'] = 1
df.head()

Unnamed: 0,user_id,game_id,downloaded
0,2,3,1
1,2,12,1
2,2,262,1
3,2,6094,1
4,2,283,1


In [54]:
# Wide user x game view of the interactions; cells with no interaction are NaN.
new_df = df.pivot_table(index='user_id', columns='game_id', values='downloaded')
new_df.head()

game_id,1,10,100,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,101,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,102,1020,1021,1022,1023,1024,1025,1026,1027,1028,1029,103,1030,1031,1032,1033,...,963,964,965,966,967,968,969,97,970,971,972,973,974,975,976,977,978,979,98,980,981,982,983,984,985,986,987,988,989,99,990,991,992,993,994,995,996,997,998,999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1.0,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [17]:
# Split the long interaction frame by which split each user id belongs to.
is_test_user = df['user_id'].isin(df_test['id'].values)
is_train_user = df['user_id'].isin(df_train['id'].values)
new_df_test = df[is_test_user]
new_df_train = df[is_train_user]

In [18]:
# Display the interactions that belong to training users.
new_df_train

Unnamed: 0,user_id,game_id,downloaded
0,2,3,1
1,2,12,1
2,2,262,1
3,2,6094,1
4,2,283,1
...,...,...,...
323209,91420,256,1
323210,91420,242,1
323211,91420,410,1
323212,91420,350,1


In [13]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    """Train a turicreate recommender and return it with its recommendations.

    Parameters
    ----------
    train_data : turicreate.SFrame
        Interaction data passed to the recommender's ``create``.
    name : {'popularity', 'cosine', 'pearson'}
        ``'popularity'`` builds a popularity recommender; ``'cosine'`` and
        ``'pearson'`` build an item-similarity recommender with that
        ``similarity_type``.
    user_id, item_id, target : str
        Column names forwarded to ``create``.
    users_to_recommend : list
        Users to produce recommendations for.
    n_rec : int
        Number of recommendations per user (``k``).
    n_display : int
        Number of recommendation rows to print.

    Returns
    -------
    tuple
        ``(recommender, pandas.DataFrame built from the recommendations)``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names. Previously this
        surfaced as an UnboundLocalError at the ``recommend`` call.
    """
    if name == 'popularity':
        recommender = tc.popularity_recommender.create(train_data,
                                                       user_id=user_id,
                                                       item_id=item_id,
                                                       target=target)
    elif name in ('cosine', 'pearson'):
        # Both variants differ only in the similarity type, which equals `name`.
        recommender = tc.item_similarity_recommender.create(train_data,
                                                            user_id=user_id,
                                                            item_id=item_id,
                                                            target=target,
                                                            similarity_type=name)
    else:
        raise ValueError(
            f"unsupported recommender name {name!r}; "
            "expected 'popularity', 'cosine' or 'pearson'"
        )

    recom = recommender.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return recommender, pd.DataFrame(recom)

In [18]:
# Item-similarity recommender (cosine) on the user_id / game_id / downloaded
# interactions, then the top 5 games for every test id.
cos_model = tc.item_similarity_recommender.create(tc.SFrame(df), user_id='user_id', item_id='game_id', target='downloaded', similarity_type='cosine')
cose_recom = cos_model.recommend(users=list(df_test['id']), k=5)

In [19]:
# Build one space-separated recommendation string per test user.
# Grouping once replaces the previous boolean filter over the whole frame
# for every single test id (a full scan per user).
cos_df = pd.DataFrame(cose_recom)
recs_by_user = cos_df.groupby('user_id', sort=False)['game_id'].agg(' '.join).to_dict()
# Users without any recommendation get an empty string, as before.
prediction = [recs_by_user.get(user_id, '') for user_id in df_test['id'].values.tolist()]

In [20]:
# Reload the test ids, drop the history column and attach the predictions
# as the next_games column.
df_test = pd.read_csv('test.csv')
df_test.drop(['historical_games'], axis=1, inplace=True)
df_test['next_games'] = prediction
df_test.head()

Unnamed: 0,id,next_games
0,5,50 283 6094 262 12
1,13,283 6094 262 12 3
2,20,283 6094 262 12 3
3,25,283 6094 262 12 3
4,32,283 6094 262 12 3


In [22]:
# Save without the index so the file has only the id and next_games columns,
# matching the other submission files written in this notebook.
# NOTE(review): this reuses the file name written by the ALS section and
# overwrites that result - confirm this is intended.
df_test.to_csv('prediction.csv', index=False)

3.3 Ranking Factorization

In [15]:
# Ranking-factorization recommender on the user/game interactions
# (no target column is passed here).
m = tc.ranking_factorization_recommender.create(tc.SFrame(df),
                                    user_id='user_id',
                                    item_id='game_id')

In [18]:
# Top 5 games for every test id.
results = m.recommend(users=list(df_test['id']), k=5)

In [20]:
# Build one space-separated recommendation string per test user.
# Grouping once replaces the previous boolean filter over the whole frame
# for every single test id (a full scan per user).
cos_df = pd.DataFrame(results)
recs_by_user = cos_df.groupby('user_id', sort=False)['game_id'].agg(' '.join).to_dict()
# Users without any recommendation get an empty string, as before.
prediction = [recs_by_user.get(user_id, '') for user_id in df_test['id'].values.tolist()]

In [21]:
# Reload the test ids, drop the history column and attach the predictions
# as the next_games column.
df_test = pd.read_csv('test.csv')
df_test.drop(['historical_games'], axis=1, inplace=True)
df_test['next_games'] = prediction
df_test.head()

Unnamed: 0,id,next_games
0,5,100 78 352 210 377
1,13,174 502 107 440 529
2,20,67 2 24 1 400
3,25,103 155 91 117 2
4,32,137 121 176 280 105


In [22]:
# Save without the index so the file has only the id and next_games columns,
# matching the other submission files written in this notebook.
df_test.to_csv('ranking_factorization_prediction.csv', index=False)

3.4 Implicit recommender

In [25]:
# Let turicreate's generic recommender.create build a model from the interactions.
# NOTE: binding `model` here replaces the model() helper function defined
# earlier in this notebook.
model = tc.recommender.create(tc.SFrame(df), 'user_id', 'game_id', target='downloaded')

In [26]:
# Top 5 games for every test id.
results = model.recommend(users=list(df_test['id']), k=5)

In [27]:
# Build one space-separated recommendation string per test user.
# Grouping once replaces the previous boolean filter over the whole frame
# for every single test id (a full scan per user).
cos_df = pd.DataFrame(results)
recs_by_user = cos_df.groupby('user_id', sort=False)['game_id'].agg(' '.join).to_dict()
# Users without any recommendation get an empty string, as before.
prediction = [recs_by_user.get(user_id, '') for user_id in df_test['id'].values.tolist()]

In [28]:
# Reload the test ids, drop the history column and attach the predictions
# as the next_games column.
df_test = pd.read_csv('test.csv')
df_test.drop(['historical_games'], axis=1, inplace=True)
df_test['next_games'] = prediction
df_test.head()

Unnamed: 0,id,next_games
0,5,18 14 1 79 5
1,13,124 57 159 65 172
2,20,9 13 10 7 1
3,25,46 51 3 79 40
4,32,11 9 20 13 79


In [29]:
# Write id and next_games only; the index column is left out.
df_test.to_csv('implicit_recommender_prediction.csv', index=False)