# Movie Rec - MF

## Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse.linalg import svds

## Data

In [2]:
raw_data = pd.read_csv('/data/ephemeral/input/data/train/train_ratings.csv')

In [3]:
raw_data

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


## Matrix Factorization

In [4]:
user_item_matrix = raw_data.groupby(['user', 'item']).size().unstack().fillna(0)
user_item_matrix = user_item_matrix.astype(int)
user_item_matrix

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138473,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138486,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# to save the user_item_matrix

# user_item_matrix.to_csv('user_item_matrix.csv')

In [6]:
from sklearn.preprocessing import StandardScaler

latent_factors = 50

user_item_matrix_numpy = user_item_matrix.to_numpy()

scaler = StandardScaler()
user_item_matrix_numpy = scaler.fit_transform(user_item_matrix)

# Singular value decomposition
U, sigma, Vt = svds(user_item_matrix_numpy, k=latent_factors)

# Diagonalize sigma
sigma = np.diag(sigma)

In [7]:
# Make predictions
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + scaler.mean_

In [8]:
pred_df = pd.DataFrame(predicted_ratings, columns=user_item_matrix.columns, index=user_item_matrix.index)

top10

In [9]:
pred_df

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0.866657,2.151067,0.120304,-0.000208,0.285617,-0.103270,0.158639,0.822199,0.443725,0.249844,...,-0.218355,0.150798,0.211603,0.117677,-0.179458,0.134999,0.311044,-0.060834,-0.112773,-0.196581
14,1.043958,0.166558,0.316365,-0.496615,0.705087,-0.567975,1.263096,0.317165,-0.227764,-0.121803,...,-0.207594,-0.057971,-0.070872,-0.122780,-0.122166,-0.123684,-0.296849,0.194493,-0.029441,-0.125301
18,-0.161020,-0.212582,-0.125540,-0.006374,-0.137211,-0.340430,-0.153522,-0.066072,0.070228,-0.227471,...,0.050386,0.049299,-0.084035,0.050993,0.079178,-0.020585,0.008343,-0.067704,-0.048392,0.136856
25,0.603079,0.193966,0.129598,0.058535,0.096490,0.151210,0.040676,0.028973,-0.062568,0.111574,...,-0.098535,-0.066629,-0.010202,-0.030658,-0.077715,-0.047453,-0.023949,-0.065015,-0.034592,-0.043934
31,0.561635,0.334314,-0.441562,-0.045939,-0.322666,-0.620397,-0.232330,0.027120,-0.301959,-0.264307,...,0.387014,-0.160244,-0.092961,-0.323246,0.694963,-0.072178,-0.430289,0.143832,-0.088675,-0.028367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138473,-0.033246,-0.190148,-0.164248,-0.010779,-0.152819,-0.051578,-0.248384,-0.022840,-0.035341,-0.331146,...,-0.061970,-0.032496,-0.019770,0.008461,-0.077334,-0.036558,-0.007712,-0.026296,-0.036871,-0.062397
138475,-0.037187,-0.260867,-0.142554,0.141551,-0.194433,-0.288065,-0.083104,-0.101228,-0.265310,-0.201903,...,-0.103602,0.151870,0.113892,-0.046429,0.134127,-0.166052,0.054153,-0.404670,-0.230851,0.086476
138486,0.741429,0.630848,0.135314,-0.168485,0.114365,-0.292510,-0.147586,-0.166505,-0.000741,0.187590,...,0.015552,-0.064067,-0.089124,0.025788,-0.111316,0.024148,0.168728,0.177730,0.037084,-0.055694
138492,0.144258,-0.279232,0.008540,-0.042746,-0.056750,-0.142939,-0.069889,-0.112550,-0.061144,-0.233107,...,-0.103873,-0.090664,0.061946,-0.033947,-0.117633,-0.030497,-0.042782,-0.091051,-0.012912,-0.048513


In [11]:
# to avoid watched items

watchedm=raw_data.groupby('user')['item'].apply(list)

In [12]:
watchedm

user
11        [4643, 170, 531, 616, 2140, 2722, 2313, 2688, ...
14        [8961, 1396, 471, 2105, 1042, 1947, 1269, 2394...
18        [1952, 1283, 3507, 4280, 51084, 593, 318, 356,...
25        [261, 22, 2161, 3255, 372, 1093, 428, 175, 214...
31        [260, 1196, 1210, 7153, 4993, 5952, 1270, 5855...
                                ...                        
138473    [524, 3354, 1025, 6565, 69757, 2085, 32, 55282...
138475    [1639, 1673, 1148, 246, 2019, 1267, 1172, 1235...
138486    [2694, 1994, 2723, 441, 2288, 637, 2013, 2423,...
138492    [2115, 908, 58, 2700, 2599, 1500, 1358, 1288, ...
138493    [3174, 2872, 48780, 2662, 2840, 1566, 2857, 20...
Name: item, Length: 31360, dtype: object

In [34]:
from tqdm import tqdm

submission = dict()
submission = {'user': [], 'item': []}

for row in tqdm(pred_df.index):

    for _ in range(10):
        submission['user'].append(row)

    movies = pred_df.loc[row]

    itemp = []
    for movie, score in movies.sort_values(ascending=False).iteritems():
        if len(itemp) == 10:
            break
        else:
            if movie not in watchedm[row]:
                itemp.append(movie)

    submission['item'] += itemp

submission = pd.DataFrame(submission)
submission.sort_values('user', inplace=True)
submission.to_csv('submission.csv', index=False)

100%|██████████| 31360/31360 [00:43<00:00, 713.75it/s]
