[USER-USER Collaborative filtering Recommender System in Python](https://medium.com/@tomar.ankur287/user-user-collaborative-filtering-recommender-system-51f568489727)

In [1]:
import os
import tarfile
import urllib.request

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Download the movies dataset archive. No target filename is passed, so the
# local path chosen by urlretrieve is what ends up in arch_path.
dataset_url = 'https://storage.yandexcloud.net/andre487-datasets/movies-dataset.tar.gz'
arch_path, _ = urllib.request.urlretrieve(dataset_url)

In [3]:
# Unpack the downloaded archive under ~/Code/Notebooks/Data.
# NOTE(review): later cells read from the 'movies-dataset' subdirectory, so the
# archive is assumed to contain a top-level directory with that name.
dataset_base_path = os.path.expanduser(os.path.join('~', 'Code', 'Notebooks', 'Data'))
dataset_path = os.path.join(dataset_base_path, 'movies-dataset')

os.makedirs(dataset_base_path, exist_ok=True)

with tarfile.open(arch_path, 'r:gz') as tar:
    # The archive was fetched over the network, so do not trust its member
    # names: the 'data' filter rejects absolute paths, '..' escapes and links
    # pointing outside the destination directory.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(dataset_base_path, filter='data')
    else:
        # Interpreters that predate extraction filters: plain extraction.
        tar.extractall(dataset_base_path)

In [4]:
# Load the ratings table from the extracted dataset directory.
ratings_csv_path = os.path.join(dataset_path, 'ratings_small.csv')
df = pd.read_csv(ratings_csv_path)

In [5]:
# Show the last rows of the ratings table as a quick sanity check of the load.
df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663
100003,671,6565,3.5,1074784724


In [6]:
# Reshape the long ratings table into a user x movie matrix (one row per
# userId, one column per movieId). Pairs without a rating become 0.
df_wide = (
    df
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)

In [7]:
# Show the last rows of the user x movie matrix; 0.0 marks a missing rating.
df_wide.tail()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
671,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Hold out 10% of the user ids as test users. The fixed random_state makes the
# split identical on every Restart & Run All.
train_idx, test_idx = train_test_split(df_wide.index, test_size=0.1, random_state=42)

In [9]:
# Pick one held-out user to produce recommendations for. A seeded generator
# keeps the choice reproducible across kernel restarts.
rng = np.random.default_rng(42)
# Kept as a one-element list so that df_wide.loc[user] yields a 2-D frame.
user = [rng.choice(test_idx)]
print(user)
# Every training user is a candidate neighbour.
other_users = train_idx

[593]


In [10]:
# Cosine similarity between the selected user's rating row and the rating
# rows of all candidate users.
target_ratings = df_wide.loc[user]
candidate_ratings = df_wide.loc[other_users]
sim = cosine_similarity(target_ratings, candidate_ratings)

In [11]:
# Pair every candidate user id with its similarity score. Only the first row
# of sim is used, i.e. the scores for the single selected user.
similarities = pd.DataFrame(dict(userId=other_users, sim=sim[0]))

In [12]:
# Order candidates from most to least similar.
similarities = similarities.sort_values(by='sim', ascending=False)

In [13]:
# Show the highest-scoring candidates (the frame was sorted by 'sim' descending).
similarities.head()

Unnamed: 0,userId,sim
600,288,0.563509
122,161,0.558487
208,85,0.526747
542,151,0.51978
154,194,0.501969


In [14]:
# Rating row of the selected user, indexed by movieId.
target_user_id = user[0]
df_user = df_wide.loc[target_user_id]
df_user

movieId
1         0.0
2         3.0
3         0.0
4         0.0
5         0.0
         ... 
161944    0.0
162376    0.0
162542    0.0
162672    0.0
163949    0.0
Name: 593, Length: 9066, dtype: float64

In [15]:
# Movies whose entry for the selected user is 0 (the fill value for missing
# ratings) are the ones that can be recommended.
is_unrated = df_user.eq(0)
movies_to_recommend = df_user.loc[is_unrated].index
movies_to_recommend

Int64Index([     1,      3,      4,      5,      6,      7,      8,      9,
                11,     12,
            ...
            161084, 161155, 161594, 161830, 161918, 161944, 162376, 162542,
            162672, 163949],
           dtype='int64', name='movieId', length=8996)

In [16]:
# Maximum number of movies to recommend.
N_to_rec = 3
# Read the id from the 'userId' column rather than from the first row:
# similarities.iloc[0] is a mixed-dtype row, so pandas upcasts the integer id
# to float before it would be used as a label in df_wide.loc.
most_sim_user_id = similarities['userId'].iloc[0]
most_sim_user = df_wide.loc[most_sim_user_id]
# Keep only the movies this neighbour actually rated, best rated first.
most_sim_user = most_sim_user[most_sim_user != 0].sort_values(ascending=False)
most_sim_user

movieId
353    5.0
368    5.0
227    5.0
288    5.0
296    5.0
      ... 
592    2.0
432    2.0
474    2.0
44     1.0
434    1.0
Name: 288, Length: 75, dtype: float64

In [17]:
# Print up to N_to_rec of the neighbour's movies that the selected user has
# not rated, in the neighbour's own order.
print('Recommend:')
# Vectorised membership test instead of a manual counter. The slice also copes
# with N_to_rec <= 0, where the old `i == N_to_rec` check never fired and
# every candidate was printed.
candidate_ids = most_sim_user.index[most_sim_user.index.isin(movies_to_recommend)]
for movie_id in candidate_ids[:max(N_to_rec, 0)]:
    print(movie_id)

Recommend:
368
204
527
