# Content Based Rec Sys

In [23]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [24]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
users = pd.read_csv('users.csv')

### Item-Item Similarity Baserd Rec

In [25]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [26]:
select_movies = ratings.movieId.value_counts().head(1000).index.to_list()
movies = movies.loc[movies.movieId.isin(select_movies)]
ratings = ratings.loc[ratings.movieId.isin(select_movies)]

In [27]:
m = movies.copy()
m['genres'] = m['genres'].str.split('|')
m = m.explode('genres')
m = m.pivot(index='movieId', columns='genres', values='title')
m = ~m.isna()
m = m.astype(int)

In [28]:
m.shape

(1000, 19)

In [29]:
m.head()

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [30]:
a = m.iloc[0].values
b = m.iloc[1].values

In [31]:
a

array([0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
b

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [33]:
def hamming_distance(x, y):
    return sum(abs(x-y))

In [34]:
ranks = []

for query in m.index[:10]:
    for candidate in m.index:
        if candidate == query:
            continue
        ranks.append([query, candidate, hamming_distance(m.loc[query], m.loc[candidate])])

In [35]:
ranks = pd.DataFrame(ranks, columns=['query', 'candidate', 'distance'])
ranks = ranks.merge(movies[['movieId', 'title']], left_on='query', right_on='movieId').rename(columns={'title': 'query_tittle'}).drop(columns=['movieId'])
ranks = ranks.merge(movies[['movieId', 'title']], left_on='candidate', right_on='movieId').rename(columns={'title': 'candidate_tittle'}).drop(columns=['movieId'])
ranks = ranks.sort_values(by=['query', 'distance'])
ranks.head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
5391,1,2294,0,Toy Story (1995),Antz (1998)
6651,1,3114,0,Toy Story (1995),Toy Story 2 (1999)
7921,1,4886,0,Toy Story (1995),"Monsters, Inc. (2001)"
1871,1,673,1,Toy Story (1995),Space Jam (1996)
5491,1,2355,1,Toy Story (1995),"Bug's Life, A (1998)"


In [36]:
ranks.query_tittle.unique()

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       'Father of the Bride Part II (1995)', 'Heat (1995)',
       'Sabrina (1995)', 'GoldenEye (1995)',
       'American President, The (1995)', 'Casino (1995)',
       'Sense and Sensibility (1995)'], dtype=object)

In [37]:
ranks.loc[ranks.query_tittle == 'Casino (1995)'].head(5)

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
179,16,36,0,Casino (1995),Dead Man Walking (1995)
729,16,247,0,Casino (1995),Heavenly Creatures (1994)
969,16,318,0,Casino (1995),"Shawshank Redemption, The (1994)"
1299,16,431,0,Casino (1995),Carlito's Way (1993)
2159,16,858,0,Casino (1995),"Godfather, The (1972)"


---
### User-User Similarity Based Rec Sys

In [38]:
users.head(2)

Unnamed: 0,userId,age,time_spent_per_day
0,1,16,3.976315
1,2,24,1.891303


In [39]:
r = ratings.copy()

In [40]:
r['hour'] = r['timestamp'].apply(lambda x: datetime.fromtimestamp(x).hour)
r.head()

Unnamed: 0,userId,movieId,rating,timestamp,hour
0,1,16,4.0,1217897793,6
1,1,24,1.5,1217895807,5
2,1,32,4.0,1217896246,6
3,1,47,4.0,1217896556,6
4,1,50,4.0,1217896523,6


In [41]:
r.groupby('userId').rating.mean().reset_index().head()

Unnamed: 0,userId,rating
0,1,3.691589
1,2,3.923077
2,3,3.806452
3,4,4.15942
4,5,2.864865


In [42]:
users = users.merge(r.groupby('userId').rating.mean().reset_index(), on='userId')
users = users.merge(r.groupby('userId').hour.mean().reset_index(), on='userId')

In [43]:
users.head()

Unnamed: 0,userId,age,time_spent_per_day,rating,hour
0,1,16,3.976315,3.691589,5.616822
1,2,24,1.891303,3.923077,21.0
2,3,20,4.521478,3.806452,14.370968
3,4,23,2.095284,4.15942,8.0
4,5,35,1.75986,2.864865,0.513514


In [44]:
u = users.copy()
u = u.set_index('userId')
u.columns = ['age', 'time_spent_per_day', 'u_avg_rating', 'hour']

In [45]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
u = pd.DataFrame(scaler.fit_transform(u), columns=u.columns, index=u.index)

In [46]:
def euclidian_distance(x, y):
    return np.linalg.norm(x-y)

In [47]:
userid = 5

In [48]:
dist = []
for user in u.index:
    dist.append(euclidian_distance(u.loc[userid], u.loc[user]))

u_rank = pd.DataFrame()
u_rank['id'] = u.index
u_rank['dist'] = dist
u_rank = u_rank.loc[u_rank.id != userid]
u_rank = u_rank.sort_values(by='dist')
u_rank.head()

Unnamed: 0,id,dist
213,214,1.400996
124,125,1.559669
301,302,1.641682
409,410,1.657114
25,26,1.676895


---

### Regression based Rec Sys

In [49]:
m.head(2)

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [50]:
u.head(2)

Unnamed: 0_level_0,age,time_spent_per_day,u_avg_rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-1.470292,0.341073,-0.073572,-0.882006
2,-0.135616,-1.079947,0.426461,1.477906


In [51]:
X = ratings[['movieId', 'userId', 'rating']].copy()  # base df 
X = X.merge(u.reset_index(), on='userId', how='right')
X = X.merge(m.reset_index(), on='movieId', how='right')
X.head()

Unnamed: 0,movieId,userId,rating,age,time_spent_per_day,u_avg_rating,hour,Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,2,5.0,-0.135616,-1.079947,0.426461,1.477906,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,5,4.0,1.699565,-1.169532,-1.859363,-1.664898,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,8,5.0,0.364888,0.298545,0.160605,1.324497,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,11,4.0,-1.303458,0.513712,-0.380602,0.557454,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,14,4.0,-0.30245,1.251552,-0.379415,0.557454,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [52]:
X = X.drop(columns = ['movieId', 'userId'])
y = X.pop('rating')

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [54]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [55]:
from sklearn.metrics import mean_squared_error as mse
mse(y_test, y_pred)**0.5

0.8868700141216211

---

# Collaborative Rec Sys - Matrix Factorisation

- collective matrix factorisation for recommender systems
- https://cmfrec.readthedocs.io/en/latest/

In [56]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [57]:
rm = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
rm.astype(int).head()

movieId,1,2,3,5,6,7,10,11,16,17,...,88125,89745,91529,96610,99114,109374,109487,111759,112852,116797
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,2,3,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,3,0,3,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# ! pip install cmfrec? - surprise (post read)

In [59]:
rm_raw = ratings[['userId', 'movieId', 'rating']].copy()
rm_raw.columns = ['UserId', 'ItemId', 'Rating']  # Lib requires specific column names
rm_raw.head(2)

Unnamed: 0,UserId,ItemId,Rating
0,1,16,4.0
1,1,24,1.5


In [60]:
from cmfrec import CMF

model = CMF(k=3, lambda_=0.1, user_bias=False, item_bias=False, verbose=False)
model.fit(rm_raw)

Collective matrix factorization model
(explicit-feedback variant)


In [61]:
model.A_.shape

(668, 3)

In [62]:
model.B_.shape

(1000, 3)

In [63]:
model.A_

array([[-0.5165561 ,  0.4894137 ,  0.41367736],
       [ 0.04854456, -0.05557115,  0.8222439 ],
       [ 0.26768604,  0.48013276,  0.27049217],
       ...,
       [-0.13174073,  0.89367104, -0.04227696],
       [-0.04064005,  0.27655473,  0.31958127],
       [-0.7010689 ,  0.2712174 ,  0.06394046]], dtype=float32)

In [64]:
top_items = model.topN(user=4, n=10)
movies.loc[movies.movieId.isin(top_items)]

Unnamed: 0,movieId,title,genres
279,318,"Shawshank Redemption, The (1994)",Crime|Drama
626,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy
2183,2728,Spartacus (1960),Action|Drama|Romance|War
3885,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
4457,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
5105,7022,Battle Royale (Batoru rowaiaru) (2000),Action|Drama|Horror|Thriller
5206,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
7160,48516,"Departed, The (2006)",Crime|Drama|Thriller
7669,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
9908,109374,"Grand Budapest Hotel, The (2014)",Comedy|Drama


further read: https://nbviewer.org/github/david-cortes/cmfrec/blob/master/example/cmfrec_movielens_sideinfo.ipynb

In [65]:
rm__ = np.dot(model.A_, model.B_.T) + model.glob_mean_
mse(rm.values[rm > 0], rm__[rm > 0])**0.5

1.1196433994530783