In [6]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from collections import defaultdict


# we load the first part of the dataset whice contain four column ["userId","movieId","rating","timestamp"]
# row : 100836, col : 4 
rating_data_set=pd.read_csv("ratings.csv")


In [7]:
# we load the second part of the dataset whice contain four column ["movieId","title","genres"]
# row : 9742, col : 3
movie_data_set=pd.read_csv("movies.csv")


In [8]:
#then we merge betwwen them in movieId column to get final dataset
# row : 100836, col : 6
final_dataset_org=pd.merge(rating_data_set,movie_data_set,on="movieId")

In [9]:
# we see here thre are no missing data in dataset 
final_dataset_org.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [10]:
# here i will no how many user id in my data so when do recommendation we ask 
#the costumer to enter the number of id of user to get recommendation of that user 
final_dataset_org["userId"].tail()

100831    610
100832    610
100833    610
100834    610
100835    610
Name: userId, dtype: int64

In [11]:
rating_data_set.shape

(100836, 4)

In [12]:
#we will use Surprise lib for recommender systems he provide 
# so the dataset must procese to get inside this lib
reader = Reader(rating_scale=(1, 5))

#we drop ["timestamp","title","genres"] becouse the lib only git It must have three columns,
#corresponding to the (raw) user ids, the item ids, and the ratings, in this order.
final_dataset_norg=final_dataset_org.drop(["timestamp","title","genres"],axis=1)

data_set = Dataset.load_from_df(final_dataset_norg, reader)


In [13]:
# we choose SVD in surprise it give small rmse 
svd = SVD(n_epochs=14,lr_all=0.002,reg_all=0.1,n_factors=5)



In [14]:
train_set = data_set.build_full_trainset()
svd_fit=svd.fit(train_set)

testset = train_set.build_testset()
predictions = svd.test(testset)

cross_validate(svd, data_set, measures=['RMSE', 'MAE'],cv=5)


{'test_rmse': array([0.88560129, 0.88833111, 0.8861241 , 0.89565596, 0.88616882]),
 'test_mae': array([0.68384459, 0.68934474, 0.68304561, 0.69086042, 0.68491081]),
 'fit_time': (1.4789798259735107,
  1.4854631423950195,
  1.3521897792816162,
  1.2402839660644531,
  1.507220983505249),
 'test_time': (0.16042208671569824,
  0.2620048522949219,
  0.19508886337280273,
  0.14255213737487793,
  0.31708693504333496)}

In [15]:
links_data_set=pd.read_csv("links.csv")

final_dataset_for_link=pd.merge(final_dataset_org,links_data_set,on="movieId")
final_dataset_for_link=final_dataset_for_link.drop(["userId","rating","timestamp","imdbId"],axis=1)

final_dataset_org=final_dataset_org.drop("timestamp",axis=1)




In [16]:
import pickle 
data= {"model":svd,"final_data_org":final_dataset_org,
       "final_data_norg":final_dataset_norg,"predictions":predictions,
       "final_dataset_for_link":final_dataset_for_link}
with open("saved_steps.pkl","wb") as file:
    pickle.dump(data,file)
