# **Install and Import Modules**

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
     ---------------------------------------- 0.0/772.0 kB ? eta -:--:--
     - ----------------------------------- 30.7/772.0 kB 660.6 kB/s eta 0:00:02
     --- --------------------------------- 71.7/772.0 kB 787.7 kB/s eta 0:00:01
     ----- ------------------------------ 122.9/772.0 kB 798.9 kB/s eta 0:00:01
     ------- ---------------------------- 153.6/772.0 kB 919.0 kB/s eta 0:00:01
     --------- -------------------------- 194.6/772.0 kB 787.7 kB/s eta 0:00:01
     ---------- ------------------------- 225.3/772.0 kB 808.4 kB/s eta 0:00:01
     ----------- ------------------------ 256.0/772.0 kB 827.5 kB/s eta 0:00:01
     ------------- ---------------------- 286.7/772.0 kB 803.7 kB/s eta 0:00:01
     --------------- -------------------- 337.9/772.0 kB 807.1 kB/s eta 0:00:01
     ----------------- ------------------ 368.6/772.0 kB 791.2 kB/s eta 0:00:01
     ------------------ ----------------- 399.4/


[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
from surprise import Reader, Dataset
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# **Load and Preprocess the Data**

In [3]:
ratings_df = pd.read_csv(r"ml-25m\ratings.csv")
movies_df = pd.read_csv(r"ml-25m\movies.csv")

df = pd.merge(ratings_df, movies_df[['movieId', 'genres']], on = 'movieId', how = 'left')
df

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,296,5.0,1147880044,Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Drama
2,1,307,5.0,1147868828,Drama
3,1,665,5.0,1147878820,Comedy|Drama|War
4,1,899,3.5,1147868510,Comedy|Musical|Romance
...,...,...,...,...,...
25000090,162541,50872,4.5,1240953372,Animation|Children|Drama
25000091,162541,55768,2.5,1240951998,Animation|Comedy
25000092,162541,56176,2.0,1240950697,Children|Comedy
25000093,162541,58559,4.0,1240953434,Action|Crime|Drama|IMAX


In [4]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
mlb = MultiLabelBinarizer()

df['userId'] = user_encoder.fit_transform(df['userId'])
df['movieId'] = movie_encoder.fit_transform(df['movieId'])

df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genres').str.split('|')), columns = mlb.classes_, index = df.index ))

In [5]:
df.drop(columns = "(no genres listed)", inplace = True)


In [6]:
df

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,292,5.0,1147880044,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1,0,302,3.5,1147868817,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,303,5.0,1147868828,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,654,5.0,1147878820,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,878,3.5,1147868510,0,0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25000090,162540,11359,4.5,1240953372,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25000091,162540,11925,2.5,1240951998,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
25000092,162540,11972,2.0,1240950697,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
25000093,162540,12216,4.0,1240953434,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


# **Build the Model with Collabrative filtering**

In [8]:
train_df, test_df = train_test_split(df, test_size = 0.2)

In [10]:
reader = Reader(rating_scale = (0.5, 5))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [11]:
trainset

<surprise.trainset.Trainset at 0x22232815290>

In [13]:
model_svd = SVD()
model_svd.fit(trainset)
predictions_svd = model_svd.test(trainset.build_anti_testset())
accuracy.rmse(predictions_svd)

KeyboardInterrupt: 

# **Make Recommendations**

In [24]:
def get_n_rec(user_id, n=10):
  user_movies = df[df['userId'] == user_id]['movieId'].unique()
  all_movies = df['movieId'].unique()
  movies_to_predict = list(set(all_movies) - set(user_movies))

  user_movie_pairs = [(user_id, movie_id, 0) for movie_id in movies_to_predict]
  predictions_cf = model_svd.test(user_movie_pairs)

  top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est)[:n]

  for pred in top_n_recommendations:
    predicted_rating = pred.est
    print(predicted_rating)

  top_n_movie_ids = [int(pred.iid) for pred in top_n_recommendations]

  top_n_movies = movie_encoder.inverse_transform(top_n_movie_ids)

  return top_n_movies

### for userID 541

In [25]:
user_id = 541
recommendations = get_n_rec(user_id)
top_n_movies_titles = movies_df[movies_df['movieId'].isin(recommendations)]['title'].tolist()
print(f"Top 5 Recommendations for User {user_id}:")
for i, title in enumerate(top_n_movies_titles, 1):
  print(f"{i}.{title}")

1.5463819581287739
1.8023725684558956
1.8625275464904463
1.8697898880000998
1.8820046033566906
Top 5 Recommendations for User 221:
1.Stuart Saves His Family (1995)
2.Richie Rich (1994)
3.Honey, I Blew Up the Kid (1992)
4.Superman IV: The Quest for Peace (1987)
5.Battlefield Earth (2000)
