### Importing libraries:

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so the "ignore" action applies to all of them).
warnings.filterwarnings("ignore")

### Reading data:

In [2]:
# Load the movie catalogue and the user ratings from the Rec_Sys folder
# (paths are relative to the notebook's working directory).
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ("Rec_Sys/movies.csv", "Rec_Sys/ratings.csv")
)

In [3]:
# Peek at the first five rows of the movie catalogue.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Peek at the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Summarise the movies table: row count, column dtypes and non-null counts.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
# Summarise the ratings table: row count, column dtypes and non-null counts.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
# Attach title and genres to every rating row. The inner join keeps only
# ratings whose movieId also appears in the movies table.
df = ratings.merge(movies, how='inner', on='movieId')
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


### EDA:

- Filtering the dataset to keep only movies that have been rated by a lot of people.
- This makes the data easier to work with initially.

In [8]:
# Per-title summary: the average rating and the number of ratings received.
agg_ratings = (
    df.groupby('title')['rating']
    .agg(mean_rating='mean', number_of_ratings='count')
    .reset_index()
)

In [9]:
# Keep only the titles that received strictly more than 75 ratings.
ratings_75 = agg_ratings.loc[lambda summary: summary['number_of_ratings'] > 75]

In [10]:
# Display the filtered per-title summary (the rendered output elides the
# middle rows).
ratings_75

Unnamed: 0,title,mean_rating,number_of_ratings
74,2001: A Space Odyssey (1968),3.894495,109
104,300 (2007),3.681250,80
207,Ace Ventura: Pet Detective (1994),3.040373,161
208,Ace Ventura: When Nature Calls (1995),2.727273,88
223,Addams Family Values (1993),3.101190,84
...,...,...,...
9433,Who Framed Roger Rabbit? (1988),3.572165,97
9485,Willy Wonka & the Chocolate Factory (1971),3.873950,119
9537,"Wizard of Oz, The (1939)",3.880435,92
9615,X-Men (2000),3.699248,133


In [11]:
# Check how many titles survived the popularity filter, plus column dtypes.
ratings_75.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 230 entries, 74 to 9621
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              230 non-null    object 
 1   mean_rating        230 non-null    float64
 2   number_of_ratings  230 non-null    int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 7.2+ KB


In [12]:
# Most-rated titles first; show the top five.
(
    ratings_75
    .sort_values(by='number_of_ratings', ascending=False)
    .head(n=5)
)

Unnamed: 0,title,mean_rating,number_of_ratings
3158,Forrest Gump (1994),4.164134,329
7593,"Shawshank Redemption, The (1994)",4.429022,317
6865,Pulp Fiction (1994),4.197068,307
7680,"Silence of the Lambs, The (1991)",4.16129,279
5512,"Matrix, The (1999)",4.192446,278


- Combining the ratings data for movies with more than 75 ratings.

In [13]:
# Restrict the merged ratings to the popular titles: joining on the single
# 'title' column of ratings_75 acts as a filter and adds no new columns.
df_75 = df.merge(ratings_75[['title']], how='inner', on='title')
df_75.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [14]:
# Check the size and dtypes of the ratings restricted to popular titles.
df_75.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28113 entries, 0 to 28112
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     28113 non-null  int64  
 1   movieId    28113 non-null  int64  
 2   rating     28113 non-null  float64
 3   timestamp  28113 non-null  int64  
 4   title      28113 non-null  object 
 5   genres     28113 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 1.5+ MB


### Create user-item matrix:

In [15]:
# User-item matrix: one row per user, one column per title, ratings as the
# cell values. A cell is NaN where the user has not rated that title.
ui_matrix = pd.pivot_table(
    df_75,
    values='rating',
    index='userId',
    columns='title',
)
ui_matrix.head(n=5)

title,2001: A Space Odyssey (1968),300 (2007),Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Addams Family Values (1993),Airplane! (1980),Aladdin (1992),Alien (1979),Aliens (1986),Almost Famous (2000),...,WALL·E (2008),Waterworld (1995),What's Eating Gilbert Grape (1993),When Harry Met Sally... (1989),While You Were Sleeping (1995),Who Framed Roger Rabbit? (1988),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)",X-Men (2000),X2: X-Men United (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,4.0,,,...,,,,,,5.0,5.0,5.0,5.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,5.0,4.0,,,4.0,...,,,,,,,4.0,5.0,,
5,,,3.0,,3.0,,4.0,,,,...,,,,,,,,,,


- Normalizing the data in order to make the ratings comparable
    - a few movies have higher ratings and some movies aren't rated.

In [16]:
# Centre every user's ratings on that user's own mean rating. Unrated cells
# are skipped when computing the mean and remain NaN after the subtraction.
matrix_norm = ui_matrix.sub(ui_matrix.mean(axis='columns'), axis='index')
matrix_norm.head(n=5)

title,2001: A Space Odyssey (1968),300 (2007),Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Addams Family Values (1993),Airplane! (1980),Aladdin (1992),Alien (1979),Aliens (1986),Almost Famous (2000),...,WALL·E (2008),Waterworld (1995),What's Eating Gilbert Grape (1993),When Harry Met Sally... (1989),While You Were Sleeping (1995),Who Framed Roger Rabbit? (1988),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)",X-Men (2000),X2: X-Men United (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,-0.381579,,,...,,,,,,0.618421,0.618421,0.618421,0.618421,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,1.509804,0.509804,,,0.509804,...,,,,,,,0.509804,1.509804,,
5,,,-0.5,,-0.5,,0.5,,,,...,,,,,,,,,,


### Finding similar users:
- We can use either:
    - Pearson correlation
    - cosine similarity

In [17]:
# Pairwise Pearson correlation between users. corr() compares columns, so the
# matrix is transposed first to put users on the columns. A pair of users
# gets NaN when the correlation cannot be computed from their shared ratings.
# NOTE(review): several neighbours selected further down score exactly 1.0,
# which looks like correlations built on very few co-rated titles; consider
# passing `min_periods` to corr() -- confirm before changing results.
user_similarity = matrix_norm.transpose().corr(method='pearson')
user_similarity.head(n=5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,,0.240346,0.180151,-0.439941,-0.180724,0.464277,0.904534,-0.037987,...,0.091574,0.254514,-0.050567,-0.5,0.050292,0.20287,0.165287,0.166715,-0.175412,-0.043875
2,,1.0,,,,,,,,-0.5,...,-0.559793,,-1.0,,,0.583333,,-0.229416,,0.73074
3,,,,,,,,,,,...,,,,,,,,,,
4,0.240346,,,1.0,-0.398485,0.390831,0.706784,0.063888,,0.541119,...,-0.348903,0.561435,0.142054,-0.158114,0.518328,-0.008023,0.053755,-0.339963,,-0.226816
5,0.180151,,,-0.398485,1.0,-0.013482,0.328889,0.028347,,-0.777714,...,0.0,0.209004,0.190885,0.063383,-0.290482,0.305477,0.228218,0.211898,0.384111,0.040582


In [18]:
# Pick a user ID
picked_userid = 4

# Remove the picked user's own row from the candidate list so the user can
# never be selected as their own neighbour. The picked user's *column* is
# kept because later cells read similarities from it.
# A rebinding drop with errors='ignore' keeps this cell idempotent: running
# it a second time no longer raises KeyError for the already-removed row.
user_similarity = user_similarity.drop(index=picked_userid, errors='ignore')

# Take a look at the data
user_similarity.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,,0.240346,0.180151,-0.439941,-0.180724,0.464277,0.904534,-0.037987,...,0.091574,0.254514,-0.050567,-0.5,0.050292,0.20287,0.165287,0.166715,-0.175412,-0.043875
2,,1.0,,,,,,,,-0.5,...,-0.559793,,-1.0,,,0.583333,,-0.229416,,0.73074
3,,,,,,,,,,,...,,,,,,,,,,
5,0.180151,,,-0.398485,1.0,-0.013482,0.328889,0.028347,,-0.777714,...,0.0,0.209004,0.190885,0.063383,-0.290482,0.305477,0.228218,0.211898,0.384111,0.040582
6,-0.439941,,,0.390831,-0.013482,1.0,0.0,-0.144641,,0.957427,...,-0.29277,0.060433,-0.251331,-0.034298,0.066229,-0.247519,0.486395,-0.2189,0.193649,-0.035858


In [19]:
n = 5                  # maximum number of similar users to keep
user_threshold = 0.35  # a neighbour must be strictly more similar than this

# Take the picked user's similarity column, keep the users above the
# threshold (NaN similarities never pass), and retain the n most similar.
similar_users = (
    user_similarity[picked_userid]
    .loc[lambda similarity: similarity > user_threshold]
    .sort_values(ascending=False)
    .head(n)
)

In [20]:
# Show the selected neighbours together with their similarity to the picked user.
print(f"the similar users for:{picked_userid}:\n{similar_users}")

the similar users for:4:
userId
162    1.0
299    1.0
544    1.0
568    1.0
473    1.0
Name: 4, dtype: float64


- Removing the movies already watched by the picked user.
- Keeping only the movies watched by similar users.

In [21]:
# The picked user's row of the normalised matrix, restricted to the titles
# that user has actually rated (columns that are entirely NaN are dropped).
picked_userid_watched = (
    matrix_norm
    .loc[matrix_norm.index == picked_userid]
    .dropna(axis='columns', how='all')
)
picked_userid_watched

title,Airplane! (1980),Aladdin (1992),Almost Famous (2000),Amadeus (1984),American Beauty (1999),Austin Powers: International Man of Mystery (1997),Austin Powers: The Spy Who Shagged Me (1999),Beauty and the Beast (1991),Beetlejuice (1988),Being John Malkovich (1999),...,"Sixth Sense, The (1999)",Sleepless in Seattle (1993),Stand by Me (1986),Star Wars: Episode I - The Phantom Menace (1999),Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),There's Something About Mary (1998),Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.509804,0.509804,0.509804,0.509804,1.509804,0.509804,0.509804,-0.490196,1.509804,0.509804,...,0.509804,-2.490196,0.509804,-2.490196,1.509804,1.509804,-0.490196,-1.490196,0.509804,1.509804


In [22]:
# Rows of the normalised matrix for the similar users only, keeping just the
# titles that at least one of them has rated.
similar_user_movies = (
    matrix_norm
    .loc[matrix_norm.index.isin(similar_users.index)]
    .dropna(axis='columns', how='all')
)
similar_user_movies

title,Alien (1979),Almost Famous (2000),American History X (1998),Apollo 13 (1995),Batman (1989),Batman Forever (1995),Broken Arrow (1996),Casino (1995),Dances with Wolves (1990),Donnie Darko (2001),...,"Shining, The (1980)","Silence of the Lambs, The (1991)",Star Trek: First Contact (1996),Star Wars: Episode I - The Phantom Menace (1999),Toy Story (1995),Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),"Usual Suspects, The (1995)",What's Eating Gilbert Grape (1993),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162,,,,-0.125,-0.125,-0.125,,0.875,-0.125,,...,,,,,,,,-0.125,-1.125,
299,-0.636364,,,,,,,,,,...,0.363636,,,-1.636364,,,,,,0.363636
473,,0.5,0.0,,,,,,,-0.5,...,,,,,,,,,,
544,,,,,,,-0.25,,,,...,,,0.75,,-1.25,-1.25,0.75,,,
568,,,,,,,,,,,...,,0.5,,,,,,0.5,,


In [23]:
# Remove the titles the picked user has already rated so that only unseen
# titles remain as recommendation candidates. errors='ignore' skips watched
# titles that none of the similar users rated (they are not columns here).
# The result is rebound rather than dropped in place, so the frame displayed
# by the previous cell is not silently mutated behind the reader's back.
similar_user_movies = similar_user_movies.drop(
    columns=picked_userid_watched.columns,
    errors='ignore',
)
similar_user_movies

title,Alien (1979),American History X (1998),Apollo 13 (1995),Batman (1989),Batman Forever (1995),Broken Arrow (1996),Casino (1995),Dances with Wolves (1990),Donnie Darko (2001),Eternal Sunshine of the Spotless Mind (2004),...,Rain Man (1988),Schindler's List (1993),"Shawshank Redemption, The (1994)","Shining, The (1980)",Star Trek: First Contact (1996),Toy Story (1995),Twister (1996),"Usual Suspects, The (1995)",What's Eating Gilbert Grape (1993),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162,,,-0.125,-0.125,-0.125,,0.875,-0.125,,,...,,0.875,0.875,,,,,-0.125,-1.125,
299,-0.636364,,,,,,,,,,...,0.363636,,,0.363636,,,,,,0.363636
473,,0.0,,,,,,,-0.5,0.5,...,,,0.5,,,,,,,
544,,,,,,-0.25,,,,,...,,,,,0.75,-1.25,0.75,,,
568,,,,,,,,,,,...,,,,,,,,0.5,,


### Recommend Items:

In [24]:
# Score every candidate title as the average, over the similar users who
# rated it, of (user similarity * that user's normalised rating).
item_score_by_movie = {}
for title in similar_user_movies.columns:
    # Normalised ratings of this title, indexed by userId
    movie_rating = similar_user_movies[title]

    # One weighted score per similar user who actually rated the title
    weighted_scores = [
        similar_users[user] * movie_rating[user]
        for user in similar_users.index
        if pd.notna(movie_rating[user])
    ]

    # Guard against dividing by zero: a title nobody in the neighbourhood
    # rated gets no score instead of aborting the whole cell.
    if weighted_scores:
        item_score_by_movie[title] = sum(weighted_scores) / len(weighted_scores)

# Convert dictionary to pandas dataframe
item_score = pd.DataFrame(
    list(item_score_by_movie.items()),
    columns=['movie', 'movie_score'],
)

# Sort the movies by score, best first
ranked_item_score = item_score.sort_values(by='movie_score', ascending=False)

# Select top m movies
m = 10
ranked_item_score.head(m)

Unnamed: 0,movie,movie_score
13,Gladiator (2000),1.363636
6,Casino (1995),0.875
25,Schindler's List (1993),0.875
21,Mr. Holland's Opus (1995),0.8125
14,Independence Day (a.k.a. ID4) (1996),0.75
28,Star Trek: First Contact (1996),0.75
30,Twister (1996),0.75
26,"Shawshank Redemption, The (1994)",0.6875
10,Forrest Gump (1994),0.6875
9,Eternal Sunshine of the Spotless Mind (2004),0.5
