# Collaborative Filtering Recommendation System

## Task 1: Import Modules

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# print(pd.__version__)
# print(np.__version__)
# 1.5.1
# 1.23.4

## Task 2: Import the Dataset

In [3]:
# ! python --version
# ! streamlit --version

# Python 3.8.10
# Streamlit, version 1.14.0

In [4]:
columns=['User_ID', 'User_Name', 'Movie_ID', 'Ratings', 'Timestamp']
movie_data_df = pd.read_csv("Movie_data.csv", sep=",", names=columns)
# movie_data_df

columns = {'item_id': 'Movie_ID', 'title': 'Movie_title'}
movie_titles_df = pd.read_csv("Movie_Id_Titles.csv", sep=",")
movie_titles_df.rename(columns=columns, inplace=True)
# movie_titles_df

movies_df = pd.merge(movie_data_df, movie_titles_df, on="Movie_ID")
movies_df.head(5)

Unnamed: 0,User_ID,User_Name,Movie_ID,Ratings,Timestamp,Movie_title
0,0,Shawn Wilson,50,5,881250949,Star Wars (1977)
1,0,Shawn Wilson,172,5,881250949,"Empire Strikes Back, The (1980)"
2,0,Shawn Wilson,133,1,881250949,Gone with the Wind (1939)
3,196,Bessie White,242,3,881250949,Kolya (1996)
4,196,Bessie White,393,4,881251863,Mrs. Doubtfire (1993)


## Task 3: Explore the Dataset

In [5]:
movies_df.shape

(100003, 6)

In [6]:
movies_df.describe()

Unnamed: 0,User_ID,Movie_ID,Ratings,Timestamp
count,100003.0,100003.0,100003.0,100003.0
mean,462.470876,425.520914,3.529864,883528800.0
std,266.622454,330.797791,1.125704,5343791.0
min,0.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [7]:
# for user_id, user_group in movies_df.groupby("User_ID"):
#     print(user_id, list(user_group['Ratings']), user_group['Ratings'].count())
movies_df.groupby('User_ID')['Ratings'].count().sort_values(ascending=True)#.head()

User_ID
0        3
19      20
636     20
93      20
34      20
      ... 
276    518
450    540
13     636
655    685
405    737
Name: Ratings, Length: 944, dtype: int64

In [8]:
unique_users = pd.unique(movies_df['User_ID']).shape[0]
unique_movies = pd.unique(movies_df['Movie_ID']).shape[0]

print(unique_users, 'users,', unique_movies, 'movies')

944 users, 1682 movies


## Task 4: Create an Interaction Matrix

In [9]:
#   *   movies
# users ratings
ratings = np.zeros((unique_users, unique_movies))
for row in movies_df.itertuples():
    ratings[row[1], row[3]-1] = row[4]
ratings

array([[0., 0., 0., ..., 0., 0., 0.],
       [5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]], shape=(944, 1682))

## Task 5: Explore the Interaction Matrix

In [10]:
non_zero = len(ratings.nonzero())
total = ratings.shape[0] * ratings.shape[1]
sparsity = 1 - (non_zero/total)
sparsity*=100
sparsity

99.99987404018623

## Task 6 : Create a Similarity Matrix

In [11]:
cosine_similarity_ratings = cosine_similarity(ratings)
cosine_similarity_ratings

array([[1.        , 0.11988816, 0.11554032, ..., 0.        , 0.18180857,
        0.11890394],
       [0.11988816, 1.        , 0.16693098, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.11554032, 0.16693098, 1.        , ..., 0.16148478, 0.17226781,
        0.10579788],
       ...,
       [0.        , 0.14861694, 0.16148478, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.18180857, 0.17950788, 0.17226781, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.11890394, 0.39817474, 0.10579788, ..., 0.09511958, 0.18246466,
        1.        ]], shape=(944, 944))

## Task 7: Provide Recommendations

In [12]:
def movie_recommendations(ratings, cosine_similarity_ratings, user_id, k = 10, top = 10):
    
    user_similarities = cosine_similarity_ratings[user_id]
    top_k_simialr_users = ratings.index[user_similarities.argpartition(-k)[-k:]]
    mean_ratings_movies = ratings.loc[top_k_simialr_users].mean(0).sort_values(ascending=False)
    seen_movies = ratings.loc[user_id].gt(0)
    seen_movies = seen_movies.index[seen_movies].tolist()
    mean_ratings_movies = mean_ratings_movies.drop(seen_movies).head(top)
    recommended_movies = mean_ratings_movies.index.to_frame().reset_index(drop=True)
    recommended_movies.rename(columns={recommended_movies.columns[0]: 'Movie_ID'}, inplace=True)
    return recommended_movies


## Task 8: View the Provided Recommendations 

In [13]:
ratings_df = pd.DataFrame(ratings)

In [14]:
user_id = 100
movie_recommendations(ratings_df, cosine_similarity_ratings, user_id)


Unnamed: 0,Movie_ID
0,326
1,306
2,302
3,321
4,330
5,331
6,342
7,747
8,338
9,300


## Task 9: Create Wrapper Function

In [17]:
def movie_recommender_run(user_name):
    user_id = movies_df[movies_df['User_Name'] == user_name]['User_ID'].values[0]
    rec_movies = movie_recommendations(ratings_df, cosine_similarity_ratings, user_id)

    rec_movies = rec_movies.merge(movie_titles_df, how="inner")
    return rec_movies
movie_recommender_run('Bessie White')


Unnamed: 0,Movie_ID,Movie_title
0,99,Snow White and the Seven Dwarfs (1937)
1,434,Forbidden Planet (1956)
2,209,This Is Spinal Tap (1984)
3,215,Field of Dreams (1989)
4,167,Private Benjamin (1980)
5,203,Unforgiven (1992)
6,210,Indiana Jones and the Last Crusade (1989)
7,257,Men in Black (1997)
8,731,"Corrina, Corrina (1994)"
9,55,"Professional, The (1994)"


In [16]:
movies_df

Unnamed: 0,User_ID,User_Name,Movie_ID,Ratings,Timestamp,Movie_title
0,0,Shawn Wilson,50,5,881250949,Star Wars (1977)
1,0,Shawn Wilson,172,5,881250949,"Empire Strikes Back, The (1980)"
2,0,Shawn Wilson,133,1,881250949,Gone with the Wind (1939)
3,196,Bessie White,242,3,881250949,Kolya (1996)
4,196,Bessie White,393,4,881251863,Mrs. Doubtfire (1993)
...,...,...,...,...,...,...
99998,941,Thomas Warren,919,5,875048887,"City of Lost Children, The (1995)"
99999,941,Thomas Warren,273,3,875049038,Heat (1995)
100000,941,Thomas Warren,1,5,875049144,Toy Story (1995)
100001,941,Thomas Warren,294,4,875048532,Liar Liar (1997)
