# Collaborative Filtering Recommendation System

## Import Modules

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics.pairwise import cosine_similarity


## Import Dataset

In [112]:
# Load the raw ratings. Column names are supplied explicitly, so every line of
# the file (including the first) is read as data.
df1 = pd.read_csv(
    filepath_or_buffer="./Movie_data.csv",
    names=["user_id", "username", "item_id", "rating", "timestamp"],
)
# Interpret the integer timestamps as seconds since the Unix epoch.
df1["timestamp"] = pd.to_datetime(df1["timestamp"], unit="s")

# Lookup table read with its own header row; it is joined on "item_id" below.
df2 = pd.read_csv(filepath_or_buffer="./Movie_Id_Titles.csv")

# Attach the title to every rating, then drop the numeric item key.
# validate="many_to_one" makes pandas raise a MergeError if an item_id occurs
# more than once in df2, instead of silently duplicating rating rows.
join_df = (
    df1
    .merge(df2, on="item_id", how="left", validate="many_to_one")
    .drop(columns="item_id")
)

In [109]:
# Print the column dtypes, non-null counts and memory usage of the merged frame.
join_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   user_id    100003 non-null  int64         
 1   username   100003 non-null  object        
 2   rating     100003 non-null  int64         
 3   timestamp  100003 non-null  datetime64[ns]
 4   title      100003 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 3.8+ MB


In [110]:
# Count missing values per column (axis=0 sums down the rows of each column).
join_df.isna().sum(axis=0)

user_id      0
username     0
rating       0
timestamp    0
title        0
dtype: int64

In [111]:
# Preview the first five rows of the merged ratings/titles frame.
join_df.head(n=5)

Unnamed: 0,user_id,username,rating,timestamp,title
0,0,Shawn Wilson,5,1997-12-04 15:55:49,Star Wars (1977)
1,0,Shawn Wilson,5,1997-12-04 15:55:49,"Empire Strikes Back, The (1980)"
2,0,Shawn Wilson,1,1997-12-04 15:55:49,Gone with the Wind (1939)
3,196,Bessie White,3,1997-12-04 15:55:49,Kolya (1996)
4,196,Bessie White,4,1997-12-04 16:11:03,Mrs. Doubtfire (1993)


## Explore Dataset

### Dimensions of the Dataset

In [63]:
# DataFrame.shape is (row count, column count); DataFrame.size is the total cell count.
rows = join_df.shape[0]
cols = join_df.shape[1]
print(f"The shape of the dataset is: {rows} rows × {cols} columns")
print(f"The size of the dataset is: {join_df.size:,}")


The shape of the dataset is: 100003 rows × 5 columns
The size of the dataset is: 500,015


### Statistical Summary of the Dataset

In [67]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating column.
join_df.loc[:, "rating"].describe()

count    100003.000000
mean          3.529864
std           1.125704
min           1.000000
25%           3.000000
50%           4.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

### Number of Ratings and Average Rating given by each User

In [114]:
# Ten usernames with the most ratings, largest count first.
(
    join_df
    .groupby("username")["rating"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)

username
Anna Free           737
Jeanne Maldonado    685
Bradley Butler      636
Mary Cooks          540
Bobbie Reyes        518
June Miyamoto       493
Burt Jean           490
Richard Spelman     484
Ruth Shepherd       480
Charles Haskell     448
Name: rating, dtype: int64

In [118]:
# Mean rating given by each username, highest average first.
(
    join_df
    .groupby("username")["rating"]
    .mean()
    .sort_values(ascending=False)
)

username
Jamie Hinger      4.869565
Justin Snowden    4.833333
Edward Coleman    4.724138
Glenn Claycomb    4.703704
Lee Paterson      4.687500
                    ...   
Frances Hays      2.058036
Alice Bingle      2.050000
Frederick Abdo    1.985185
Anna Free         1.834464
Adela Vignola     1.491954
Name: rating, Length: 944, dtype: float64

### Number of Unique Movies and Users

In [92]:
# Number of distinct titles and distinct user ids in the merged frame.
n_movies = len(join_df["title"].unique())
n_users = len(join_df["user_id"].unique())

print(f"There are {n_movies} unique movies and {n_users} unique users.")

There are 1664 unique movies and 944 unique users.


## Create Interaction Matrix

In [123]:
# Reshape the long ratings table into a wide user_id × title table of ratings.
interaction_df = pd.pivot_table(
    join_df,
    index="user_id",
    columns="title",
    values="rating",
    aggfunc="mean",  # averages the ratings when a (user_id, title) pair occurs more than once
    fill_value=0.0,  # (user_id, title) pairs without a rating are filled with 0.0
)

# Array forms of the column labels, row labels and the rating values themselves.
movies = interaction_df.columns.to_numpy()
users = interaction_df.index.to_numpy()
interaction = interaction_df.to_numpy()

interaction_df

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
