# Collaborative Filtering Recommender Systems

## Imports

In [31]:
from math import sqrt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading Movie Data

In [32]:
# Load the movie catalogue from the CSV next to the notebook and preview it.
movies_df = pd.read_csv(filepath_or_buffer="movies.csv")
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Preprocessing Movie Data

In [33]:
# Pull the four-digit release year out of the "(YYYY)" suffix in one pass:
# the parentheses are matched but only the digits are captured.
movies_df["year"] = movies_df.title.str.extract(r"\((\d{4})\)", expand=False)

# Remove the " (YYYY)" suffix, then strip surrounding whitespace so that any
# stray space left around the title cannot break the exact-title lookups
# (`isin` / `merge` on "title") performed later in the notebook.
movies_df["title"] = (
    movies_df.title
    .str.replace(r" (\(\d{4}\))", "", regex=True)
    .str.strip()
)

# Genres are not used by the collaborative-filtering steps below.
movies_df = movies_df.drop("genres", axis=1)
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


## Reading Rating Data

In [34]:
# Load the user ratings from the CSV next to the notebook and preview them.
ratings_df = pd.read_csv(filepath_or_buffer="ratings.csv")
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


## Preprocessing Rating Data

In [35]:
# The rating timestamp plays no part in the similarity computation; discard it.
ratings_df = ratings_df.drop(columns=["timestamp"])
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


<hr>

# User-Based Collaborative Filtering

## User Data

In [36]:
# Ratings supplied by the user we want recommendations for, as
# (title, rating) pairs; titles are written without the release year.
userInput = [
    {"title": movie_title, "rating": movie_rating}
    for movie_title, movie_rating in (
        ("Breakfast Club, The", 5),
        ("Toy Story", 3.5),
        ("Jumanji", 2),
        ("Pulp Fiction", 5),
        ("Akira", 4.5),
    )
]

# One row per rated movie, with columns "title" and "rating".
input_movies = pd.DataFrame(data=userInput)
input_movies.head(n=5)

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


## Adding Movie ID to Input Data

In [37]:
# Catalogue rows whose title matches one of the titles the user rated.
rated_title_mask = movies_df["title"].isin(input_movies["title"].tolist())
input_movies_data = movies_df.loc[rated_title_mask]

# Join on the shared column(s) to attach movieId to each rating; the release
# year is not needed afterwards.
input_movies = input_movies_data.merge(input_movies).drop(columns="year")

input_movies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


## Finding Users Who Have Watched the Same Movies

In [38]:
# Keep only the ratings given to movies that the input user has also rated.
is_input_movie = ratings_df["movieId"].isin(input_movies["movieId"].to_list())
ratings_subset = ratings_df.loc[is_input_movie]

# Compare the size of the full ratings table with the filtered one.
print(ratings_df.shape, ratings_subset.shape)
ratings_subset.head(n=5)

(22884377, 3) (196623, 3)


Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


## Grouping Rating Subsets by User Id

In [39]:
# One group per user, holding that user's ratings of the input movies.
grouped_ratings_subset = ratings_subset.groupby(["userId"])

test_user_id = 75
# Grouping by a one-element list makes every group key a 1-tuple (e.g. (75,)),
# so the group is looked up with a 1-tuple as well; passing the bare scalar is
# what triggered the pandas warning previously emitted by this cell.
grouped_ratings_subset.get_group((test_user_id,))

  grouped_ratings_subset.get_group(test_user_id)


Unnamed: 0,userId,movieId,rating
7507,75,1,5.0
7508,75,2,3.5
7540,75,296,5.0
7633,75,1274,4.5
7673,75,1968,5.0


## Sorting the Groups (Users Who Share the Most Watched Movies Come First)

In [40]:
# Materialise the (key, frame) pairs, order them by how many of the input
# movies each user rated (largest first, stable for ties), and keep the
# first 100 users.
sorted_grouped_ratings_subset = list(grouped_ratings_subset)
sorted_grouped_ratings_subset.sort(key=lambda pair: pair[1].shape[0], reverse=True)
del sorted_grouped_ratings_subset[100:]

sorted_grouped_ratings_subset[0]

((75,),
       userId  movieId  rating
 7507      75        1     5.0
 7508      75        2     3.5
 7540      75      296     5.0
 7633      75     1274     4.5
 7673      75     1968     5.0)

## Calculating Pearson Correlation

In [62]:
def _pearson_similarity(xs, ys):
    """Return the Pearson correlation of two equal-length, non-empty rating lists.

    Computed as Sxy / sqrt(Sxx * Syy), where
        Sxy = sum(x*y) - sum(x)*sum(y)/n
        Sxx = sum(x^2) - sum(x)^2/n
        Syy = sum(y^2) - sum(y)^2/n

    Returns 0 when either list has no variance (which includes the
    single-rating case), because the coefficient is undefined there.
    For reference, `np.corrcoef(xs, ys)[0, 1]` gives the same coefficient
    but yields nan instead of 0 in the zero-variance case.
    """
    n = float(len(xs))
    sum_x, sum_y = sum(xs), sum(ys)

    Sxy = sum(x * y for x, y in zip(xs, ys)) - (sum_x * sum_y) / n
    Sxx = sum(x ** 2 for x in xs) - (sum_x ** 2) / n
    Syy = sum(y ** 2 for y in ys) - (sum_y ** 2) / n

    # `> 0` instead of `!= 0`: floating-point rounding can leave a tiny
    # negative residue for constant ratings, which would make sqrt() raise.
    if Sxx > 0 and Syy > 0:
        return Sxy / sqrt(Sxx * Syy)
    return 0


# Similarity of each candidate user to the input user, keyed by userId.
pearsonCorrelations = {}

# The input ratings do not depend on the candidate user, so sort them once
# here rather than on every loop iteration.
input_movies = input_movies.sort_values(by="movieId")

for user_id, data in sorted_grouped_ratings_subset:
    # Grouping by a one-element list yields 1-tuple keys: (12,) --> 12
    if isinstance(user_id, tuple):
        user_id = user_id[0]

    # Pair the two users' ratings explicitly by movieId so that each input
    # rating is compared with the candidate's rating of the *same* movie,
    # instead of relying on both lists happening to line up after sorting.
    paired_ratings = input_movies.merge(
        data, on="movieId", suffixes=("_input", "_user")
    )

    # No movie in common: nothing to correlate.
    if paired_ratings.empty:
        continue

    pearsonCorrelations[user_id] = _pearson_similarity(
        paired_ratings["rating_input"].to_list(),
        paired_ratings["rating_user"].to_list(),
    )

## Converting Correlation Data to DF

In [63]:
# Turn the {userId: similarity} mapping into a two-column frame:
# the dict keys arrive as the index and the values as a single column.
pearson_df = pd.DataFrame.from_dict(pearsonCorrelations, orient="index")
pearson_df = pearson_df.set_axis(["similarity"], axis=1)

# Copy the user ids out of the index into their own column, then replace
# the index with a plain 0..n-1 range.
pearson_df = pearson_df.assign(userId=pearson_df.index).reset_index(drop=True)

pearson_df

Unnamed: 0,similarity,userId
0,0.827278,75
1,0.586009,106
2,0.832050,686
3,0.576557,815
4,0.943456,1040
...,...,...
95,0.537086,17854
96,0.877058,17897
97,0.271385,17944
98,0.298381,18301


## Sorting Pearson_DF (Descending)

In [64]:
# Rank users from most to least similar and keep the first 50 rows.
sorted_pearson_df = (
    pearson_df
    .sort_values(by="similarity", ascending=False)
    .iloc[:50]
)
sorted_pearson_df.head(n=5)

Unnamed: 0,similarity,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


## Merging the Selected Users' Similarity Scores with All of Their Ratings

In [68]:
# Attach every rating made by the selected similar users to their similarity
# score (inner join on userId, so only the selected users' ratings remain).
users_ratings_df = sorted_pearson_df.merge(ratings_df, on="userId", how="inner")

# This cell used to display `selected_users_ratings`, a name that no visible
# cell defines (NameError on a fresh Restart & Run All). Bind it to the merged
# frame so both names resolve.
# NOTE(review): keep whichever of the two names later cells actually use.
selected_users_ratings = users_ratings_df

users_ratings_df

Unnamed: 0,similarity,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5
...,...,...,...,...
47235,0.576557,815,146350,2.5
47236,0.576557,815,146656,3.5
47237,0.576557,815,148238,2.5
47238,0.576557,815,148626,3.5


<hr>

# Item-Based Collaborative Filtering