# Collaborative filtering : 

#### - Based on similarity between users 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
movies = pd.read_csv('movies.csv',
                     dtype={'movieId':'int32','title':'str','genres':'str'})

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings = pd.read_csv('ratings.csv',
                      dtype = {'movieID':'int32','userId':'int32','rating':'float32','timestamp':'int32'})

In [5]:
ratings = pd.read_csv('ratings.csv')

In [6]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int32 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int32(1), object(2)
memory usage: 190.4+ KB


## User Based 

In [8]:
movies_id = movies.drop('genres',axis=1)

In [9]:
ratings_user = ratings.drop('timestamp',axis=1)

In [10]:
user_based = ratings_user.merge(movies_id , on = 'movieId')

In [11]:
user_based

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)
...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997)
100832,610,160527,4.5,Sympathy for the Underdog (1971)
100833,610,160836,3.0,Hazard (2005)
100834,610,163937,3.5,Blair Witch (2016)


In [12]:
df = user_based.groupby('movieId').count()['rating'].reset_index().rename(columns={'rating':'Voting_Count'})

In [13]:
df

Unnamed: 0,movieId,Voting_Count
0,1,215
1,2,110
2,3,52
3,4,7
4,5,49
...,...,...
9719,193581,1
9720,193583,1
9721,193585,1
9722,193587,1


In [14]:
mean_rating = ratings_user.groupby('movieId')['rating'].mean().reset_index().rename(columns={'rating':'mean_rating'})

In [15]:
df = df.merge(mean_rating,on='movieId')

In [16]:
df

Unnamed: 0,movieId,Voting_Count,mean_rating
0,1,215,3.920930
1,2,110,3.431818
2,3,52,3.259615
3,4,7,2.357143
4,5,49,3.071429
...,...,...,...
9719,193581,1,4.000000
9720,193583,1,3.500000
9721,193585,1,3.500000
9722,193587,1,3.500000


In [17]:
user_based = user_based.merge(df,on = 'movieId')

In [18]:
user_based

Unnamed: 0,userId,movieId,rating,title,Voting_Count,mean_rating
0,1,1,4.0,Toy Story (1995),215,3.92093
1,5,1,4.0,Toy Story (1995),215,3.92093
2,7,1,4.5,Toy Story (1995),215,3.92093
3,15,1,2.5,Toy Story (1995),215,3.92093
4,17,1,4.5,Toy Story (1995),215,3.92093
...,...,...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997),1,2.50000
100832,610,160527,4.5,Sympathy for the Underdog (1971),1,4.50000
100833,610,160836,3.0,Hazard (2005),1,3.00000
100834,610,163937,3.5,Blair Witch (2016),1,3.50000


In [19]:
treshold = 100

In [20]:
user_based = user_based[user_based['Voting_Count']>treshold] # Not consider movies with num of votes < treshold

In [48]:
user_based.head(50)

Unnamed: 0,userId,movieId,rating,title,Voting_Count,mean_rating
0,1,1,4.0,Toy Story (1995),215,3.92093
1,5,1,4.0,Toy Story (1995),215,3.92093
2,7,1,4.5,Toy Story (1995),215,3.92093
3,15,1,2.5,Toy Story (1995),215,3.92093
4,17,1,4.5,Toy Story (1995),215,3.92093
5,18,1,3.5,Toy Story (1995),215,3.92093
6,19,1,4.0,Toy Story (1995),215,3.92093
7,21,1,3.5,Toy Story (1995),215,3.92093
8,27,1,3.0,Toy Story (1995),215,3.92093
9,31,1,5.0,Toy Story (1995),215,3.92093


In [52]:
user_based.groupby('title').mean().sort_values('Voting_Count',ascending=False).head(50)

Unnamed: 0_level_0,userId,movieId,rating,Voting_Count,mean_rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Forrest Gump (1994),308.161094,356.0,4.164134,329.0,4.164134
"Shawshank Redemption, The (1994)",302.299685,318.0,4.429022,317.0,4.429022
Pulp Fiction (1994),295.18241,296.0,4.197068,307.0,4.197068
"Silence of the Lambs, The (1991)",306.577061,593.0,4.16129,279.0,4.16129
"Matrix, The (1999)",306.604317,2571.0,4.192446,278.0,4.192446
Star Wars: Episode IV - A New Hope (1977),304.717131,260.0,4.231076,251.0,4.231076
Jurassic Park (1993),311.197479,480.0,3.75,238.0,3.75
Braveheart (1995),309.599156,110.0,4.031646,237.0,4.031646
Terminator 2: Judgment Day (1991),306.580357,589.0,3.970982,224.0,3.970982
Schindler's List (1993),315.822727,527.0,4.225,220.0,4.225


In [22]:
#user_based[user_based['title'] == 'Star Wars: Episode IV - A New Hope (1977)']

In [23]:
#list(user_based['title'].unique())

### Creating user-item matrix

In [24]:
user_movie_table = pd.pivot_table(user_based, index = 'userId', columns = 'title' , values = 'rating').fillna(0)

In [25]:
# Use .fillna(0) for Cosine Similarity 
# Don't use it for measuring correlations with df.corr() ::> Considers Non rated Movies as badly rated ones

In [26]:
user_movie_table

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,True Lies (1994),"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,4.0,0.0,0.0,5.0,5.0,0.0,4.0,...,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,5.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
5,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,0.0,0.0,4.0,3.5,4.5,4.5,4.0,1.0,4.5,...,0.0,4.5,4.0,0.0,0.0,4.5,4.0,0.0,0.0,0.0
607,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,3.0
608,3.0,3.5,3.0,4.0,4.5,0.0,5.0,4.0,2.5,3.0,...,3.0,4.5,3.5,3.0,0.0,4.5,0.0,3.0,3.5,4.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0


### Recommendations Based on correlations between movie vector  in userId Space : 

    - Compares a movie as a vector in userId space 

In [27]:
#user_movie_table['Aladdin (1992)'].values

In [28]:
pd.notnull(user_movie_table['Aladdin (1992)']).sum()

597

In [29]:
aladdin = pd.DataFrame(user_movie_table['Aladdin (1992)'].values , columns = ['rating'])

In [45]:
pd.DataFrame(user_movie_table.corr()['Aladdin (1992)']).sort_values('Aladdin (1992)',ascending=False).head(20)

Unnamed: 0_level_0,Aladdin (1992)
title,Unnamed: 1_level_1
Aladdin (1992),1.0
Beauty and the Beast (1991),0.659962
"Lion King, The (1994)",0.606561
True Lies (1994),0.440448
Ace Ventura: Pet Detective (1994),0.435291
Mrs. Doubtfire (1993),0.430947
Batman (1989),0.427146
Batman Forever (1995),0.42677
Jurassic Park (1993),0.424096
Die Hard: With a Vengeance (1995),0.421204


- Suggests The Movie With The Highest Correlation To Aladdin ...

## Cosine similarity 

#### - Use csr Matrix : 


   - A sparse matrix is a matrix in which most of its elements are zero or empty. In other words, it's a matrix where the majority of its entries have a specific value that indicates "absence" or "missing." Sparse matrices are commonly encountered in various fields, such as scientific computing, data analysis, and machine learning, where data is often incomplete or exhibits patterns of sparsity.

- In contrast, a dense matrix is one where most of its entries are non-zero and contain meaningful values. Dense matrices are the familiar kind of matrices you encounter in typical linear algebra computations.

   - Sparse matrices are important in practical applications for several reasons:

        - Efficient Storage: Since sparse matrices contain a lot of zeros or repeated values, storing them as dense matrices would be highly inefficient in terms of memory. Sparse matrix representations allow for more compact storage.

        - Computation Efficiency: Algorithms operating on sparse matrices can take advantage of their structure to perform computations more efficiently. This is especially important when dealing with large-scale data.

        - Speed of Operations: Many mathematical operations on sparse matrices can be accelerated using specialized algorithms that leverage the sparsity pattern. This can lead to faster calculations.

        - Real-World Data: Many real-world datasets exhibit sparsity. For example, user-item interaction matrices in recommendation systems, term-document matrices in natural language processing, and adjacency matrices in graph analysis often have a large number of zero entries.

   - To handle sparse matrices, various data structures and algorithms have been developed. Some common representations of sparse matrices include Compressed Sparse Row (CSR), Compressed Sparse Column (CSC), Coordinate List (COO), and Dictionary of Keys (DOK).

   - In the context of recommendation systems, collaborative filtering, and similar tasks, sparse matrices are frequently used to represent user-item interaction data, where the matrix entries correspond to user ratings or interactions with items. This allows for efficient storage and computation when dealing with large amounts of data and many missing values.

In [31]:
movie_user_table = pd.pivot_table(user_based, index = 'title', columns = 'userId' , values = 'rating').fillna(0)

In [32]:
movie_user_table

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0
Aladdin (1992),0.0,0.0,0.0,4.0,4.0,5.0,3.0,0.0,0.0,4.0,...,0.0,0.0,0.0,3.0,3.5,0.0,0.0,3.0,0.0,0.0
Alien (1979),4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,4.0,3.0,4.0,0.0,4.5
Aliens (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,3.5,0.0,4.5,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Usual Suspects, The (1995)",5.0,0.0,0.0,0.0,4.0,1.0,4.5,5.0,0.0,0.0,...,5.0,5.0,0.0,0.0,0.0,4.5,0.0,4.5,0.0,4.0
WALL·E (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.5
Waterworld (1995),0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,...,0.0,3.0,0.0,3.0,0.0,0.0,3.0,3.0,3.0,0.0
Willy Wonka & the Chocolate Factory (1971),5.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,4.0,0.0,0.0,3.5,0.0,0.0


In [33]:
from scipy.sparse import csr_matrix

In [34]:
movie_user_matrix = csr_matrix(movie_user_table.values)

In [35]:
#print(movie_user_matrix)

#### Use Nearest Neighbors with cosine as a metric to compare movies

##### Model 

In [36]:
from sklearn.neighbors import NearestNeighbors

In [37]:
movie_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')

In [38]:
movie_model.fit(movie_user_matrix)

##### Results

In [53]:
similar_to = 'Godfather, The (1972)'

In [54]:
distances, indices = movie_model.kneighbors(movie_user_table.loc[similar_to].
                                            values.reshape(1, -1), n_neighbors = 6)

In [55]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(similar_to))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, movie_user_table.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Godfather, The (1972):

1: Godfather: Part II, The (1974), with distance of 0.17822735875390605:
2: Goodfellas (1990), with distance of 0.3351590577912471:
3: One Flew Over the Cuckoo's Nest (1975), with distance of 0.3794637499875636:
4: Star Wars: Episode IV - A New Hope (1977), with distance of 0.4046831966011223:
5: Fargo (1996), with distance of 0.41138570937939367:


# LIMITATION  :

# Solution

###### Calculate the magnitude of each item vector
item_magnitudes = np.linalg.norm(user_item_matrix, axis=1)

###### Normalize the user-item matrix by dividing by the item magnitudes
normalized_user_item_matrix = user_item_matrix / item_magnitudes[:, np.newaxis]

###### Initialize NearestNeighbors with cosine similarity
nn = NearestNeighbors(n_neighbors=2, metric='cosine')
nn.fit(normalized_user_item_matrix)

###### Get the indices and distances of nearest neighbors for a specific movie (item)
query_movie_index = 0
distances, indices = nn.kneighbors([normalized_user_item_matrix[query_movie_index]])


### More info :
