# Recommender system
## Collaborative filtering


In [1]:
# importing the neessary libs
import numpy as np
import pandas as pd
from scipy import sparse

In [2]:
def open_create_list(path, sep, column):
    """Function that opens a file and extract a specific column from it creating a list."""
    var = open(path, 'r', encoding='iso-8859-1')  
    new_list = []
    for l in var:
        l = l.split(sep)
        new_list.append(l[column])
    return new_list

### Creating a DataFrame with the important columns from the files

In [3]:
# Columns from u.data: | user id | item id | rating | timestamp |
userId_data = open_create_list('data/u.data', '\t', 0)
movieId_data = open_create_list('data/u.data', '\t', 1)
rating_data = open_create_list('data/u.data', '\t', 2)

# creating an array with the lists data
array_data = np.array([userId_data, movieId_data, rating_data])

In [7]:
# creating a dataframe with the data
data_df = pd.DataFrame(data = array_data)
data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,196,186,22,244,166,298,115,253,305,6,...,806,676,721,913,378,880,716,276,13,12
1,242,302,377,51,346,474,265,465,451,86,...,421,538,262,209,78,476,204,1090,225,203
2,3,3,1,2,1,4,2,5,3,3,...,4,4,3,2,3,3,5,1,2,3


In [8]:
# transposing the DF and setting the proper column name
data_df = data_df.T
data_df.columns = 'userID movieID rating'.split()
data_df.head()

Unnamed: 0,userID,movieID,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [10]:
# loading the unique movie information
# u.item first 3 columns: | movie id | movie title | release date |
movieId_uniq = open_create_list('data/u.item', '|', 0)
movieName_uniq = open_create_list('data/u.item', '|', 1)
movieDate_uniq = open_create_list('data/u.item', '|', 2)

array_item = np.array([movieId_uniq, movieName_uniq, movieDate_uniq])

In [12]:
# creating a dataframe with the unique movie information 
item_uniq_df = pd.DataFrame(data= array_item)
item_uniq_df = item_uniq_df.T
item_uniq_df.columns = 'movieID Name Date'.split()
item_uniq_df.head()

Unnamed: 0,movieID,Name,Date
0,1,Toy Story (1995),01-Jan-1995
1,2,GoldenEye (1995),01-Jan-1995
2,3,Four Rooms (1995),01-Jan-1995
3,4,Get Shorty (1995),01-Jan-1995
4,5,Copycat (1995),01-Jan-1995


## Mergind DataFrames
- Both dataframes `data_df` and `item_uniq_df` have the column movieID
- That means that the `merge` method will use this to combine the proper lines
- So, we actully add two new columns to the `data_df`: movie Name and movie Date
- Thus, the `data_and_item_df` if generated

In [13]:
data_and_item_df = pd.merge(data_df, item_uniq_df)
data_and_item_df.head()

Unnamed: 0,userID,movieID,rating,Name,Date
0,196,242,3,Kolya (1996),24-Jan-1997
1,63,242,3,Kolya (1996),24-Jan-1997
2,226,242,5,Kolya (1996),24-Jan-1997
3,154,242,3,Kolya (1996),24-Jan-1997
4,306,242,5,Kolya (1996),24-Jan-1997


In [14]:
# checking the shape of the final merged dataframe
data_and_item_df.shape

(100000, 5)

In [16]:
# converting the column "rating" from str ---> int
data_and_item_df['rating'] = pd.to_numeric(data_and_item_df['rating'])

In [17]:
# Checking if there's any NaN
data_and_item_df.isnull().sum()

userID     0
movieID    0
rating     0
Name       0
Date       0
dtype: int64

## Making a matrix of **user x item** (movie)
- Probably not every user has rated evey movie, so we will have some NaN now

In [19]:
matrix_user_item = data_and_item_df.pivot_table(index='userID', columns='Name', values='rating')
matrix_user_item.isnull().sum()

Name
'Til There Was You (1997)                934
1-900 (1994)                             938
101 Dalmatians (1996)                    834
12 Angry Men (1957)                      818
187 (1997)                               902
                                        ... 
Young Guns II (1990)                     899
Young Poisoner's Handbook, The (1995)    902
Zeus and Roxanne (1997)                  937
unknown                                  934
Á köldum klaka (Cold Fever) (1994)       942
Length: 1664, dtype: int64

- With that much NaN values we need to decide a threshold
- The dataset downloaded used as min_ratings = 20 for each user
- So, this would be a fair value to assume here as well
- using `dropna` we pass the ( thresh = 20 , axis = 1 )

In [22]:
matrix_user_item.dropna(thresh=20, axis=1, inplace=True)
matrix_user_item.head()

Name,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wishmaster (1997),With Honors (1994),"Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,5.0,,,3.0,4.0,,,,3.0,...,,,4.0,,5.0,,5.0,3.0,,
10,,5.0,,,,5.0,4.0,,,4.0,...,,,5.0,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
101,3.0,,,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,3.0,...,,,,,,,4.0,,,


In [23]:
# checking the new shape of the matrix df
matrix_user_item.shape

(943, 931)

### Now we can replace the NaN == 0 ...


In [26]:
matrix_user_item = matrix_user_item.fillna(0)

### ... and standardize the values
![image](https://365datascience.com/resources/blog/2018-10-image5-9-1024x591.jpg)

- the ratings are clearly unleveled so we can't compare them before normalizing
- each item has a different mean and number of ratings
- so, if we create a function that standardize the values from each line (user), we will be able to compare them



In [28]:
def standardize(line):
    """Standardize the values from a given line.
    The equation is the Z-score from a normal distribution."""
    new_line = (line - line.mean()) / (line.max() - line.min())
    return new_line

# Applying the function that standardize the values using the mean of each user
matrix_user_item_std = matrix_user_item.apply(standardize)
matrix_user_item_std

Name,101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",8 1/2 (1963),Absolute Power (1997),"Abyss, The (1989)",...,Wishmaster (1997),With Honors (1994),"Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)"
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.332768,0.884836,-0.026299,-0.063627,0.546554,0.581972,-0.050689,-0.030753,-0.090774,0.485048,...,-0.013998,-0.029905,0.587275,-0.038388,0.888229,-0.032874,0.832662,0.531283,-0.025875,-0.029056
10,-0.067232,0.884836,-0.026299,-0.063627,-0.053446,0.781972,0.749311,-0.030753,-0.090774,0.685048,...,-0.013998,-0.029905,0.787275,-0.038388,-0.111771,-0.032874,-0.167338,-0.068717,-0.025875,-0.029056
100,-0.067232,-0.115164,-0.026299,-0.063627,-0.053446,-0.218028,-0.050689,-0.030753,-0.090774,-0.114952,...,-0.013998,-0.029905,-0.212725,-0.038388,-0.111771,-0.032874,-0.167338,-0.068717,-0.025875,-0.029056
101,0.532768,-0.115164,-0.026299,-0.063627,-0.053446,-0.218028,-0.050689,-0.030753,-0.090774,-0.114952,...,-0.013998,-0.029905,-0.212725,-0.038388,-0.111771,-0.032874,-0.167338,-0.068717,-0.025875,-0.029056
102,-0.067232,-0.115164,-0.026299,-0.063627,-0.053446,-0.218028,-0.050689,-0.030753,-0.090774,0.485048,...,-0.013998,-0.029905,-0.212725,-0.038388,-0.111771,-0.032874,0.632662,-0.068717,-0.025875,-0.029056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.067232,0.884836,-0.026299,-0.063627,0.746554,0.381972,-0.050689,-0.030753,-0.090774,-0.114952,...,-0.013998,-0.029905,0.387275,-0.038388,-0.111771,-0.032874,0.632662,0.731283,0.374125,-0.029056
96,-0.067232,-0.115164,-0.026299,-0.063627,-0.053446,-0.218028,-0.050689,-0.030753,-0.090774,-0.114952,...,-0.013998,-0.029905,-0.212725,-0.038388,-0.111771,-0.032874,-0.167338,-0.068717,-0.025875,-0.029056
97,-0.067232,-0.115164,-0.026299,-0.063627,-0.053446,0.781972,-0.050689,-0.030753,-0.090774,-0.114952,...,-0.013998,-0.029905,0.787275,-0.038388,0.888229,-0.032874,0.832662,-0.068717,-0.025875,-0.029056
98,-0.067232,-0.115164,-0.026299,-0.063627,-0.053446,-0.218028,-0.050689,-0.030753,-0.090774,-0.114952,...,-0.013998,-0.029905,-0.212725,-0.038388,-0.111771,-0.032874,-0.167338,-0.068717,-0.025875,-0.029056


In [31]:
matrix_user_item_std.T

userID,1,10,100,101,102,103,104,105,106,107,...,94,940,941,942,943,95,96,97,98,99
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),0.332768,-0.067232,-0.067232,0.532768,-0.067232,-0.067232,-0.067232,-0.067232,-0.067232,-0.067232,...,0.532768,-0.067232,-0.067232,-0.067232,-0.067232,-0.067232,-0.067232,-0.067232,-0.067232,-0.067232
12 Angry Men (1957),0.884836,0.884836,-0.115164,-0.115164,-0.115164,-0.115164,-0.115164,-0.115164,-0.115164,-0.115164,...,-0.115164,-0.115164,-0.115164,-0.115164,-0.115164,0.884836,-0.115164,-0.115164,-0.115164,-0.115164
187 (1997),-0.026299,-0.026299,-0.026299,-0.026299,-0.026299,-0.026299,0.173701,-0.026299,-0.026299,-0.026299,...,-0.026299,-0.026299,-0.026299,-0.026299,-0.026299,-0.026299,-0.026299,-0.026299,-0.026299,-0.026299
2 Days in the Valley (1996),-0.063627,-0.063627,-0.063627,-0.063627,-0.063627,-0.063627,0.536373,-0.063627,-0.063627,-0.063627,...,0.736373,-0.063627,-0.063627,-0.063627,0.336373,-0.063627,-0.063627,-0.063627,-0.063627,-0.063627
"20,000 Leagues Under the Sea (1954)",0.546554,-0.053446,-0.053446,-0.053446,-0.053446,-0.053446,-0.053446,-0.053446,-0.053446,-0.053446,...,-0.053446,-0.053446,-0.053446,-0.053446,-0.053446,0.746554,-0.053446,-0.053446,-0.053446,-0.053446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wyatt Earp (1994),-0.032874,-0.032874,-0.032874,-0.032874,-0.032874,-0.032874,-0.032874,-0.032874,-0.032874,-0.032874,...,-0.032874,-0.032874,-0.032874,-0.032874,0.167126,-0.032874,-0.032874,-0.032874,-0.032874,-0.032874
Young Frankenstein (1974),0.832662,-0.167338,-0.167338,-0.167338,0.632662,-0.167338,-0.167338,-0.167338,-0.167338,-0.167338,...,0.632662,-0.167338,-0.167338,-0.167338,-0.167338,0.632662,-0.167338,0.832662,-0.167338,-0.167338
Young Guns (1988),0.531283,-0.068717,-0.068717,-0.068717,-0.068717,-0.068717,-0.068717,-0.068717,-0.068717,-0.068717,...,0.531283,-0.068717,-0.068717,-0.068717,0.731283,0.731283,-0.068717,-0.068717,-0.068717,0.731283
Young Guns II (1990),-0.025875,-0.025875,-0.025875,-0.025875,-0.025875,-0.025875,-0.025875,-0.025875,-0.025875,-0.025875,...,0.574125,-0.025875,-0.025875,-0.025875,0.574125,0.374125,-0.025875,-0.025875,-0.025875,-0.025875


## Cosine similarity
- after leveling up the users we can apply the:
### 1) ITEM-ITEM similarity


In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Since we are looking for the item-item similarity, we have to transpose the matrix in order to have the movies as lines
matrix_item_item_Cos_array = cosine_similarity(matrix_user_item_std.T)  #-----> the output is an array

# So we create a DataFrame
matrix_item_item_Cos = pd.DataFrame(data=matrix_item_item_Cos_array, 
                                    columns=matrix_user_item_std.columns, 
                                    index=matrix_user_item_std.columns)
matrix_item_item_Cos.shape

(931, 931)

- Since we had 931 movies at the preivous matrix, the shape is correct.
- This `matrix_item_item_Cos` relates the similarity between items, that means, how close is a movie to another movie.

## 2) USER-USER similarity

In [36]:
# Now we want user-user similarity, so we already have the users as lines
matrix_user_user_Cos_array = cosine_similarity(matrix_user_item_std)  #-----> the output is an array

# So we create a DataFrame
matrix_user_user_Cos_array = pd.DataFrame(data=matrix_user_user_Cos_array, 
                                    columns=matrix_user_item_std.index, 
                                    index=matrix_user_item_std.index)
matrix_user_user_Cos_array.shape

(943, 943)

- Now we see that the shape is correct (943 x 943) by relating users similarity