# Model Building -> Collaborative Filtering

### Importing Data

In [10]:
import pandas as pd

# Load the training interactions; 'Timestamp' is parsed to datetime at read time.
train_data = pd.read_csv(
  './../data/train_data.csv',
  parse_dates=['Timestamp'],
)

In [11]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
train_data.info()
print(train_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83726 entries, 0 to 83725
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   UserId                83726 non-null  int64         
 1   ProductId             83726 non-null  int64         
 2   Rating                83726 non-null  float64       
 3   Timestamp             83726 non-null  datetime64[ns]
 4   user_interactions     83726 non-null  float64       
 5   product_interactions  83726 non-null  float64       
 6   Recency               83726 non-null  int64         
 7   User_Avg_Rating       83726 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(3)
memory usage: 5.1 MB
None
   UserId  ProductId  Rating  Timestamp  user_interactions  \
0    1880      17471     5.0 2014-05-27           0.029661   
1    8943      13578     1.0 2012-10-14           0.029661   
2    6579      12061     5.0 2012-07-07           0.

### Item-Based Collaborative Filtering

#### Create User-Item Interaction Matrix

In [12]:
# Reshape the long ratings table into a wide matrix: one row per UserId, one
# column per ProductId, each cell holding the Rating (NaN where none exists).
# Note: pivot() raises ValueError if a (UserId, ProductId) pair appears twice.
user_item_matrix = train_data.pivot(
  index='UserId',
  columns='ProductId',
  values='Rating',
)
print(user_item_matrix.head())

ProductId  1      3      4      5      6      8      10     11     12     \
UserId                                                                     
0            NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
1            NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
2            NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
3            NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
4            NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   

ProductId  13     ...  29141  29142  29143  29144  29145  29146  29148  29149  \
UserId            ...                                                           
0            NaN  ...    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
1            NaN  ...    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
2            NaN  ...    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
3            NaN  ...    NaN    NaN    NaN    NaN    NaN    Na

#### Addressing Sparsity

In [13]:
# Replace missing ratings with 0 so every cell is numeric; later cells treat a
# 0 entry as "this user has not interacted with this product".
user_item_matrix = user_item_matrix.fillna(value=0)
print(user_item_matrix.head())

ProductId  1      3      4      5      6      8      10     11     12     \
UserId                                                                     
0            0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1            0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2            0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3            0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4            0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

ProductId  13     ...  29141  29142  29143  29144  29145  29146  29148  29149  \
UserId            ...                                                           
0            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.

In [14]:
# Sparsity = share of (user, product) cells with no rating. Non-zero cells are
# counted per user and then totalled, and compared with the matrix size.
interactions_per_user = user_item_matrix.astype(bool).sum(axis=1)
density = interactions_per_user.sum() / user_item_matrix.size
sparsity = 1.0 - density
print(f'Sparsity: {sparsity * 100:.2f}%')
print(user_item_matrix.shape)

Sparsity: 99.97%
(12419, 22957)


In [15]:
# Keep only users (rows) and products (columns) that have at least
# `min_interactions` ratings.
min_interactions = 1

# Count non-zero cells rather than summing rating values: a sum would let a
# single 5-star rating satisfy min_interactions = 5. A cell counts as an
# interaction when it is non-zero, matching the sparsity calculation and the
# "== 0 means not interacted" convention used when recommending.
has_interaction = user_item_matrix != 0

# Both masks are computed on the unfiltered matrix (single pass), so a product
# may end up below the threshold once low-activity users are removed.
user_item_matrix = user_item_matrix.loc[
  has_interaction.sum(axis=1) >= min_interactions,
  has_interaction.sum(axis=0) >= min_interactions
]
print(user_item_matrix.shape)

(12419, 22957)


#### Compute Similarity

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Item-item cosine similarity: transpose so that each product becomes a row
# vector of user ratings, then label both axes of the result with product ids.
# The result is a dense (n_products x n_products) array.
product_ids = user_item_matrix.columns
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=product_ids, columns=product_ids)

print(item_similarity_df.head())

ProductId  1      3      4      5      6      8      10     11     12     \
ProductId                                                                  
1            1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3            0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4            0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0    0.0   
5            0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0    0.0   
6            0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0    0.0   

ProductId  13     ...  29141  29142  29143  29144  29145  29146  29148  29149  \
ProductId         ...                                                           
1            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
5            0.0  ...    0.0    0.0    0.0    0.0    0.0    0.

#### Generate Recommendations

In [17]:
# creating function to recommend top-N items based on the highest similarity scores

def recommend_items(user_id, user_item_matrix, item_similarity_df, n=5):
  """Recommend the top-N items a user has not interacted with yet.

  An item's score is the dot product of the user's rating vector with that
  item's similarity column, so items similar to highly rated ones rank first.

  Args:
    user_id: Row label of the user in ``user_item_matrix``.
    user_item_matrix: DataFrame of ratings (rows = users, columns = items) in
      which 0 marks an item the user has not interacted with.
    item_similarity_df: Square item-item similarity DataFrame labelled with
      item ids on both axes.
    n: Maximum number of items to return.

  Returns:
    List of item ids ordered by descending score.

  Raises:
    KeyError: If ``user_id`` is not a row of ``user_item_matrix``.
    ValueError: If ``item_similarity_df`` lacks an item that is a column of
      ``user_item_matrix``.
  """
  if user_id not in user_item_matrix.index:
    raise KeyError(f'user_id {user_id!r} not found in user_item_matrix')

  #fetch user's interactions data
  user_ratings = user_item_matrix.loc[user_id]
  item_ids = user_ratings.index

  # np.dot multiplies by position, not by label, so the similarity matrix must
  # list items in exactly the same order as the rating vector. Re-align only
  # when needed to avoid copying a large matrix on every call.
  already_aligned = (
    item_ids.equals(item_similarity_df.index)
    and item_ids.equals(item_similarity_df.columns)
  )
  if not already_aligned:
    try:
      item_similarity_df = item_similarity_df.loc[item_ids, item_ids]
    except KeyError as exc:
      raise ValueError(
        'item_similarity_df is missing items present in user_item_matrix'
      ) from exc

  #calculate scores by multiplying ratings with item similarity
  scores = pd.Series(np.dot(user_ratings, item_similarity_df), index=item_ids)

  #recommend top-N items the user hasn't interacted with
  top_items = scores[user_ratings == 0].sort_values(ascending=False).head(n)
  return top_items.index.tolist()

# Smoke test: print the default top-5 recommendations for the user with id 0.
sample_user_id = 0
print(recommend_items(sample_user_id, user_item_matrix, item_similarity_df))


[17066, 26268, 21917, 8835, 19444]


### Exporting Data 

In [18]:
# Persist the filtered user-item matrix (CSV) and the item-item similarity
# matrix (pickle) for later use.
# NOTE(review): index=False omits the UserId row labels from the CSV, so a
# reader gets a positional 0..n-1 index instead — confirm downstream code does
# not look users up by UserId before relying on this file.
user_item_matrix.to_csv(
  './../data/user_item_matrix.csv',
  index=False,
)
item_similarity_df.to_pickle(
  './../data/item_similarity_matrix.pkl',
)