<a href="https://colab.research.google.com/github/ankesh86/RecommendationSystems/blob/main/CollaborativeFilteringRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip -q install scikit-surprise

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import random
from IPython.display import Image

In [4]:
#KNN algorithm and csr_matrix for KNN data preparation
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics.pairwise import cosine_similarity

from surprise import Reader, Dataset

from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

from surprise.prediction_algorithms import CoClustering
from surprise.prediction_algorithms import NMF

#for rmse and MAE
from surprise import accuracy


# **Data collection**

In [7]:
data = pd.read_excel('sample_data/Rec_sys_data.xlsx')
data.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850


In [8]:
data.shape

(272404, 9)

# **Data cleaning**

In [9]:
data.isnull().sum().sort_values(ascending=False)

InvoiceNo       0
StockCode       0
Quantity        0
InvoiceDate     0
DeliveryDate    0
Discount%       0
ShipMode        0
ShippingCost    0
CustomerID      0
dtype: int64

In [10]:
data1 = data.dropna()

In [11]:
data1.describe()

Unnamed: 0,InvoiceNo,Quantity,InvoiceDate,DeliveryDate,Discount%,ShippingCost,CustomerID
count,272404.0,272404.0,272404,272404,272404.0,272404.0,272404.0
mean,553740.733319,13.579536,2011-05-16 04:33:17.259658240,2011-05-18 04:33:04.572620288,0.300092,17.053491,15284.323523
min,536365.0,1.0,2010-12-01 08:26:00,2010-12-02 08:26:00,0.0,5.81,12346.0
25%,545312.0,2.0,2011-03-01 13:51:00,2011-03-03 14:53:00,0.15,5.81,13893.0
50%,553902.0,6.0,2011-05-19 18:02:00,2011-05-22 08:52:30,0.3,15.22,15157.0
75%,562457.0,12.0,2011-08-05 11:00:00,2011-08-07 12:05:00,0.45,30.12,16788.0
max,569629.0,74215.0,2011-10-05 11:37:00,2011-10-08 11:37:00,0.6,30.12,18287.0
std,9778.082879,149.136756,,,0.176023,10.01321,1714.478624


In [12]:
data1.StockCode = data1.StockCode.astype(str)

# **User-User collaborative filtering**

## Create data matrix covering purchase history

In [23]:
purchase_df = (data1.groupby(['CustomerID', 'StockCode'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('CustomerID'))
purchase_df.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
12350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0


In [24]:
def encode_units(x):
  if x<1:
    return 0 #not-purchased
  if x==1:
    return 1
  if x>1:
    return 1 #purchased

purchase_df = purchase_df.applymap(encode_units)
purchase_df.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [25]:
user_similarities = cosine_similarity(purchase_df)

user_similarity_data = pd.DataFrame(user_similarities, index=purchase_df.index, columns=purchase_df.index)
user_similarity_data.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.114708,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347,0.0,1.0,0.070632,0.053567,0.048324,0.0,0.029001,0.091885,0.075845,0.0,...,0.041739,0.0,0.050669,0.0,0.036811,0.069843,0.0,0.0,0.087667,0.021253
12348,0.0,0.070632,1.0,0.051709,0.031099,0.0,0.027995,0.118262,0.146427,0.061546,...,0.0,0.0,0.024456,0.0,0.0,0.0,0.0,0.0,0.123091,0.082061
12350,0.0,0.053567,0.051709,1.0,0.035377,0.0,0.0,0.0,0.033315,0.070014,...,0.0,0.0,0.027821,0.0,0.0,0.0,0.0,0.0,0.052511,0.0
12352,0.0,0.048324,0.031099,0.035377,1.0,0.0,0.095765,0.040456,0.10018,0.084215,...,0.110264,0.065233,0.133855,0.0,0.0,0.0,0.0,0.0,0.094742,0.056143


In [30]:

def fetch_similar_users(user_id,k=5):
    # separating df rows for the entered user id
    user = user_similarity_data[user_similarity_data.index == user_id]

    # a df of all other users
    other_users = user_similarity_data[user_similarity_data.index != user_id]

    # calc cosine similarity between user and each other user
    similarities = cosine_similarity(user,other_users)[0].tolist()

    # create list of indices of these users
    indices = other_users.index.tolist()

    # create key/values pairs of user index and their similarity
    index_similarity = dict(zip(indices, similarities))

    # sort by similarity
    index_similarity_sorted = sorted(index_similarity.items(),reverse=True)

    # grab k users off the top
    top_users_similarities = index_similarity_sorted[:k]
    users = [u[0] for u in top_users_similarities]
    print('The users with the behaviour similar to that of user {0} are: '.format(user_id))
    return users

In [34]:
similar_users = fetch_similar_users(12347)
similar_users

The users with the behaviour similar to that of user 12347 are: 


[18287, 18283, 18282, 18281, 18280]

### items bought by similar users

In [35]:
def similar_users_recommendation(userid):
  similar_users = fetch_similar_users(userid)

  #obtaining all items bought by similar users
  similar_users_recommendation_list = []
  for j in similar_users:
    item_list = data1[data1["CustomerID"]==j]['StockCode'].to_list()
    similar_users_recommendation_list.append(item_list)

  #this gives us multi-dimensional list
  flat_list = []
  for sublist in similar_users_recommendation_list:
    for item in sublist:
      flat_list.append(item)
  final_recommendations_list = list(dict.fromkeys(flat_list))

  # storing 10 random recommendation in a list
  ten_random_recommendations = random.sample(final_recommendations_list, 10)

  print('Items bought by Similar users based on Cosine Similarity')

  #returning 10 random recommendations
  return ten_random_recommendations

In [36]:
similar_users_recommendation(12347)

The users with the behaviour similar to that of user 12347 are: 
Items bought by Similar users based on Cosine Similarity


['22896',
 '23344',
 '22747',
 '22180',
 '84997A',
 '21181',
 '22495',
 '82581',
 '21156',
 '22553']

# **Item-to-Item Collaborative Filtering**

In [38]:
items_purchase_df = (data1.groupby(['StockCode','CustomerID'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('StockCode'))
items_purchase_df.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10123C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
#encoding
items_purchase_df = items_purchase_df.applymap(encode_units)

In [40]:
item_similarities = cosine_similarity(items_purchase_df)

In [41]:
item_similarity_data = pd.DataFrame(item_similarities, index=items_purchase_df.index, columns=items_purchase_df.index)
item_similarity_data.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.0,0.0,0.108821,0.091287,0.0,0.0,0.094281,0.062932,0.091902,0.110096,...,0.0,0.0,0.0,0.0,0.0,0.032275,0.0,0.079333,0.0,0.066986
10080,0.0,1.0,0.0,0.0,0.0,0.0,0.043033,0.028724,0.067116,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10120,0.108821,0.0,1.0,0.132453,0.0,0.0,0.068399,0.068483,0.026669,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076739,0.0,0.013885
10123C,0.091287,0.0,0.132453,1.0,0.0,0.0,0.172133,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,1.0,0.288675,0.074536,0.049752,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
def fetch_similar_items(item_id,k=10):
    # separating data rows of the selected item
    item_similarity = item_similarity_data[item_similarity_data.index == item_id]

    # a data of all other items
    other_items_similarities = item_similarity_data[item_similarity_data.index != item_id]

    # calculate cosine similarity between selected item with other items
    similarities = cosine_similarity(item_similarity,other_items_similarities)[0].tolist()

    # create list of indices of these items
    item_indices = other_items_similarities.index.tolist()

    # create key/values pairs of item index and their similarity
    index_similarity_pair = dict(zip(item_indices, similarities))

    # sort by similarity
    sorted_index_similarity_pair = sorted(index_similarity_pair.items())

    # grab k users off the top
    top_k_item_similarities = sorted_index_similarity_pair[:k]
    similar_items = [u[0] for u in top_k_item_similarities]
    print('Similar items based on purchase behaviour for item number {0} are: '.format(item_id))
    return similar_items


In [50]:
similar_items = fetch_similar_items('10002')
similar_items

Similar items based on purchase behaviour for item number 10002 are: 


['10080',
 '10120',
 '10123C',
 '10124A',
 '10124G',
 '10125',
 '10133',
 '10135',
 '11001',
 '15030']

In [51]:
def simular_item_recommendation(userid):

    simular_items_recommendation_list = []

    #obtaining all the similar items to items bought by user
    item_list = data1[data1["CustomerID"]==userid]['StockCode'].to_list()
    for item in item_list:
        similar_items = fetch_similar_items(item)
        simular_items_recommendation_list.append(item_list)

    #this gives us multi-dimensional list
    # we need to flatten it
    flat_list = []
    for sublist in simular_items_recommendation_list:
        for item in sublist:
            flat_list.append(item)
    final_recommendations_list = list(dict.fromkeys(flat_list))

    # storing 10 random recommendations in a list
    ten_random_recommendations = random.sample(final_recommendations_list, 10)

    print('Similar Items bought by our users based on Cosine Similarity')

    #returning 10 random recommendations
    return ten_random_recommendations

In [52]:
simular_item_recommendation(12347)

Similar items based on purchase behaviour for item number 21171 are: 
Similar items based on purchase behaviour for item number 84997C are: 
Similar items based on purchase behaviour for item number 22497 are: 
Similar items based on purchase behaviour for item number 20782 are: 
Similar items based on purchase behaviour for item number 22494 are: 
Similar items based on purchase behaviour for item number 21064 are: 
Similar items based on purchase behaviour for item number 22726 are: 
Similar items based on purchase behaviour for item number 22775 are: 
Similar items based on purchase behaviour for item number 22771 are: 
Similar items based on purchase behaviour for item number 84969 are: 
Similar items based on purchase behaviour for item number 22492 are: 
Similar items based on purchase behaviour for item number 85167B are: 
Similar items based on purchase behaviour for item number 22773 are: 
Similar items based on purchase behaviour for item number 85232D are: 
Similar items bas

['23316',
 '22992',
 '22423',
 '23175',
 '22196',
 '23171',
 '22212',
 '22727',
 '16008',
 '22422']