<a href="https://colab.research.google.com/github/ankesh86/RecommendationSystems/blob/main/CollaborativeFilteringRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install scikit-surprise

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import random
from IPython.display import Image

In [3]:
#KNN algorithm and csr_matrix for KNN data preparation
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics.pairwise import cosine_similarity

from surprise import Reader, Dataset

from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

from surprise.prediction_algorithms import CoClustering
from surprise.prediction_algorithms import NMF

#for rmse and MAE
from surprise import accuracy


# **Data collection**

In [4]:
data = pd.read_excel('sample_data/Rec_sys_data.xlsx')
data.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850


In [5]:
data.shape

(272404, 9)

# **Data cleaning**

In [6]:
data.isnull().sum().sort_values(ascending=False)

InvoiceNo       0
StockCode       0
Quantity        0
InvoiceDate     0
DeliveryDate    0
Discount%       0
ShipMode        0
ShippingCost    0
CustomerID      0
dtype: int64

In [7]:
data1 = data.dropna()

In [8]:
data1.describe()

Unnamed: 0,InvoiceNo,Quantity,InvoiceDate,DeliveryDate,Discount%,ShippingCost,CustomerID
count,272404.0,272404.0,272404,272404,272404.0,272404.0,272404.0
mean,553740.733319,13.579536,2011-05-16 04:33:17.259658240,2011-05-18 04:33:04.572620288,0.300092,17.053491,15284.323523
min,536365.0,1.0,2010-12-01 08:26:00,2010-12-02 08:26:00,0.0,5.81,12346.0
25%,545312.0,2.0,2011-03-01 13:51:00,2011-03-03 14:53:00,0.15,5.81,13893.0
50%,553902.0,6.0,2011-05-19 18:02:00,2011-05-22 08:52:30,0.3,15.22,15157.0
75%,562457.0,12.0,2011-08-05 11:00:00,2011-08-07 12:05:00,0.45,30.12,16788.0
max,569629.0,74215.0,2011-10-05 11:37:00,2011-10-08 11:37:00,0.6,30.12,18287.0
std,9778.082879,149.136756,,,0.176023,10.01321,1714.478624


In [9]:
data1.StockCode = data1.StockCode.astype(str)

# **User-User collaborative filtering**

## Create data matrix covering purchase history

In [10]:
purchase_df = (data1.groupby(['CustomerID', 'StockCode'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('CustomerID'))
purchase_df.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
12350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0


In [11]:
def encode_units(x):
  if x<1:
    return 0 #not-purchased
  if x==1:
    return 1
  if x>1:
    return 1 #purchased

purchase_df = purchase_df.applymap(encode_units)
purchase_df.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [12]:
user_similarities = cosine_similarity(purchase_df)

user_similarity_data = pd.DataFrame(user_similarities, index=purchase_df.index, columns=purchase_df.index)
user_similarity_data.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.114708,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347,0.0,1.0,0.070632,0.053567,0.048324,0.0,0.029001,0.091885,0.075845,0.0,...,0.041739,0.0,0.050669,0.0,0.036811,0.069843,0.0,0.0,0.087667,0.021253
12348,0.0,0.070632,1.0,0.051709,0.031099,0.0,0.027995,0.118262,0.146427,0.061546,...,0.0,0.0,0.024456,0.0,0.0,0.0,0.0,0.0,0.123091,0.082061
12350,0.0,0.053567,0.051709,1.0,0.035377,0.0,0.0,0.0,0.033315,0.070014,...,0.0,0.0,0.027821,0.0,0.0,0.0,0.0,0.0,0.052511,0.0
12352,0.0,0.048324,0.031099,0.035377,1.0,0.0,0.095765,0.040456,0.10018,0.084215,...,0.110264,0.065233,0.133855,0.0,0.0,0.0,0.0,0.0,0.094742,0.056143


In [13]:

def fetch_similar_users(user_id,k=5):
    # separating df rows for the entered user id
    user = user_similarity_data[user_similarity_data.index == user_id]

    # a df of all other users
    other_users = user_similarity_data[user_similarity_data.index != user_id]

    # calc cosine similarity between user and each other user
    similarities = cosine_similarity(user,other_users)[0].tolist()

    # create list of indices of these users
    indices = other_users.index.tolist()

    # create key/values pairs of user index and their similarity
    index_similarity = dict(zip(indices, similarities))

    # sort by similarity
    index_similarity_sorted = sorted(index_similarity.items(),reverse=True)

    # grab k users off the top
    top_users_similarities = index_similarity_sorted[:k]
    users = [u[0] for u in top_users_similarities]
    print('The users with the behaviour similar to that of user {0} are: '.format(user_id))
    return users

In [14]:
similar_users = fetch_similar_users(12347)
similar_users

The users with the behaviour similar to that of user 12347 are: 


[18287, 18283, 18282, 18281, 18280]

### items bought by similar users

In [15]:
def similar_users_recommendation(userid):
  similar_users = fetch_similar_users(userid)

  #obtaining all items bought by similar users
  similar_users_recommendation_list = []
  for j in similar_users:
    item_list = data1[data1["CustomerID"]==j]['StockCode'].to_list()
    similar_users_recommendation_list.append(item_list)

  #this gives us multi-dimensional list
  flat_list = []
  for sublist in similar_users_recommendation_list:
    for item in sublist:
      flat_list.append(item)
  final_recommendations_list = list(dict.fromkeys(flat_list))

  # storing 10 random recommendation in a list
  ten_random_recommendations = random.sample(final_recommendations_list, 10)

  print('Items bought by Similar users based on Cosine Similarity')

  #returning 10 random recommendations
  return ten_random_recommendations

In [16]:
similar_users_recommendation(12347)

The users with the behaviour similar to that of user 12347 are: 
Items bought by Similar users based on Cosine Similarity


['23295',
 '23236',
 '22424',
 '22352',
 '22611',
 '22583',
 '21155',
 '22386',
 '23187',
 '21614']

# **Item-to-Item Collaborative Filtering**

In [17]:
items_purchase_df = (data1.groupby(['StockCode','CustomerID'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('StockCode'))
items_purchase_df.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10123C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
#encoding
items_purchase_df = items_purchase_df.applymap(encode_units)

In [19]:
item_similarities = cosine_similarity(items_purchase_df)

In [20]:
item_similarity_data = pd.DataFrame(item_similarities, index=items_purchase_df.index, columns=items_purchase_df.index)
item_similarity_data.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.0,0.0,0.108821,0.091287,0.0,0.0,0.094281,0.062932,0.091902,0.110096,...,0.0,0.0,0.0,0.0,0.0,0.032275,0.0,0.079333,0.0,0.066986
10080,0.0,1.0,0.0,0.0,0.0,0.0,0.043033,0.028724,0.067116,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10120,0.108821,0.0,1.0,0.132453,0.0,0.0,0.068399,0.068483,0.026669,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076739,0.0,0.013885
10123C,0.091287,0.0,0.132453,1.0,0.0,0.0,0.172133,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,1.0,0.288675,0.074536,0.049752,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
def fetch_similar_items(item_id,k=10):
    # separating data rows of the selected item
    item_similarity = item_similarity_data[item_similarity_data.index == item_id]

    # a data of all other items
    other_items_similarities = item_similarity_data[item_similarity_data.index != item_id]

    # calculate cosine similarity between selected item with other items
    similarities = cosine_similarity(item_similarity,other_items_similarities)[0].tolist()

    # create list of indices of these items
    item_indices = other_items_similarities.index.tolist()

    # create key/values pairs of item index and their similarity
    index_similarity_pair = dict(zip(item_indices, similarities))

    # sort by similarity
    sorted_index_similarity_pair = sorted(index_similarity_pair.items())

    # grab k users off the top
    top_k_item_similarities = sorted_index_similarity_pair[:k]
    similar_items = [u[0] for u in top_k_item_similarities]
    print('Similar items based on purchase behaviour for item number {0} are: '.format(item_id))
    return similar_items


In [22]:
similar_items = fetch_similar_items('10002')
similar_items

Similar items based on purchase behaviour for item number 10002 are: 


['10080',
 '10120',
 '10123C',
 '10124A',
 '10124G',
 '10125',
 '10133',
 '10135',
 '11001',
 '15030']

In [23]:
def simular_item_recommendation(userid):

    simular_items_recommendation_list = []

    #obtaining all the similar items to items bought by user
    item_list = data1[data1["CustomerID"]==userid]['StockCode'].to_list()
    for item in item_list:
        similar_items = fetch_similar_items(item)
        simular_items_recommendation_list.append(item_list)

    #this gives us multi-dimensional list
    # we need to flatten it
    flat_list = []
    for sublist in simular_items_recommendation_list:
        for item in sublist:
            flat_list.append(item)
    final_recommendations_list = list(dict.fromkeys(flat_list))

    # storing 10 random recommendations in a list
    ten_random_recommendations = random.sample(final_recommendations_list, 10)

    print('Similar Items bought by our users based on Cosine Similarity')

    #returning 10 random recommendations
    return ten_random_recommendations

In [24]:
simular_item_recommendation(12347)

Similar items based on purchase behaviour for item number 21171 are: 
Similar items based on purchase behaviour for item number 84997C are: 
Similar items based on purchase behaviour for item number 22497 are: 
Similar items based on purchase behaviour for item number 20782 are: 
Similar items based on purchase behaviour for item number 22494 are: 
Similar items based on purchase behaviour for item number 21064 are: 
Similar items based on purchase behaviour for item number 22726 are: 
Similar items based on purchase behaviour for item number 22775 are: 
Similar items based on purchase behaviour for item number 22771 are: 
Similar items based on purchase behaviour for item number 84969 are: 
Similar items based on purchase behaviour for item number 22492 are: 
Similar items based on purchase behaviour for item number 85167B are: 
Similar items based on purchase behaviour for item number 22773 are: 
Similar items based on purchase behaviour for item number 85232D are: 
Similar items bas

['22196',
 '22728',
 '21035',
 '21041',
 '85232D',
 '21154',
 '84625C',
 '23146',
 '22821',
 '22729']

# **KNN algorithm**

In [25]:
#before passing our sparse matrix (purchase_df) into KNN, it must be converted into a CSR matrix
#CSR divides a sparse matrix into three seperate arrays
#values
#extent of rows
#index of columns

purchase_matrix = csr_matrix(purchase_df.values)

knn_model = NearestNeighbors(metric='euclidean', algorithm='brute')

knn_model.fit(purchase_matrix)

In [26]:
def fetch_similar_users_knn(purchase_df, query_index):

  #creating empty list where we will store user id of similar users
  similar_users_knn = []

  #storing the distances and index of nearest neighbour
  distances, indices = knn_model.kneighbors(purchase_df.iloc[query_index,:].values.reshape(1,-1), n_neighbors=5)
  for i in range(0, len(distances.flatten())):
    if i==0:
      print('Recommendations for {0}:\n'.format(purchase_df.index[query_index]))
    else:
      print('{0}: {1}, with distance of {2}:'.format(i, purchase_df.index[indices.flatten()[i]], distances.flatten()[i]))

      similar_users_knn.append(purchase_df.index[indices.flatten()[i]])
  return similar_users_knn

In [27]:
similar_users_knn = fetch_similar_users_knn(purchase_df, 1497)
similar_users_knn

Recommendations for 14729:

1: 16917, with distance of 8.12403840463596:
2: 16989, with distance of 8.12403840463596:
3: 15124, with distance of 8.12403840463596:
4: 12897, with distance of 8.246211251235321:


[16917, 16989, 15124, 12897]

In [28]:
def knn_recommendation(similar_users_knn):

  #obtaining all the items bought by similar users
  knn_recommendations = []

  for j in similar_users_knn:
    item_list = data1[data1["CustomerID"]==j]['StockCode'].to_list()
    knn_recommendations.append(item_list)

  #this gives us multi-dimentional list
  flat_list = []
  for sublist in knn_recommendations:
    for item in sublist:
      flat_list.append(item)
  final_recommendations_list = list(dict.fromkeys(flat_list))

  #storing 10 random recommendations in a list
  ten_random_recommendations = random.sample(final_recommendations_list, 10)

  print('Items bought by similar users based on KNN')

  #returning 10 random recommendations
  return ten_random_recommendations

In [29]:
knn_recommendation(similar_users_knn)

Items bought by similar users based on KNN


['22917',
 '22469',
 '22918',
 '22605',
 '84978',
 '22487',
 '22470',
 '22919',
 '22920',
 '22921']

# **Collaborative Filtering using Matrix Factorization**

In [30]:
data1.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850


In [31]:
items_purchase_df.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10120,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10123C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10124A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
data3 = items_purchase_df.stack().to_frame()

data3 = data3.reset_index().rename(columns={0:"Quantity"})
data3

Unnamed: 0,StockCode,CustomerID,Quantity
0,10002,12346,0
1,10002,12347,0
2,10002,12348,0
3,10002,12350,0
4,10002,12352,0
...,...,...,...
12903081,POST,18280,0
12903082,POST,18281,0
12903083,POST,18282,0
12903084,POST,18283,0


In [33]:
print(items_purchase_df.shape)
print(data3.shape)

(3538, 3647)
(12903086, 3)


In [34]:
#storing all customer ids in customers
customer_ids = data1['CustomerID']

item_ids = data1['StockCode']

In [42]:
from collections import Counter

#counting number of orders made by each customer
count_orders = Counter(customer_ids)

#storing the count and customer id in a df
customer_count_df = pd.DataFrame.from_dict(count_orders, orient='index').reset_index().rename(columns={0:"Quantity"})

# dropping all customer ids with less than 120 orders
customer_count_df = customer_count_df[customer_count_df["Quantity"]>120]

# renaming the index column as CustomerID for inner join
customer_count_df.rename(columns={'index':'CustomerID'},inplace=True)

In [43]:
customer_count_df.rename(columns = {'index':'CustomerID'}, inplace=True)
customer_count_df

Unnamed: 0,CustomerID,Quantity
0,17850,297
1,13047,140
2,12583,182
6,14688,265
8,15311,1892
...,...,...
3308,14096,1170
3367,16910,261
3392,16360,226
3413,17728,133


In [44]:
#counting no. of times an item was ordered
count_items = Counter(item_ids)

item_count_df = pd.DataFrame.from_dict(count_items, orient='index').reset_index().rename(columns={0:"Quantity"})

item_count_df = item_count_df[item_count_df["Quantity"]>120]

item_count_df.rename(columns={'index':'StockCode'}, inplace=True)
item_count_df

Unnamed: 0,StockCode,Quantity
0,84029E,161
1,71053,220
3,84406B,213
4,22752,229
5,85123A,1606
...,...,...
3295,23294,181
3296,23295,213
3363,23328,129
3373,23356,148


In [45]:
data4 = pd.merge(data3, item_count_df, on='StockCode', how='inner')
data4 = pd.merge(data4, customer_count_df, on='CustomerID', how='inner')
data4

Unnamed: 0,StockCode,CustomerID,Quantity_x,Quantity_y,Quantity
0,10133,12347,0,124,124
1,15036,12347,0,278,124
2,15056BL,12347,0,223,124
3,15056N,12347,0,325,124
4,16156S,12347,0,137,124
...,...,...,...,...,...
385667,85132C,18283,0,127,447
385668,85150,18283,1,264,447
385669,85152,18283,1,466,447
385670,M,18283,1,198,447


In [46]:
# dropping columns which are not necessary
data4.drop(['Quantity_x','Quantity'],axis=1,inplace=True)
data4.rename(columns={'Quantity_y': 'Quantity'}, inplace=True)

data4

Unnamed: 0,StockCode,CustomerID,Quantity
0,10133,12347,124
1,15036,12347,278
2,15056BL,12347,223
3,15056N,12347,325
4,16156S,12347,137
...,...,...,...
385667,85132C,18283,127
385668,85150,18283,264
385669,85152,18283,466
385670,M,18283,198


In [47]:
data4.describe()

Unnamed: 0,CustomerID,Quantity
count,385672.0,385672.0
mean,15360.985915,263.858616
std,1719.468125,173.130031
min,12347.0,121.0
25%,13996.25,156.0
50%,15413.0,207.0
75%,16840.0,299.0
max,18283.0,1606.0


### **Changing it to convert it into format supported by surprise library**

In [48]:
reader = Reader(rating_scale=(0,5095))
formated_data = Dataset.load_from_df(data4, reader)

### **Test-train split**

In [49]:
train_set, test_set = train_test_split(formated_data, test_size=0.2)

## **Implementing non-negative matrix factorization (NMF)**

In [50]:
#defining model
algo1 = NMF()

#model fitting
algo1.fit(train_set)

#model prediction
pred1 = algo1.test(test_set)

In [51]:
#RMSE
accuracy.rmse(pred1)


RMSE: 306.6645


306.66453067392393

In [52]:
#MAE
accuracy.mae(pred1)

MAE:  256.5153


256.51526011960857

In [53]:
cross_validate(algo1, formated_data, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    305.1789306.4275308.4641307.7723308.3027307.22911.2505  
MAE (testset)     255.8097256.7288257.5194257.4245256.8738256.87130.6121  
Fit time          9.55    8.53    9.18    7.72    8.53    8.70    0.63    
Test time         0.42    0.56    0.85    0.65    0.63    0.62    0.14    


{'test_rmse': array([305.17889474, 306.42748403, 308.46411083, 307.77228369,
        308.30274441]),
 'test_mae': array([255.80974729, 256.728836  , 257.51944731, 257.42445795,
        256.87377812]),
 'fit_time': (9.554152488708496,
  8.528066158294678,
  9.178609371185303,
  7.7181618213653564,
  8.530353546142578),
 'test_time': (0.42259836196899414,
  0.5613608360290527,
  0.8458287715911865,
  0.6483068466186523,
  0.6262702941894531)}

## **Implementing Co-clustering**

Co-clustering (also known as bi-clustering) is commonly used in collaborative filtering.
It is a data-mining technique that simultaneously clusters the columns and rows of a DataFrame/matrix. It differs from normal clustering, where each object is checked for similarity with other objects based on a single entity/type of comparison. As in co-clustering, you check for co-grouping of two different entities/types of comparison for each object simultaneously as a pairwise interaction.

In [54]:
#defining the model
algo2 = CoClustering()

#model fitting
algo2.fit(train_set)

#model prediction
pred2 = algo2.test(test_set)

In [55]:
#RMSE
accuracy.rmse(pred2)

RMSE: 3.4657


3.4656604195773144

In [56]:
#MAE
accuracy.mae(pred2)

MAE:  2.7400


2.739975947698709

In [57]:
cross_validate(algo2, formated_data, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4315  3.3867  3.3980  3.2424  3.5216  3.3960  0.0902  
MAE (testset)     2.7685  2.6612  2.6944  2.5625  2.7800  2.6933  0.0791  
Fit time          8.37    8.06    8.45    8.38    7.84    8.22    0.23    
Test time         0.61    0.63    0.65    0.62    0.35    0.57    0.11    


{'test_rmse': array([3.43154724, 3.38668264, 3.39795288, 3.24237815, 3.52156079]),
 'test_mae': array([2.76853701, 2.6611624 , 2.6944261 , 2.56247872, 2.77996929]),
 'fit_time': (8.37226915359497,
  8.05650281906128,
  8.451234102249146,
  8.375350952148438,
  7.8407862186431885),
 'test_time': (0.6147904396057129,
  0.6335086822509766,
  0.6534829139709473,
  0.6155867576599121,
  0.3517436981201172)}

## **Implementing SVD**

 Singular value decomposition is a linear algebra concept generally used as a
dimensionality reduction method. It is also a type of matrix factorization. It works similarly in collaborative filtering, where a matrix with rows and columns as users and items is reduced further into latent feature matrixes. An error equation is minimized to get to the prediction.

In [59]:
#defining the model
from surprise import SVD
algo3 = SVD()

#model fitting
algo3.fit(train_set)

#model_prediction
pred3 = algo3.test(test_set)

In [60]:
#RMSE
accuracy.rmse(pred3)

RMSE: 4834.6170


4834.61698761993

In [61]:
#MAE
accuracy.mae(pred3)

MAE:  4831.5347


4831.5347248330845

In [62]:
cross_validate(algo3, formated_data, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4834.43614834.16974833.74704834.40844834.45164834.24250.2682  
MAE (testset)     4831.33194831.07504830.59054831.30014831.40944831.14140.2971  
Fit time          6.10    6.37    5.43    6.29    5.88    6.01    0.34    
Test time         0.81    0.79    0.42    0.79    1.16    0.79    0.24    


{'test_rmse': array([4834.43608825, 4834.16968419, 4833.74695568, 4834.40838869,
        4834.45158081]),
 'test_mae': array([4831.33187269, 4831.07495949, 4830.5905178 , 4831.30014002,
        4831.40943034]),
 'fit_time': (6.099200248718262,
  6.366933107376099,
  5.43262505531311,
  6.2944254875183105,
  5.880600690841675),
 'test_time': (0.8106651306152344,
  0.79245924949646,
  0.41626572608947754,
  0.7864019870758057,
  1.1639022827148438)}

# **Printing recommendation**

In [63]:
data1[(data1['StockCode']=='47590B') & (data1['CustomerID']==15738)].Quantity.sum()

78

In [68]:
algo2.test([['47590B',15738,78]])

[Prediction(uid='47590B', iid=15738, r_ui=78, est=307.57207442843344, details={'was_impossible': False})]

In [69]:
pred2

[Prediction(uid='22720', iid=15356, r_ui=889.0, est=889.0775937761407, details={'was_impossible': False}),
 Prediction(uid='22997', iid=13458, r_ui=133.0, est=133.09196136507342, details={'was_impossible': False}),
 Prediction(uid='22348', iid=17389, r_ui=190.0, est=189.48914497587026, details={'was_impossible': False}),
 Prediction(uid='23172', iid=16360, r_ui=178.0, est=182.46354302124183, details={'was_impossible': False}),
 Prediction(uid='22189', iid=16686, r_ui=350.0, est=350.23979922412224, details={'was_impossible': False}),
 Prediction(uid='21733', iid=16014, r_ui=478.0, est=475.8232681018149, details={'was_impossible': False}),
 Prediction(uid='22846', iid=14096, r_ui=195.0, est=186.25200354547576, details={'was_impossible': False}),
 Prediction(uid='22938', iid=16225, r_ui=138.0, est=134.85648115741606, details={'was_impossible': False}),
 Prediction(uid='23182', iid=16984, r_ui=210.0, est=208.00013787383398, details={'was_impossible': False}),
 Prediction(uid='23158', iid=1

In [71]:
predictions_data = pd.DataFrame(pred2, columns=['item_id', 'customer_id', 'quantity', 'prediction', 'details'])

In [72]:
def get_item_orders(user_id):
  try:
    return len(train_set.ur[train_set.to_inner_uid(user_id)])
  except ValueError:
    return 0

def get_customer_orders(item_id):
  try:
    return len(train_set.ir[train_set.to_inner_iid(item_id)])
  except ValueError:
    return 0

predictions_data['item_orders'] = predictions_data.item_id.apply(get_item_orders)
predictions_data['customer_orders'] = predictions_data.customer_id.apply(get_customer_orders)

predictions_data['error'] = abs(predictions_data.prediction - predictions_data.quantity)

predictions_data

Unnamed: 0,item_id,customer_id,quantity,prediction,details,item_orders,customer_orders,error
0,22720,15356,889.0,889.077594,{'was_impossible': False},455,550,0.077594
1,22997,13458,133.0,133.091961,{'was_impossible': False},460,552,0.091961
2,22348,17389,190.0,189.489145,{'was_impossible': False},453,538,0.510855
3,23172,16360,178.0,182.463543,{'was_impossible': False},446,566,4.463543
4,22189,16686,350.0,350.239799,{'was_impossible': False},465,554,0.239799
...,...,...,...,...,...,...,...,...
77130,21914,15786,282.0,275.838338,{'was_impossible': False},447,552,6.161662
77131,22776,15984,372.0,376.482387,{'was_impossible': False},450,544,4.482387
77132,22751,16340,150.0,152.068183,{'was_impossible': False},438,557,2.068183
77133,21539,16477,201.0,200.084640,{'was_impossible': False},458,553,0.915360


In [73]:
best_predictions = predictions_data.sort_values(by='error')[:10]
best_predictions

Unnamed: 0,item_id,customer_id,quantity,prediction,details,item_orders,customer_orders,error
24424,85099C,16764,515.0,514.992047,{'was_impossible': False},451,549,0.007953
9241,22961,16764,717.0,716.992047,{'was_impossible': False},442,549,0.007953
46,85099F,16764,540.0,539.992047,{'was_impossible': False},455,549,0.007953
73592,22960,16764,717.0,716.992047,{'was_impossible': False},445,549,0.007953
62869,82482,16764,696.0,695.992047,{'was_impossible': False},462,549,0.007953
33202,23206,16764,699.0,698.992047,{'was_impossible': False},450,549,0.007953
13465,21931,16764,611.0,610.992047,{'was_impossible': False},446,549,0.007953
53572,21181,16764,544.0,543.992047,{'was_impossible': False},452,549,0.007953
29260,22457,16764,754.0,753.992047,{'was_impossible': False},459,549,0.007953
36106,22663,16764,252.0,251.992047,{'was_impossible': False},454,549,0.007953


In [74]:
worst_predictions = predictions_data.sort_values(by='error')[-10:]
worst_predictions

Unnamed: 0,item_id,customer_id,quantity,prediction,details,item_orders,customer_orders,error
42908,84946,16326,428.0,439.732187,{'was_impossible': False},448,534,11.732187
20285,21165,16326,293.0,304.732187,{'was_impossible': False},443,534,11.732187
67486,22607,16326,256.0,267.732187,{'was_impossible': False},454,534,11.732187
51330,22835,16326,254.0,265.732187,{'was_impossible': False},442,534,11.732187
12459,22502,16326,239.0,250.732187,{'was_impossible': False},441,534,11.732187
76699,22499,16326,272.0,283.732187,{'was_impossible': False},447,534,11.732187
18392,21169,16326,246.0,257.732187,{'was_impossible': False},471,534,11.732187
21217,22804,16326,345.0,356.732187,{'was_impossible': False},456,534,11.732187
64284,21889,16326,244.0,255.732187,{'was_impossible': False},458,534,11.732187
24157,22726,16326,537.0,548.732187,{'was_impossible': False},467,534,11.732187


In [75]:
#getting item list for user 12347

item_list = predictions_data[predictions_data['customer_id']==12347]['item_id'].values.tolist()
item_list

['22556',
 '21035',
 '21500',
 '22327',
 '22768',
 '23172',
 '22502',
 '21078',
 '21670',
 '22418',
 '21976',
 '21967',
 '84030E',
 '23032',
 '22467',
 '48187',
 '21673',
 '21523',
 '23170',
 '85099C',
 '22385',
 '23169',
 '22371',
 '21899',
 '22690',
 '22193',
 '84378',
 '37449',
 '22147',
 '84596B',
 '22741',
 '22938',
 '21876',
 '47591D',
 '22620',
 '21733',
 '22427',
 '21124',
 '84755',
 '21484',
 '22692',
 '20677',
 '15056N',
 '20972',
 '23237',
 '20723',
 '20717',
 '22925',
 '21936',
 '22927',
 '21718',
 '23159',
 '22728',
 '16161P',
 '84978',
 '84997C',
 '21658',
 '21982',
 '22952',
 '22553',
 '22720',
 '22508',
 '21914',
 '85132C',
 '21878',
 '21179',
 '22991',
 '23148',
 '48111',
 '22649',
 '22435',
 '22384',
 '22855',
 '20728',
 '21212',
 '21892',
 '37446',
 '21912',
 '23168',
 '82486',
 '21931',
 '21240',
 '22716',
 '84987',
 '21260',
 '85123A',
 '22219',
 '85053',
 '21238',
 '22670',
 '22083',
 '22672',
 '21165',
 '22139',
 '21901',
 '21715',
 '22645',
 '22585',
 '23232',
 

In [76]:
 customer_list = predictions_data[predictions_data['item_id'].isin(item_list)]['customer_id'].values
 customer_list = np.unique(customer_list).tolist()
 customer_list

[12347,
 12359,
 12362,
 12370,
 12378,
 12415,
 12417,
 12428,
 12431,
 12433,
 12444,
 12449,
 12451,
 12471,
 12472,
 12474,
 12476,
 12477,
 12481,
 12484,
 12490,
 12501,
 12502,
 12517,
 12520,
 12539,
 12540,
 12553,
 12567,
 12583,
 12621,
 12626,
 12627,
 12637,
 12662,
 12668,
 12681,
 12682,
 12683,
 12688,
 12700,
 12705,
 12708,
 12709,
 12714,
 12720,
 12721,
 12731,
 12743,
 12744,
 12748,
 12749,
 12753,
 12757,
 12766,
 12836,
 12839,
 12841,
 12867,
 12921,
 12949,
 12957,
 12971,
 13001,
 13004,
 13013,
 13018,
 13047,
 13050,
 13069,
 13078,
 13081,
 13089,
 13093,
 13097,
 13098,
 13102,
 13113,
 13124,
 13137,
 13139,
 13141,
 13148,
 13174,
 13178,
 13184,
 13198,
 13209,
 13230,
 13232,
 13263,
 13266,
 13267,
 13268,
 13269,
 13285,
 13317,
 13319,
 13334,
 13381,
 13408,
 13418,
 13448,
 13451,
 13458,
 13468,
 13488,
 13505,
 13507,
 13527,
 13534,
 13548,
 13555,
 13571,
 13593,
 13610,
 13634,
 13668,
 13694,
 13700,
 13709,
 13742,
 13764,
 13767,
 13777,


In [77]:
# filtering those customers from predictions data
filtered_data = predictions_data[predictions_data['customer_id'].isin(customer_list)]

# removing the items already bought
filtered_data = filtered_data[~filtered_data['item_id'].isin(item_list)]

# getting the top items (prediction)
recommended_items = filtered_data.sort_values('prediction',ascending=False).reset_index(drop=True).head(10)['item_id'].values.tolist()
recommended_items

['22423',
 '22423',
 '22423',
 '22423',
 '22423',
 '22423',
 '22423',
 '22423',
 '22423',
 '22423']