In [28]:
import numpy as np
import scipy
import seaborn as sns
import pandas as pd
import math
import random
import sklearn
import nltk
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [2]:
tx_data = pd.read_csv('OnlineRetail.csv', encoding = 'unicode_escape')
tx_data.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
tx_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


# Cleaning and Preprocessing

In [4]:
tx_data['StockCode'] = tx_data['StockCode'].astype(str)
tx_data['CustomerID'] = tx_data['CustomerID'].astype(str)
tx_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   541909 non-null  object 
 7   Country      541909 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 33.1+ MB


In [5]:
tx_data.isnull().sum()

InvoiceNo         0
StockCode         0
Description    1454
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID        0
Country           0
dtype: int64

In [6]:
tx_data = tx_data.dropna()
tx_data.shape

(540455, 8)

In [7]:
tx_data = tx_data[tx_data['Quantity'] > 0]
tx_data = tx_data[~(tx_data['CustomerID'] == 'nan')]
#tx_data = tx_data[tx_data['UnitPrice'] < 200]
not_product = ['Manual', 'POSTAGE', 'DOTCOM POSTAGE', 'BANK CHARGES', 'C2', 'PADS']
tx_data = tx_data[~(tx_data['Description'].isin(not_product))]
df_uk = tx_data[tx_data['Country'] == 'United Kingdom']
df_uk.shape

(354051, 8)

In [8]:
df_uk.duplicated().any()

True

In [9]:
df_uk = df_uk.drop_duplicates()
df_uk.shape

(348938, 8)

In [10]:
#filtering customer come from outside UK
df_uk['InvoiceDate'] = pd.to_datetime(df_uk['InvoiceDate'])
df_uk['Description'] = df_uk['Description'].apply(lambda x: str(x).lower())
df_uk.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [11]:
#defining price band simply as dividing it into 4 quartiles
def determine_outlier_thresholds_iqr(data, col_name, th1=0.25, th3=0.75):
    quartile1 = data[col_name].quantile(th1)
    quartile3 = data[col_name].quantile(th3)
    iqr = quartile3 - quartile1
    upper_limit = quartile3 + 1.5 * iqr
    lower_limit = quartile1 - 1.5 * iqr
    return lower_limit, upper_limit

df_full_clean = df_uk[(df_uk['UnitPrice'] > determine_outlier_thresholds_iqr(df_uk, 'UnitPrice')[0]) | (df_uk['UnitPrice'] < determine_outlier_thresholds_iqr(df_uk, 'UnitPrice')[1])]
df_full_clean['price_band'] = pd.qcut(df_full_clean['UnitPrice'], q = 4, labels = [1, 2, 3, 4])
df_full_clean

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,price_band
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,3
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,3
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,3
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,3
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,3
...,...,...,...,...,...,...,...,...,...
541889,581585,22466,fairy tale cottage night light,12,2011-12-09 12:31:00,1.95,15804.0,United Kingdom,2
541890,581586,22061,large cake stand hanging strawbery,8,2011-12-09 12:49:00,2.95,13113.0,United Kingdom,3
541891,581586,23275,set of 3 hanging owls ollie beak,24,2011-12-09 12:49:00,1.25,13113.0,United Kingdom,1
541892,581586,21217,red retrospot round cake tins,24,2011-12-09 12:49:00,8.95,13113.0,United Kingdom,4


In [26]:
def smooth_user_preference(x):
    return math.log(1+x, 2)

df_x = df_full_clean.groupby(['CustomerID', 'StockCode'])['price_band'].sum().apply(smooth_user_preference) \
                                                                                                    .reset_index()
df_x.head()

Unnamed: 0,CustomerID,StockCode,price_band
0,12346.0,23166,1.0
1,12747.0,20711,1.584963
2,12747.0,21136,3.70044
3,12747.0,21745,3.70044
4,12747.0,21754,4.392317


In [27]:
print('Total User: {}\nTotal Product: {}'.format(len(df_full_clean['CustomerID'].unique()), 
                                                  len(df_full_clean['StockCode'].unique())))

Total User: 3918
Total Product: 3642


# Content-Based Recommendation

In [21]:
stopwords_list = nltk.corpus.stopwords.words('english')

vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

item_ids = df_full_clean['StockCode'].tolist()
tfidf_matrix = vectorizer.fit_transform(df_full_clean['Description'])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<348938x399 sparse matrix of type '<class 'numpy.float64'>'
	with 1393500 stored elements in Compressed Sparse Row format>

In [22]:
def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    user_item_profiles = get_item_profiles(pd.Series(interactions_person_df['StockCode']))
    
    user_item_strengths = np.array(interactions_person_df['price_band']).reshape(-1,1)
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_item_strengths_weighted_avg = np.nan_to_num(user_item_strengths_weighted_avg)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_users_profiles(): 
    interactions_indexed_df = df_x[df_x['StockCode'].isin(df_full_clean['StockCode'])].set_index('CustomerID')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
    return user_profiles

In [23]:
user_profiles = build_users_profiles()
len(user_profiles)

3918

In [24]:
myprofile = user_profiles['18287.0']
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles['18287.0'].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 399)


Unnamed: 0,token,relevance
0,set,0.379766
1,lights,0.268807
2,christmas,0.249354
3,babushka,0.227764
4,tree,0.220371
5,pink,0.199898
6,flower,0.19811
7,butterfly,0.197622
8,boxes,0.193288
9,cream,0.163917


In [25]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        #user_id = input("Reading User ID: ")
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['StockCode', 'recStrength'])
        recommendations_df = recommendations_df.drop_duplicates(subset = 'StockCode')
        df_item_unique = self.items_df.drop_duplicates(subset = 'StockCode')

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendation_df = recommendations_df.merge(df_item_unique, how = 'left', 
                                                          left_on = 'StockCode', 
                                                          right_on = 'StockCode')[['recStrength', 'StockCode', 'Description']]

        #print('Recommendation for you: ')
            return recommendation_df
    
content_based_recommender_model = ContentBasedRecommender(df_full_clean)

In [26]:
content_based_recommender_model.recommend_items('18287.0', topn=10000, verbose = True)

Unnamed: 0,recStrength,StockCode,Description
0,0.489771,20878,set/9 christmas t-lights scented
1,0.487865,72351B,set/6 pink butterfly t-lights
2,0.445049,72351A,set/6 turquoise butterfly t-lights
3,0.445049,72349B,set/6 purple butterfly t-lights
4,0.421692,22810,set of 6 t-lights snowmen
5,0.421692,72586,set of 6 halloween ghost t-lights
6,0.421692,22809,set of 6 t-lights santa
7,0.421692,85231G,orange scented set/9 t-lights
8,0.421692,22807,set of 6 t-lights toadstools
9,0.421692,85231B,cinammon set of 9 t-lights


In [28]:
df_person = df_full_clean[df_full_clean['CustomerID'] == '18287.0']
pd.DataFrame(df_person['Description'].value_counts()[:10]).rename(columns = {'Description': 'Total Purchase'})

Unnamed: 0,Total Purchase
s/4 ivory mini rose candle in bowl,3
set/6 purple butterfly t-lights,2
swiss chalet tree decoration,2
ice cream sundae lip gloss,2
assorted colour lizard suction hook,2
ice cream pen lip gloss,2
set/4 red mini rose candle in bowl,2
set of 3 wooden sleigh decorations,2
painted metal star with holly bells,2
s/4 pink flower candles in bowl,2


# Collaborative Recommendation

In [70]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = df_x.pivot(index='CustomerID', 
                                                          columns='StockCode', 
                                                          values='price_band').fillna(0)

users_items_pivot_matrix_df.head(10)

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214S,90214T,90214U,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,PADS
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12747.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12748.0,1.0,0.0,1.584963,0.0,0.0,0.0,0.0,2.584963,1.0,2.321928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12749.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.321928,0.0
12820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12821.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12822.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12823.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12824.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12826.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
users_items_pivot_matrix = np.array(users_items_pivot_matrix_df)
users_items_pivot_matrix[:10]

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [1.       , 0.       , 1.5849625, ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [72]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

['12346.0',
 '12747.0',
 '12748.0',
 '12749.0',
 '12820.0',
 '12821.0',
 '12822.0',
 '12823.0',
 '12824.0',
 '12826.0']

In [73]:
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix

<3918x3642 sparse matrix of type '<class 'numpy.float64'>'
	with 237848 stored elements in Compressed Sparse Row format>

In [74]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [75]:
U.shape

(3918, 15)

In [76]:
Vt.shape

(15, 3642)

In [77]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [78]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[-1.06358654e-04, -3.20598803e-05, -1.25677440e-04, ...,
        -5.63817095e-05,  9.39450792e-05,  3.15728992e-05],
       [ 2.04253488e-02, -3.49346000e-03,  2.50736426e-02, ...,
         1.55074203e-02,  3.81826757e-03,  2.45210025e-03],
       [ 4.22791302e-01,  1.52338348e-01,  7.39624655e-01, ...,
         3.28402426e-01,  4.03233586e-02,  2.17765610e-02],
       ...,
       [ 5.35190372e-04,  1.82478237e-03, -1.34281333e-03, ...,
         4.21083101e-03,  1.20465343e-02,  1.75189584e-03],
       [ 9.74894602e-02,  5.95506500e-03,  6.42341977e-02, ...,
        -5.50681720e-04,  1.12308512e-02, -6.06749708e-03],
       [ 3.32979831e-03,  2.37781318e-03,  1.01704250e-02, ...,
         7.10700371e-03, -2.87511895e-03, -6.32600574e-04]])

In [79]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [80]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,12346.0,12747.0,12748.0,12749.0,12820.0,12821.0,12822.0,12823.0,12824.0,12826.0,...,18273.0,18274.0,18276.0,18277.0,18278.0,18280.0,18281.0,18282.0,18283.0,18287.0
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0.327396,0.32874,0.355089,0.327676,0.327543,0.327512,0.327446,0.327122,0.327246,0.32905,...,0.327409,0.326815,0.327718,0.327513,0.327151,0.32801,0.327686,0.327438,0.333787,0.327621
10080,0.327401,0.327174,0.337378,0.327398,0.32803,0.327452,0.327794,0.327309,0.327735,0.327794,...,0.327397,0.327377,0.327391,0.327493,0.327446,0.327539,0.32746,0.327522,0.327793,0.327558
10120,0.327394,0.329045,0.375837,0.326249,0.328353,0.327548,0.328104,0.327152,0.327734,0.328267,...,0.327402,0.326431,0.327868,0.327574,0.327231,0.327605,0.327534,0.327315,0.331609,0.328069
10123C,0.327401,0.327475,0.3273,0.327322,0.327415,0.327409,0.327377,0.327382,0.327401,0.327422,...,0.3274,0.327382,0.327406,0.327409,0.327379,0.327397,0.327401,0.327382,0.327606,0.3274
10124A,0.327402,0.327089,0.328268,0.327927,0.327373,0.327401,0.327615,0.327399,0.3274,0.327689,...,0.327398,0.327491,0.32739,0.327386,0.327532,0.327329,0.327447,0.327446,0.328265,0.327497
10124G,0.327403,0.327092,0.329396,0.327548,0.327401,0.32739,0.327504,0.327393,0.327401,0.327636,...,0.327398,0.327454,0.327426,0.327366,0.327348,0.327297,0.327423,0.3273,0.329187,0.327395
10125,0.327401,0.325823,0.348563,0.32678,0.328189,0.327431,0.327957,0.327344,0.327759,0.32923,...,0.327403,0.327379,0.327719,0.32742,0.326884,0.327425,0.327695,0.327432,0.334928,0.32758
10133,0.327416,0.328544,0.404979,0.333208,0.329667,0.327501,0.328576,0.326825,0.328242,0.328065,...,0.327384,0.327706,0.327961,0.327517,0.327031,0.327695,0.327429,0.327529,0.339983,0.327831
10135,0.327356,0.325908,0.395486,0.32597,0.328624,0.327803,0.327019,0.327643,0.328718,0.325572,...,0.327319,0.327331,0.327927,0.327648,0.327448,0.327224,0.326918,0.327577,0.333901,0.330513
11001,0.327342,0.325594,0.420105,0.321813,0.328821,0.327839,0.328221,0.327997,0.328966,0.328108,...,0.32733,0.326324,0.327883,0.32772,0.327255,0.326991,0.327429,0.327578,0.335601,0.330625


In [82]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['StockCode'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)
        df_item_unique = self.items_df.drop_duplicates(subset = 'StockCode')

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendation_df = recommendations_df.merge(df_item_unique, how = 'left', 
                                                          left_on = 'StockCode', 
                                                          right_on = 'StockCode')[['recStrength', 'StockCode', 'Description']]

        return recommendation_df
    
cf_recommender_model = CFRecommender(cf_preds_df, df_full_clean)

In [93]:
cf_recommender_model.recommend_items('18287.0', topn = 10, verbose = True)

Unnamed: 0,recStrength,StockCode,Description
0,0.364673,22111,scottie dog hot water bottle
1,0.364344,23355,hot water bottle keep calm
2,0.364269,22112,chocolate hot water bottle
3,0.359462,22114,hot water bottle tea and sympathy
4,0.358875,22835,hot water bottle i am so poorly
5,0.354415,21485,retrospot heart hot water bottle
6,0.350093,22113,grey heart hot water bottle
7,0.350051,22865,hand warmer owl design
8,0.349607,23356,love hot water bottle
9,0.3492,22866,hand warmer scotty dog design


# Hybrid Recommendation

In [84]:
class HybridRecommender:
    
    MODEL_NAME = 'Hybrid'
    
    def __init__(self, cb_rec_model, cf_rec_model, items_df, cb_ensemble_weight=1.0, cf_ensemble_weight=1.0):
        self.cb_rec_model = cb_rec_model
        self.cf_rec_model = cf_rec_model
        self.cb_ensemble_weight = cb_ensemble_weight
        self.cf_ensemble_weight = cf_ensemble_weight
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        #Getting the top-1000 Content-based filtering recommendations
        cb_recs_df = self.cb_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=verbose,
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCB'})
        
        #Getting the top-1000 Collaborative filtering recommendations
        cf_recs_df = self.cf_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=verbose, 
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCF'})
        
        #Combining the results by contentId
        recs_df = cb_recs_df.merge(cf_recs_df,
                                   how = 'outer', 
                                   left_on = 'StockCode', 
                                   right_on = 'StockCode').fillna(0.0)
        
        #Computing a hybrid recommendation score based on CF and CB scores
        #recs_df['recStrengthHybrid'] = recs_df['recStrengthCB'] * recs_df['recStrengthCF'] 
        recs_df['recStrengthHybrid'] = (recs_df['recStrengthCB'] * self.cb_ensemble_weight) \
                                     + (recs_df['recStrengthCF'] * self.cf_ensemble_weight)
        
        #Sorting recommendations by hybrid score
        recommendations_df = recs_df.sort_values('recStrengthHybrid', ascending=False).head(topn)
        df_item_unique = self.items_df.drop_duplicates(subset = 'StockCode')

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendation_df = recommendations_df.merge(df_item_unique, how = 'left', 
                                                          left_on = 'StockCode', 
                                                          right_on = 'StockCode')[['recStrengthHybrid', 'StockCode', 
                                                                                   'Description']]

        return recommendation_df
    
hybrid_recommender_model = HybridRecommender(content_based_recommender_model, cf_recommender_model, df_full_clean,
                                             cb_ensemble_weight=1.0, cf_ensemble_weight=100.0)

In [92]:
hybrid_recommender_model.recommend_items('18287.0', items_to_ignore=[], topn=10, verbose=True)

Unnamed: 0,recStrengthHybrid,StockCode,Description
0,36.467298,22111,scottie dog hot water bottle
1,36.434426,23355,hot water bottle keep calm
2,36.426893,22112,chocolate hot water bottle
3,35.946246,22114,hot water bottle tea and sympathy
4,35.887458,22835,hot water bottle i am so poorly
5,35.441481,21485,retrospot heart hot water bottle
6,35.009348,22113,grey heart hot water bottle
7,35.005108,22865,hand warmer owl design
8,34.960658,23356,love hot water bottle
9,34.920017,22866,hand warmer scotty dog design
