# 1.Import need library

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from lightfm import LightFM
from scipy import sparse
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)

# 2. Clean data

In [2]:
## Load data
df = pd.read_csv("raw_data.csv", dtype = {'CustomerID': str, 'InvoiceNo': str,'StockCode':str})

## Filter all null CustomerID infor
df = df[~df['CustomerID'].isna()]

## Only pick Quantity > 0
df = df[df.Quantity > 0]

## Filter CustomerID has GMV > 50k  (according assumption)
df['TotalPrice'] = df['Quantity']*df['UnitPrice'] + 0.5
df = df[~df['TotalPrice'].isna()]

gmv = df.groupby('CustomerID').agg({'TotalPrice':'sum'}).reset_index()
gmv = gmv[gmv.TotalPrice > 50e3]
df = df[~df.CustomerID.isin(gmv.CustomerID)]

In [3]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,17850,United Kingdom,15.80
1,536365,71053,WHITE METAL LANTERN,6,3.39,17850,United Kingdom,20.84
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2.75,17850,United Kingdom,22.50
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.39,17850,United Kingdom,20.84
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,3.39,17850,United Kingdom,20.84
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,0.85,12680,France,10.70
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2.10,12680,France,13.10
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,4.15,12680,France,17.10
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,4.15,12680,France,17.10


# 3. Preprocess data

## 3.1 Build matrix represents the user's purchasing relationship

In [4]:
product_buy_matrix = df.pivot_table(index='CustomerID', columns='StockCode', values='Quantity', fill_value=0, aggfunc='sum')
product_buy_matrix = product_buy_matrix.reset_index().rename_axis(None, axis=1)

## 3.2 Create ratings of customers for each item

In [6]:
product_buy_matrix

Unnamed: 0,CustomerID,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,...,90214U,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,M,PADS,POST
0,12347,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,12348,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
2,12349,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,12350,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,12352,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4314,18280,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4315,18281,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4316,18282,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4317,18283,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


# 4. Build the recommendation engine 

## 4.1. Config

In [7]:
n_comp = 10
top_k = 20

## 4.2. Build Recommendation Egnine (Using matric factorization)

### 4.2.1 Product-Product Similarirty

In [8]:
rating_matrix = product_buy_matrix.set_index(['CustomerID'])
rating_matrix = rating_matrix.div(rating_matrix.sum(axis=1), axis=0).astype(float)
rating_matrix = rating_matrix.T

SVD = TruncatedSVD(n_components=n_comp)
decomposed_matrix = SVD.fit_transform(rating_matrix)
correlation_matrix = np.corrcoef(decomposed_matrix)

top_n_product = np.argsort(correlation_matrix,axis=1)[:,(-top_k-1):-1]
item_rec = []
for k_product in top_n_product:
    item_rec.append(list(rating_matrix.index[k_product]))
product_df = pd.DataFrame(index = rating_matrix.index, data = item_rec)
product_df = product_df.reset_index().rename(columns={"index":"product"})

### 4.2.2 User-Items Rating

In [9]:
rating_matrix = product_buy_matrix.set_index(['CustomerID'])
rating_matrix = rating_matrix.div(rating_matrix.sum(axis=1), axis=0).astype(float)
train_data = sparse.csr_matrix(rating_matrix)

model = LightFM(loss='warp')
model.fit(train_data, epochs=30, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fbb074c0668>

### 4.2.3 Decision Merger

In [11]:
products_index = rating_matrix.columns.tolist()
user_id = rating_matrix.index.tolist()
def get_top_k_items(user,product,top_n_product=top_n_product,model=model,user_id=user_id,products_index=products_index):
    uuid = user_id.index(user)
    product_id = products_index.index(product)
    scores = model.predict(uuid, np.arange(train_data.shape[1]))
    top_sort = np.argsort(-scores)
    
    top_product_rec = []
    for pro_id in top_n_product[product_id]:
        top_product_rec.append(np.where(top_sort==pro_id)[0][0])
        
    top_product_id = np.argsort(np.array(top_product_rec))[:5].tolist()
    
    return [products_index[i] for i in top_product_id]

In [12]:
get_top_k_items(user='12348',product='10002')

['11001', '10124G', '15056N', '15044D', '10123C']