### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import os
from PIL import Image

%matplotlib inline

In [2]:
# Article catalogue, read from a CSV in the working directory.
articles = pd.read_csv('articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [3]:
# Customer table, read from a CSV in the working directory.
customers = pd.read_csv('customers.csv')
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [4]:
# Transaction log; `t_dat` is parsed as a datetime so date arithmetic works below.
transactions = pd.read_csv('transactions_train.csv', parse_dates=['t_dat'])
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


### Non-personalised

In [5]:
from datetime import timedelta

In [6]:
def non_personalised(k, n, data):
    """Return the ``k`` best-selling articles of the last ``n`` days of ``data``.

    Parameters
    ----------
    k : int
        Maximum number of articles to return.
    n : int
        Look-back window in days, counted back from the latest ``t_dat``
        present in ``data``.
    data : pandas.DataFrame
        Transactions with at least the columns ``t_dat`` and ``article_id``.
        The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``article_id`` and ``no_of_times_sold``, sorted by descending
        sales count, at most ``k`` rows, with a fresh 0..k-1 index.
    """
    # Boolean indexing already produces a new frame, so no defensive copy of
    # the (potentially very large) input is needed.
    cutoff = data.t_dat.max() - timedelta(days=n)
    recent = data[data.t_dat > cutoff]

    sold = recent.groupby('article_id')['t_dat'].count().reset_index()
    sold.columns = ['article_id', 'no_of_times_sold']

    # A stable sort keeps tied articles in groupby order (ascending article_id),
    # which makes the top-k list reproducible from run to run.
    sold = sold.sort_values('no_of_times_sold', ascending=False, kind='stable')
    return sold[:k].reset_index(drop=True)

### Data Preprocessing

In [7]:
def split_train_test(n, data):
    """Split transactions into a training set and a test set by date.

    Rows whose ``t_dat`` lies within the last ``n`` days (measured back from
    the latest ``t_dat`` in ``data``) form the test set; every other row forms
    the training set.

    The split uses one boolean mask for both halves, so it is correct even
    when ``data`` is not sorted by date or its index contains duplicate
    labels. Dropping "the last len(test) rows" by label would silently leak
    test rows into the training set in those cases.

    Parameters
    ----------
    n : int
        Number of most recent days to hold out.
    data : pandas.DataFrame
        Transactions with a datetime column ``t_dat``. Not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)``; ``train_data`` is an independent copy.
    """
    is_test = data.t_dat > data.t_dat.max() - timedelta(days=n)
    test_data = data[is_test]
    # Explicit copy keeps train_data a free-standing frame, as before.
    train_data = data[~is_test].copy()
    return train_data, test_data

In [8]:
def remove_new_customers(data1, data2):
    """Keep only the rows of ``data1`` whose customer also appears in ``data2``.

    Implemented as a single vectorised membership test instead of a Python
    loop with list look-ups and one ``drop`` per unknown customer. Filtering
    by mask (rather than dropping by index label) also stays correct when the
    index of ``data1`` contains duplicate labels.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Frame to filter; must have a ``customer_id`` column. Not modified.
    data2 : pandas.DataFrame
        Reference frame; must have a ``customer_id`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``data1`` (original order and index labels preserved) whose
        ``customer_id`` occurs in ``data2``.
    """
    known_customers = data2['customer_id'].unique()
    return data1[data1['customer_id'].isin(known_customers)]

In [21]:
import random
random.seed(42)

def downsize(data, n, seed=None):
    """Restrict ``data`` to the transactions of ``n`` randomly chosen customers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``customer_id`` column. Not modified.
    n : int
        Number of distinct customers to sample.
    seed : int, optional
        When given, sampling uses a private ``random.Random(seed)``, so the
        result does not depend on how often the module-level generator has
        been used before (e.g. by re-running cells). When omitted, the
        module-level ``random`` state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        All rows of ``data`` belonging to the sampled customers.

    Raises
    ------
    ValueError
        If ``n`` exceeds the number of distinct customers (from
        ``random.sample``).
    """
    unique_cust_ids = data['customer_id'].unique().tolist()
    sampler = random if seed is None else random.Random(seed)
    sample_cust_ids = sampler.sample(unique_cust_ids, n)
    return data[data['customer_id'].isin(sample_cust_ids)]

In [16]:
def filter_training_data(n, data):
    """Keep the transactions of the most recent ``n`` "months" of ``data``.

    A month is approximated as 30 days; the window is measured back from the
    latest ``t_dat`` in ``data``. The input frame is copied first and is not
    modified.
    """
    snapshot = data.copy()
    window = timedelta(days=n*30)
    newest = snapshot.t_dat.max()
    is_recent = snapshot.t_dat > newest - window
    return snapshot[is_recent]

Splitting the data into Training and Testing data

In [12]:
# Hold out the most recent 7 days of transactions as the test set.
train, test = split_train_test(7, transactions)

In [17]:
# Train only on the latest 6 * 30 days of the training transactions.
filtered_train_data = filter_training_data(6, train)
filtered_train_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
23548790,2020-03-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,838787001,0.025407,2
23548791,2020-03-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,870282001,0.021593,2
23548792,2020-03-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,870282001,0.021593,2
23548793,2020-03-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,855262002,0.035576,2
23548794,2020-03-20,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,855262002,0.050831,2
...,...,...,...,...,...
31548008,2020-09-15,ffe41634ff990908faacbb465063e027e7c39499f8dfc1...,850917001,0.025407,1
31548009,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,853316001,0.008458,1
31548010,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,296366006,0.000847,1
31548011,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,789769001,0.013542,1


In [23]:
# Evaluation sample: all test transactions of 1000 randomly chosen customers.
test_data_1000 = downsize(test, 1000)
test_data_1000

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31548606,2020-09-16,0593147b5c32fbf1f2b89bc0375a122043e3545b6ad8e9...,892558006,0.016932,1
31549113,2020-09-16,09e3c85382d95a44ecb9d162b02d58b54960a1d2ab4ced...,874587001,0.030492,1
31550031,2020-09-16,126bb7890b637f839a476431158c6507267cccc40c60b7...,628722004,0.010153,2
31550032,2020-09-16,126bb7890b637f839a476431158c6507267cccc40c60b7...,850984002,0.008458,2
31550033,2020-09-16,126bb7890b637f839a476431158c6507267cccc40c60b7...,628722005,0.010153,2
...,...,...,...,...,...
31788223,2020-09-22,feede16a97612dbe30e7157dce5797b78cab0eda03e146...,919273001,0.042356,2
31788224,2020-09-22,feede16a97612dbe30e7157dce5797b78cab0eda03e146...,919273002,0.042356,2
31788225,2020-09-22,feede16a97612dbe30e7157dce5797b78cab0eda03e146...,919273002,0.042356,2
31788308,2020-09-22,ffcd5ff3fd5b9ee6b984c143c973fd2697ca141483d381...,918522001,0.042356,2


In [25]:
# Drop test rows of customers that have no transaction in the filtered training window.
test_data_without_new_cust = remove_new_customers(test, filtered_train_data)
test_data_without_new_cust

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31548013,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,786022008,0.048441,2
31548014,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,913272003,0.032288,2
31548015,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,889669006,0.056508,2
31548016,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,237347060,0.033881,1
31548017,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,562245001,0.013542,1
...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1


In [26]:
# Same 1000-customer sampling as above, restricted to customers seen in training.
test_data_without_new_cust_1000 = downsize(test_data_without_new_cust, 1000)
test_data_without_new_cust_1000

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31548710,2020-09-16,062563bfb598a256dd0a4fe893f87d9e54b4344fed28b0...,878223001,0.010153,1
31548797,2020-09-16,075739ea83889fc71c3794c064bdb7f7850781e7a960bb...,803160001,0.033881,2
31548798,2020-09-16,075739ea83889fc71c3794c064bdb7f7850781e7a960bb...,858610001,0.042356,2
31549008,2020-09-16,094b88ae8abe27be07a16f304d25605dbaf16bb711b9bf...,829135004,0.040678,2
31549009,2020-09-16,094b88ae8abe27be07a16f304d25605dbaf16bb711b9bf...,829135004,0.040661,2
...,...,...,...,...,...
31787798,2020-09-22,fb3cb15d47ea95b54486c1f3a5d953e35a59b4b993d428...,685814062,0.033881,2
31787799,2020-09-22,fb3cb15d47ea95b54486c1f3a5d953e35a59b4b993d428...,837249024,0.016932,2
31787800,2020-09-22,fb3cb15d47ea95b54486c1f3a5d953e35a59b4b993d428...,794191004,0.022017,2
31788139,2020-09-22,fe21dc8c4a784e405f67ed049ab80ae49c7d3a0cf5534b...,878828001,0.059305,2


#### Converting test data into dictionary of customers and articles

In [27]:
# One row per customer; `article_id` becomes a string of the purchased ids,
# each followed by a single space (e.g. "123 456 ").
test_df = (
    test_data_1000
    .assign(article_id=test_data_1000['article_id'].astype(str) + ' ')
    .groupby('customer_id')['article_id']
    .sum()
    .reset_index()
)

In [28]:
# One row per customer; `article_id` becomes a string of the purchased ids,
# each followed by a single space.
test_df_without_new_cust = (
    test_data_without_new_cust_1000
    .assign(article_id=test_data_without_new_cust_1000['article_id'].astype(str) + ' ')
    .groupby('customer_id')['article_id']
    .sum()
    .reset_index()
)

#### Converting training data into dictionary of customers and articles

In [29]:
# Purchase history per customer over the training window, stored as one
# string of article ids, each followed by a single space.
train_df = (
    filtered_train_data
    .assign(article_id=filtered_train_data['article_id'].astype(str) + ' ')
    .groupby('customer_id')['article_id']
    .sum()
    .reset_index()
)

#### Creating dictionary of product codes and articles

In [30]:
# For every product code: the ids of all articles sharing that code, as one
# string with each id followed by a single space.
articles_product_code_df = (
    articles
    .assign(article_id=articles['article_id'].astype(str) + ' ')
    .groupby('product_code')['article_id']
    .sum()
    .reset_index()
)

#### Creating dictionary of product types and articles

In [31]:
# For every product type: the ids of all articles of that type, as one
# string with each id followed by a single space.
articles_product_type_df = (
    articles
    .assign(article_id=articles['article_id'].astype(str) + ' ')
    .groupby('product_type_no')['article_id']
    .sum()
    .reset_index()
)

### Content-Based

In [32]:
def content_based(n, k, data, articles_product_code_df, articles_product_type_df,
                  train_history=None, articles_df=None):
    """Recommend ``k`` articles per customer from the articles they bought before.

    For each customer the first ``n`` articles of the purchase history act as
    seeds. The ``k`` recommendation slots are divided evenly among the seeds;
    when ``k`` is not a multiple of the number of seeds, the first
    ``k % len(seeds)`` seeds receive one slot more, so the slots always add up
    to exactly ``k``. A seed's slots are filled with other articles of the
    same product code and, if those are too few, with other articles of the
    same product type. Unfilled slots are padded with ``0``.

    Parameters
    ----------
    n : int
        Maximum number of purchased articles used as seeds.
    k : int
        Number of recommendations per customer.
    data : iterable
        Customer ids to produce recommendations for.
    articles_product_code_df : pandas.DataFrame
        Columns ``product_code`` and ``article_id``; ``article_id`` is a
        whitespace separated string of article ids.
    articles_product_type_df : pandas.DataFrame
        Columns ``product_type_no`` and ``article_id`` (same string format).
    train_history : pandas.DataFrame, optional
        Columns ``customer_id`` and ``article_id`` (whitespace separated
        purchase history). Defaults to the notebook-level ``train_df``.
    articles_df : pandas.DataFrame, optional
        Columns ``article_id``, ``product_code`` and ``product_type_no``.
        Defaults to the notebook-level ``articles``.

    Returns
    -------
    dict
        ``customer_id -> list`` of length ``k``. Recommended ids are strings;
        padding entries are the integer ``0``. Customers without history get
        ``[0] * k``. The same article may appear more than once when several
        seeds share a product code or type.
    """
    # Fall back to the notebook globals so existing five-argument calls keep working.
    if train_history is None:
        train_history = train_df
    if articles_df is None:
        articles_df = articles

    def first_match(frame, key_col, value_col):
        # Dict equivalent of frame[frame[key_col] == x][value_col].array[0]:
        # the first row wins when a key occurs more than once.
        lookup = {}
        for key, value in zip(frame[key_col], frame[value_col]):
            lookup.setdefault(key, value)
        return lookup

    # Build every lookup once instead of scanning whole frames per customer/seed.
    purchases_of = first_match(train_history, 'customer_id', 'article_id')
    code_of = first_match(articles_df, 'article_id', 'product_code')
    type_of = first_match(articles_df, 'article_id', 'product_type_no')
    same_code = first_match(articles_product_code_df, 'product_code', 'article_id')
    same_type = first_match(articles_product_type_df, 'product_type_no', 'article_id')

    recommendations = {}
    for cust in data:
        seeds = purchases_of.get(cust, '').split()[:n]
        if not seeds:
            # Unknown customer (or nothing usable as a seed): placeholders only.
            recommendations[cust] = [0] * k
            continue

        base, extra = divmod(k, len(seeds))
        rec_articles = []
        for position, seed in enumerate(seeds):
            # The first `extra` seeds absorb the remainder of the division.
            quota = base + (1 if position < extra else 0)
            if quota == 0:
                continue
            article_no = int(seed)
            if article_no not in code_of:
                # Seed missing from the catalogue: its slots are padded below.
                continue

            candidates = [a for a in same_code.get(code_of[article_no], '').split()
                          if a != seed]
            if len(candidates) < quota:
                # Not enough colour/print variants: widen to the product type.
                for other in same_type.get(type_of[article_no], '').split():
                    if other != seed and other not in candidates:
                        candidates.append(other)
            rec_articles.extend(candidates[:quota])

        rec_articles.extend([0] * (k - len(rec_articles)))
        recommendations[cust] = rec_articles

    return recommendations

### User-Item Matrix

In [33]:
# Work on a copy so the id columns added below do not alter `transactions`.
trans_df = transactions.copy()

In [34]:
# Distinct customers / articles in order of first appearance.
ALL_USERS = trans_df['customer_id'].unique().tolist()
ALL_ITEMS = trans_df['article_id'].unique().tolist()

# Position -> original id.
user_ids = {position: customer for position, customer in enumerate(ALL_USERS)}
item_ids = {position: article for position, article in enumerate(ALL_ITEMS)}

# Original id -> position (inverse of the dictionaries above).
user_map = dict(zip(user_ids.values(), user_ids.keys()))
item_map = dict(zip(item_ids.values(), item_ids.keys()))

# Attach the dense integer ids used as matrix coordinates further down.
trans_df['user_id'] = trans_df['customer_id'].map(user_map)
trans_df['item_id'] = trans_df['article_id'].map(item_map)

In [36]:
# Redo the 7-day split and the 6 * 30 day filter on `trans_df`, which carries the
# `user_id` / `item_id` columns. This rebinds `train`, `test` and
# `filtered_train_data` defined earlier in the notebook.
train, test = split_train_test(7, trans_df)
filtered_train_data = filter_training_data(6, train)

In [37]:
# Number of training transactions per (user_id, item_id) pair.
user_item_df = filtered_train_data.groupby(['user_id', 'item_id'])['t_dat'].count().reset_index()
user_item_df = user_item_df.rename(columns = {'t_dat': 'purchase_count'})

In [38]:
# Coordinate-format pieces for the sparse matrix: cell values, row ids, column ids.
values = user_item_df.purchase_count.values
row = user_item_df.user_id.values
column = user_item_df.item_id.values

In [39]:
from scipy.sparse import csr_matrix

In [40]:
# Users on rows, items on columns, purchase counts as values. The shape spans
# every user/item of `trans_df`, so ids absent from the training window are empty rows/columns.
sparse_user_item_matrix = csr_matrix((values, (row, column)), shape=(len(ALL_USERS), len(ALL_ITEMS)))

#### Importing ALS from Implicit Library

In [41]:
from implicit.als import AlternatingLeastSquares

In [44]:
# ALS with 40 latent factors, 10 iterations and a fixed seed.
# NOTE(review): the matrix is passed with users on rows; whether `fit` expects
# user-item or item-user orientation depends on the installed `implicit` version - confirm.
model = AlternatingLeastSquares(factors=40, iterations=10, random_state=42)
model.fit(sparse_user_item_matrix)

100%|██████████| 10/10 [00:44<00:00,  4.49s/it]


### ALS Recommendation

In [45]:
def als_recommend(n: int, cust_id_list: list, data: pd.DataFrame, sparse_matrix: object, als_model=None)->list:
    '''
    Uses ALS to generate recommendations for customers
    ---------------
    parameters:
        n: number of recommended items per customer
        cust_id_list: customer_ids to generate recommendations for
        data: transactions with the columns customer_id, user_id, item_id and
              article_id; used to translate customer_id -> user_id and
              item_id -> article_id (first matching row wins)
        sparse_matrix: user-item matrix indexable by user_id; the row of each
              user is forwarded to the model as `user_items`
        als_model: object exposing `recommend(userid=, N=, user_items=)` whose
              first return value is the sequence of recommended item_ids;
              defaults to the notebook-level `model`
    returns:
        list with one list of article_ids per entry of cust_id_list, in the
        same order
    raises:
        IndexError if a customer_id or a recommended item_id is not present
        in data
    '''
    # Fall back to the notebook global so existing four-argument calls keep working.
    if als_model is None:
        als_model = model

    # Build both translations once instead of scanning `data` for every
    # customer and every recommended item.
    user_of = data.drop_duplicates('customer_id').set_index('customer_id')['user_id'].to_dict()
    article_of = data.drop_duplicates('item_id').set_index('item_id')['article_id'].to_dict()

    recommendations = []
    for cust in cust_id_list:
        if cust not in user_of:
            raise IndexError('customer_id ' + repr(cust) + ' not found in data')
        user_id = user_of[cust]
        item_ids = als_model.recommend(userid=user_id, N=n, user_items=sparse_matrix[user_id])[0]
        article_ids = []
        for item in item_ids:
            if item not in article_of:
                raise IndexError('item_id ' + repr(item) + ' not found in data')
            article_ids.append(article_of[item])
        recommendations.append(article_ids)
    return recommendations

### Defining MAP

In [46]:
def apk(actual, predicted, k=10):
    """Average precision at ``k`` for a single ranked prediction list.

    Only the first ``k`` predictions are scored. A prediction counts as a hit
    when it is contained in ``actual`` and has not appeared earlier in the
    ranking. The summed precision-at-hit values are divided by
    ``min(len(actual), k)``; an empty ``actual`` yields ``0.0``.
    """
    top = predicted[:k] if len(predicted) > k else predicted
    if not actual:
        return 0.0

    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top, start=1):
        repeated = candidate in top[:rank - 1]
        if candidate in actual and not repeated:
            hits += 1.0
            precision_sum += hits / rank
    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """Mean of ``apk`` over the paired entries of ``actual`` and ``predicted``.

    The two sequences are paired positionally with ``zip``, so surplus entries
    of the longer one are ignored.
    """
    per_user_scores = []
    for truth, ranking in zip(actual, predicted):
        per_user_scores.append(apk(truth, ranking, k))
    return np.mean(per_user_scores)

### Defining Average Precision

In [47]:
def avg_precision(actual, predicted):
    """Mean precision of the predictions over all users.

    For each ``(actual, predicted)`` pair, precision is the share of predicted
    items found in the actual items. The per-user values are summed and
    divided by ``len(actual)``.

    Parameters
    ----------
    actual : sequence of sequences
        Items each user really bought.
    predicted : sequence of sequences
        Items recommended to each user, paired positionally with ``actual``.

    Returns
    -------
    float
        Average precision; ``0.0`` when ``actual`` is empty. A user with an
        empty prediction list contributes a precision of 0 instead of raising
        ``ZeroDivisionError``.
    """
    if len(actual) == 0:
        return 0.0

    precision = 0.0
    for act, pred in zip(actual, predicted):
        if len(pred) == 0:
            # Nothing recommended: precision for this user is 0.
            continue
        hits = 0.0  # local counter; avoids shadowing the builtin `sum`
        for p in pred:
            if p in act:
                hits += 1.0
        precision += hits / len(pred)

    return precision / len(actual)

### Defining Average Recall

In [48]:
def avg_recall(actual, predicted):
    """Mean recall of the predictions over all users.

    For each ``(actual, predicted)`` pair, recall is the number of distinct
    predicted items found in the actual items, divided by ``len(actual)``.
    A predicted item is counted only the first time it appears (the same rule
    ``apk`` applies), so repeated recommendations cannot push recall above 1.
    The per-user values are summed and divided by ``len(actual)`` (the number
    of users).

    Parameters
    ----------
    actual : sequence of sequences
        Items each user really bought.
    predicted : sequence of sequences
        Items recommended to each user, paired positionally with ``actual``.

    Returns
    -------
    float
        Average recall; ``0.0`` when ``actual`` is empty. A user with no
        actual items contributes a recall of 0 instead of raising
        ``ZeroDivisionError``.
    """
    if len(actual) == 0:
        return 0.0

    recall = 0.0
    for act, pred in zip(actual, predicted):
        if len(act) == 0:
            # Nothing to retrieve for this user.
            continue
        hits = 0.0  # local counter; avoids shadowing the builtin `sum`
        counted = []
        for p in pred:
            if p in act and p not in counted:
                counted.append(p)
                hits += 1.0
        recall += hits / len(act)

    return recall / len(actual)

### Evaluating Non-personalised recommendations

In [49]:
def non_personalised_multiple_recommendations(n, data, test_df, max_k=100):
    """Evaluate the best-seller baseline for every list length 1..``max_k``.

    Every test customer receives the same recommendation list: the top-``k``
    articles sold during the last ``n`` days of ``data``.

    Parameters
    ----------
    n : int
        Look-back window in days passed to ``non_personalised``.
    data : pandas.DataFrame
        Training transactions passed to ``non_personalised``.
    test_df : pandas.DataFrame
        One row per test customer; ``article_id`` holds the purchased article
        ids as a whitespace separated string.
    max_k : int, optional
        Largest recommendation list length to evaluate (default 100).

    Returns
    -------
    (list, list, list)
        ``(mapk_list, avg_precision_list, avg_recall_list)``; position
        ``k - 1`` holds the metric for lists of length ``k``.
    """
    # Actual purchases do not depend on k, so parse them once.
    a_list = [[int(value) for value in purchased.split()]
              for purchased in test_df['article_id'].to_list()]

    # The top-k list is a prefix of the top-max_k list, so rank once and slice.
    top_articles = list(non_personalised(max_k, n, data)['article_id'].values)

    mapk_list = []
    avg_precision_list = []
    avg_recall_list = []
    for k in range(1, max_k + 1):
        recs = top_articles[:k]
        # One (shared) list per test customer; sized from `test_df`, not from
        # an unrelated notebook global.
        p_list = [recs] * len(a_list)

        mapk_list.append(mapk(a_list, p_list, k))
        avg_precision_list.append(avg_precision(a_list, p_list))
        avg_recall_list.append(avg_recall(a_list, p_list))

    return mapk_list, avg_precision_list, avg_recall_list

In [51]:
# Best-seller baseline: ranked on the last 7 days of the training window, scored against `test_df`.
mapk_vals_np, avg_precision_vals_np, avg_recall_vals_np = non_personalised_multiple_recommendations(7, filtered_train_data, test_df)

In [58]:
# Report each metric at the selected list lengths (lists are 0-based, k is 1-based).
k_list = [10, 50, 100]
for metric_name, metric_vals in (('MAP', mapk_vals_np),
                                 ('Avg. Precision', avg_precision_vals_np),
                                 ('Avg. Recall', avg_recall_vals_np)):
    print(metric_name)
    for val in k_list:
        print('k='+ str(val) +': '+str(metric_vals[val-1]))

MAP
k=10: 0.009547908163265307
k=50: 0.011660491818557408
k=100: 0.012413384872537864
Avg. Precision
k=10: 0.006699999999999995
k=50: 0.004559999999999996
k=100: 0.003679999999999982
Avg. Recall
k=10: 0.025309761494544097
k=50: 0.0732063752338636
k=100: 0.11438894109693147


### Evaluating Content-Based Recommendations

In [61]:
def content_based_multiple_recommendations(n, data, test_df, articles_product_code_df, articles_product_type_df,
                                           max_k=100, history_size=5):
    """Evaluate content-based recommendations for every list length 1..``max_k``.

    Predicted and actual items are both taken from ``test_df`` in the same row
    order, so every prediction list is compared with the purchases of the
    customer it was generated for.

    Parameters
    ----------
    n, data
        Unused; kept so existing calls keep working.
    test_df : pandas.DataFrame
        One row per test customer with ``customer_id`` and ``article_id``
        (purchased article ids as a whitespace separated string).
    articles_product_code_df, articles_product_type_df : pandas.DataFrame
        Lookup frames forwarded unchanged to ``content_based``.
    max_k : int, optional
        Largest recommendation list length to evaluate (default 100).
    history_size : int, optional
        Number of purchased articles used as seeds per customer; forwarded
        to ``content_based`` as its ``n`` (default 5).

    Returns
    -------
    (list, list, list)
        ``(mapk_list, avg_precision_list, avg_recall_list)``; position
        ``k - 1`` holds the metric for lists of length ``k``.
    """
    # Customers and their actual purchases do not depend on k: build them once.
    cust_ids = list(test_df['customer_id'])
    actual = [[int(value) for value in purchased.split()]
              for purchased in test_df['article_id'].to_list()]

    mapk_list = []
    avg_precision_list = []
    avg_recall_list = []
    for k in range(1, max_k + 1):
        content_based_recs = content_based(history_size, k, cust_ids,
                                           articles_product_code_df, articles_product_type_df)

        # Same order as `actual`; ids arrive as strings (padding as int 0).
        predicted = [[int(value) for value in content_based_recs[cust]]
                     for cust in cust_ids]

        mapk_list.append(mapk(actual, predicted, k))
        avg_precision_list.append(avg_precision(actual, predicted))
        avg_recall_list.append(avg_recall(actual, predicted))

    return mapk_list, avg_precision_list, avg_recall_list
        


In [62]:
# Content-based evaluation against `test_df`.
# NOTE(review): the first two arguments (7, filtered_train_data) are not read by the function body.
mapk_vals_cb, avg_precision_vals_cb, avg_recall_vals_cb = content_based_multiple_recommendations(7, filtered_train_data, test_df, articles_product_code_df, articles_product_type_df)

In [None]:
# Report each metric at the selected list lengths (lists are 0-based, k is 1-based).
k_list = [10, 50, 100]
for metric_name, metric_vals in (('MAP', mapk_vals_cb),
                                 ('Avg. Precision', avg_precision_vals_cb),
                                 ('Avg. Recall', avg_recall_vals_cb)):
    print(metric_name)
    for val in k_list:
        print('k='+ str(val) +': '+str(metric_vals[val-1]))

### Evaluating ALS