In [2]:
import pandas as pd
import numpy as np

In [3]:
# read sample transaction data
data = pd.read_csv('../../data/transaction_data_sample.csv')
data.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price
0,2018-09-20,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,652075001,0.011847
1,2018-09-20,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,670295001,0.010153


In [4]:
# read item metadata for reference
items = pd.read_csv('../../data/articles.csv')
items = items[['article_id', 'prod_name', 'product_type_name', 'product_group_name', 
                     'graphical_appearance_name', 'index_group_name', 'section_name',
                     'colour_group_name', 'perceived_colour_value_name']].copy()

# combining all columns except the id and performing ordered deduplication
items['combined'] = items.iloc[:,1:].agg(' '.join, axis=1)
items = items[['article_id', 'combined']].copy()

items.head(2)

Unnamed: 0,article_id,combined
0,108775015,Strap top Vest top Garment Upper body Solid La...
1,108775044,Strap top Vest top Garment Upper body Solid La...


In [5]:
i2i = data[['customer_id', 'article_id']].copy()
i2i = i2i.merge(items[['article_id', 'combined']], on=['article_id'], how='left')
pidcount = dict(i2i.groupby(['article_id']).size().reset_index(name='count').values)

i2i = i2i.merge(i2i, on=['customer_id'])
i2i = i2i[i2i['article_id_x'] != i2i['article_id_y']].copy()
i2i.head(2)

Unnamed: 0,customer_id,article_id_x,combined_x,article_id_y,combined_y
1,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,652075001,Stork fancy T-shirt Garment Upper body Solid D...,670295001,CSP Hackney tank Vest top Garment Upper body S...
2,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,652075001,Stork fancy T-shirt Garment Upper body Solid D...,678153002,Stud Jumper Sweater Garment Upper body Melange...


# Co-occurence

In [6]:
i2i = i2i.groupby(['article_id_x', 'article_id_y', 'combined_x', 'combined_y']).size().reset_index(name='co_occurence')
i2i = i2i[i2i['co_occurence'] > 1].copy()
co_occurence = i2i.sort_values(['article_id_x', 'co_occurence'], ascending=[True, False]).groupby(['article_id_x'], sort=False).head(10)

In [34]:
co_occurence = data[['customer_id', 'article_id']].drop_duplicates().merge(co_occurence.rename(columns={"article_id_x": 'article_id'}), on=['article_id'])

co_occurence = co_occurence.groupby(["customer_id", "article_id_y"]).agg({"co_occurence": "sum"}).reset_index()

co_occurence = co_occurence[['customer_id', 'article_id_y', 'co_occurence']].rename(columns={'article_id_y': 'article_id'})
co_occurence.sort_values(["customer_id", "co_occurence"], ascending=[True, False], inplace=True)
co_occurence.head()

Unnamed: 0,customer_id,article_id,co_occurence
60,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016002,184
61,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016003,101
63,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016015,81
62,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016006,66
29,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,539723001,56


In [35]:
co_occurence.to_csv('./co_occurence_recommendation.csv', index=False)

# Collaborative Filtering

In [36]:
i2i['union'] = i2i['article_id_x'].map(pidcount) +  i2i['article_id_y'].map(pidcount) - i2i['co_occurence']

i2i['cf_score'] = i2i['co_occurence'] / i2i['union']
i2i = i2i.sort_values(['article_id_x', 'cf_score'], ascending=[True, False]).groupby(['article_id_x'], sort=False).head(10)

i2i.head(2)

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence,union,cf_score
0,108775015,108775044,Strap top Vest top Garment Upper body Solid La...,Strap top Vest top Garment Upper body Solid La...,14,122,0.114754
339,108775015,538699001,Strap top Vest top Garment Upper body Solid La...,V-neck strap top Vest top Garment Upper body S...,16,147,0.108844


In [39]:
cf = data[['customer_id', 'article_id']].drop_duplicates().merge(i2i.rename(columns={"article_id_x": 'article_id'}), on=['article_id'])

cf = cf.groupby(["customer_id", "article_id_y"]).agg({"co_occurence": "sum", "union": "sum"}).reset_index()
cf["cf_score"] = cf["co_occurence"] / cf["union"]

cf = cf[['customer_id', 'article_id_y', 'cf_score']].rename(columns={'article_id_y': 'article_id'})
cf.sort_values(["customer_id", "cf_score"], ascending=[True, False], inplace=True)
cf.head()

Unnamed: 0,customer_id,article_id,cf_score
15,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,524529010,0.24
42,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016002,0.232759
71,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,789060001,0.222222
19,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,554479005,0.217105
93,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,909924001,0.181818


In [40]:
cf.to_csv('./collaborative_filtering.csv', index=False)

# Qualitative Analyis

#### Co-occurence

In [8]:
co_occurence[co_occurence['article_id_x']==108775015].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence
339,108775015,538699001,Strap top Vest top Garment Upper body Solid La...,V-neck strap top Vest top Garment Upper body S...,16
0,108775015,108775044,Strap top Vest top Garment Upper body Solid La...,Strap top Vest top Garment Upper body Solid La...,14
92,108775015,372860001,Strap top Vest top Garment Upper body Solid La...,7p Basic Shaftless Socks Socks & Tights Solid ...,12
340,108775015,538699007,Strap top Vest top Garment Upper body Solid La...,V-neck strap top Vest top Garment Upper body S...,9
407,108775015,554772002,Strap top Vest top Garment Upper body Solid La...,Bob V-neck T-shirt Garment Upper body Solid La...,9


In [33]:
co_occurence[co_occurence['article_id_x']==722416001].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence
6770762,722416001,616711011,DIV Xena jacket Blazer Garment Upper body Soli...,ED Baraboom cardigan Cardigan Garment Upper bo...,3
6770791,722416001,664122002,DIV Xena jacket Blazer Garment Upper body Soli...,&DENIM Shaping bootcut Trousers Garment Lower ...,3
6770737,722416001,294008002,DIV Xena jacket Blazer Garment Upper body Soli...,HM+ Cora tee Costumes Garment Full body Solid ...,2
6770751,722416001,573085044,DIV Xena jacket Blazer Garment Upper body Soli...,Madison skinny HW (1) Trousers Garment Lower b...,2
6770770,722416001,617828002,DIV Xena jacket Blazer Garment Upper body Soli...,DIV Charlie Aline skirt Skirt Garment Lower bo...,2


#### Collaborative Filtering

In [37]:
i2i[i2i['article_id_x']==108775015].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence,union,cf_score
0,108775015,108775044,Strap top Vest top Garment Upper body Solid La...,Strap top Vest top Garment Upper body Solid La...,14,122,0.114754
339,108775015,538699001,Strap top Vest top Garment Upper body Solid La...,V-neck strap top Vest top Garment Upper body S...,16,147,0.108844
340,108775015,538699007,Strap top Vest top Garment Upper body Solid La...,V-neck strap top Vest top Garment Upper body S...,9,120,0.075
1123,108775015,670691003,Strap top Vest top Garment Upper body Solid La...,PE HEIDI CASHMERE Sweater Garment Upper body S...,6,81,0.074074
407,108775015,554772002,Strap top Vest top Garment Upper body Solid La...,Bob V-neck T-shirt Garment Upper body Solid La...,9,126,0.071429


In [38]:
i2i[i2i['article_id_x']==722416001].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence,union,cf_score
6770791,722416001,664122002,DIV Xena jacket Blazer Garment Upper body Soli...,&DENIM Shaping bootcut Trousers Garment Lower ...,3,4,0.75
6770837,722416001,749665002,DIV Xena jacket Blazer Garment Upper body Soli...,&DENIM Embrace denim shorts Shorts Garment Low...,2,3,0.666667
6770857,722416001,780204001,DIV Xena jacket Blazer Garment Upper body Soli...,DIV Boy denim shorts Shorts Garment Lower body...,2,3,0.666667
6770770,722416001,617828002,DIV Xena jacket Blazer Garment Upper body Soli...,DIV Charlie Aline skirt Skirt Garment Lower bo...,2,4,0.5
6770776,722416001,627108001,DIV Xena jacket Blazer Garment Upper body Soli...,Amy padded bra BIG Bra Underwear Solid Ladiesw...,2,4,0.5
