In [2]:
import pandas as pd
import numpy as np

In [41]:
# read sample transaction data
data = pd.read_csv("../../data/transactions.csv")
data.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price
0,2018-09-20,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,652075001,0.011847
1,2018-09-20,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,670295001,0.010153


In [42]:
# read item metadata for reference
items = pd.read_csv("../../data/articles.csv")
items = items[
    [
        "article_id",
        "prod_name",
        "product_type_name",
        "product_group_name",
        "graphical_appearance_name",
        "index_group_name",
        "section_name",
        "colour_group_name",
        "perceived_colour_value_name",
    ]
].copy()

# combining all columns except the id and performing ordered deduplication
items["combined"] = items.iloc[:, 1:].agg(" ".join, axis=1)
items = items[["article_id", "combined"]].copy()

items.head(2)

Unnamed: 0,article_id,combined
0,108775015,Strap top Vest top Garment Upper body Solid La...
1,108775044,Strap top Vest top Garment Upper body Solid La...


In [43]:
i2i = data[["customer_id", "article_id"]].copy()
i2i = i2i.merge(items[["article_id", "combined"]], on=["article_id"], how="left")
pidcount = dict(i2i.groupby(["article_id"]).size().reset_index(name="count").values)

i2i = i2i.merge(i2i, on=["customer_id"])
i2i = i2i[i2i["article_id_x"] != i2i["article_id_y"]].copy()
i2i.head(2)

Unnamed: 0,customer_id,article_id_x,combined_x,article_id_y,combined_y
1,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,652075001,Stork fancy T-shirt Garment Upper body Solid D...,670295001,CSP Hackney tank Vest top Garment Upper body S...
2,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,652075001,Stork fancy T-shirt Garment Upper body Solid D...,678153002,Stud Jumper Sweater Garment Upper body Melange...


# Co-occurence

In [44]:
i2i = (
    i2i.groupby(["article_id_x", "article_id_y", "combined_x", "combined_y"])
    .size()
    .reset_index(name="co_occurence")
)
i2i = i2i[i2i["co_occurence"] > 1].copy()
co_occur = (
    i2i.sort_values(["article_id_x", "co_occurence"], ascending=[True, False])
    .groupby(["article_id_x"], sort=False)
    .head(10)
)

In [45]:
co_occurence = (
    data[["customer_id", "article_id"]]
    .drop_duplicates()
    .merge(co_occur.rename(columns={"article_id_x": "article_id"}), on=["article_id"])
)

co_occurence = (
    co_occurence.groupby(["customer_id", "article_id_y"])
    .agg({"co_occurence": "sum"})
    .reset_index()
)

co_occurence = co_occurence[["customer_id", "article_id_y", "co_occurence"]].rename(
    columns={"article_id_y": "article_id"}
)
co_occurence.sort_values(
    ["customer_id", "co_occurence"], ascending=[True, False], inplace=True
)
co_occurence.head()

Unnamed: 0,customer_id,article_id,co_occurence
60,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016002,184
61,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016003,101
63,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016015,81
62,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016006,66
29,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,539723001,56


In [35]:
co_occurence.to_csv("./co_occurence_recommendation.csv", index=False)

# Collaborative Filtering

In [46]:
i2i["union"] = (
    i2i["article_id_x"].map(pidcount)
    + i2i["article_id_y"].map(pidcount)
    - i2i["co_occurence"]
)

i2i["cf_score"] = i2i["co_occurence"] / i2i["union"]
i2i = (
    i2i.sort_values(["article_id_x", "cf_score"], ascending=[True, False])
    .groupby(["article_id_x"], sort=False)
    .head(10)
)

i2i.head(2)

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence,union,cf_score
0,108775015,108775044,Strap top Vest top Garment Upper body Solid La...,Strap top Vest top Garment Upper body Solid La...,14,122,0.114754
339,108775015,538699001,Strap top Vest top Garment Upper body Solid La...,V-neck strap top Vest top Garment Upper body S...,16,147,0.108844


In [39]:
cf = (
    data[["customer_id", "article_id"]]
    .drop_duplicates()
    .merge(i2i.rename(columns={"article_id_x": "article_id"}), on=["article_id"])
)

cf = (
    cf.groupby(["customer_id", "article_id_y"])
    .agg({"co_occurence": "sum", "union": "sum"})
    .reset_index()
)
cf["cf_score"] = cf["co_occurence"] / cf["union"]

cf = cf[["customer_id", "article_id_y", "cf_score"]].rename(
    columns={"article_id_y": "article_id"}
)
cf.sort_values(["customer_id", "cf_score"], ascending=[True, False], inplace=True)
cf.head()

Unnamed: 0,customer_id,article_id,cf_score
15,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,524529010,0.24
42,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,706016002,0.232759
71,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,789060001,0.222222
19,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,554479005,0.217105
93,000fa62c9e64d11bc25c530736949fd8dfc9a39d50c453...,909924001,0.181818


In [40]:
cf.to_csv("./collaborative_filtering.csv", index=False)

# Qualitative Analyis

##### Co-occurence

In [50]:
co_occur[co_occur["article_id_x"] == 776237011].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence
9169438,776237011,599580038,Shake it in Balconette Bikini top Swimwear Sol...,Timeless Midrise Brief Swimwear bottom Swimwea...,57
9170556,776237011,776237006,Shake it in Balconette Bikini top Swimwear Sol...,Shake it in Balconette Bikini top Swimwear Sol...,10
9170739,776237011,806225008,Shake it in Balconette Bikini top Swimwear Sol...,All That Jazz Push Up Bra Bikini top Swimwear ...,10
9169070,776237011,351484002,Shake it in Balconette Bikini top Swimwear Sol...,Lazer Razer Brief Swimwear bottom Swimwear Sol...,9
9169444,776237011,599580052,Shake it in Balconette Bikini top Swimwear Sol...,Timeless Midrise Brief Swimwear bottom Swimwea...,9


In [64]:
co_occur[co_occur["article_id_x"] == 608776002].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence
2480841,608776002,372860001,Scallop 5p Socks Socks Socks & Tights Solid La...,7p Basic Shaftless Socks Socks & Tights Solid ...,49
2480842,608776002,372860002,Scallop 5p Socks Socks Socks & Tights Solid La...,7p Basic Shaftless Socks Socks & Tights Solid ...,29
2480680,608776002,160442007,Scallop 5p Socks Socks Socks & Tights Solid La...,3p Sneaker Socks Socks Socks & Tights Solid La...,22
2480665,608776002,111586001,Scallop 5p Socks Socks Socks & Tights Solid La...,Shape Up 30 den 1p Tights Leggings/Tights Garm...,17
2481992,608776002,608776003,Scallop 5p Socks Socks Socks & Tights Solid La...,Scallop 5p Socks Socks Socks & Tights Contrast...,17


##### Collaborative Filtering

In [66]:
i2i[i2i["article_id_x"] == 776237011].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence,union,cf_score
9169438,776237011,599580038,Shake it in Balconette Bikini top Swimwear Sol...,Timeless Midrise Brief Swimwear bottom Swimwea...,57,108,0.527778
9170739,776237011,806225008,Shake it in Balconette Bikini top Swimwear Sol...,All That Jazz Push Up Bra Bikini top Swimwear ...,10,87,0.114943
9170556,776237011,776237006,Shake it in Balconette Bikini top Swimwear Sol...,Shake it in Balconette Bikini top Swimwear Sol...,10,92,0.108696
9169344,776237011,571706010,Shake it in Balconette Bikini top Swimwear Sol...,Timeless High Rise Hipster Swimwear bottom Swi...,7,74,0.094595
9169569,776237011,633778001,Shake it in Balconette Bikini top Swimwear Sol...,Halloween SPEED A-band Bat Hair/alice band Acc...,6,68,0.088235


In [65]:
i2i[i2i["article_id_x"] == 608776002].head()

Unnamed: 0,article_id_x,article_id_y,combined_x,combined_y,co_occurence,union,cf_score
2480841,608776002,372860001,Scallop 5p Socks Socks Socks & Tights Solid La...,7p Basic Shaftless Socks Socks & Tights Solid ...,49,567,0.08642
2485325,608776002,761968002,Scallop 5p Socks Socks Socks & Tights Solid La...,Elisabeth dress (1) Dress Garment Full body So...,15,239,0.062762
2481992,608776002,608776003,Scallop 5p Socks Socks Socks & Tights Solid La...,Scallop 5p Socks Socks Socks & Tights Contrast...,17,279,0.060932
2480842,608776002,372860002,Scallop 5p Socks Socks Socks & Tights Solid La...,7p Basic Shaftless Socks Socks & Tights Solid ...,29,514,0.05642
2480680,608776002,160442007,Scallop 5p Socks Socks Socks & Tights Solid La...,3p Sneaker Socks Socks Socks & Tights Solid La...,22,409,0.05379
