## Content-based recommendation (using cosine similarity)
### Use article descriptions to compute the similarity between articles

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#read the Menswear transaction data set (path is relative to the notebook's working directory)
transaction_men_raw = pd.read_csv("../data/large_data/transaction_2019_567_Menswear.csv")
#preview the first rows to check that the load worked
transaction_men_raw.head()

Unnamed: 0.1,Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,month,week,product_code,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,60,2019-05-01,00357b192b81fc83261a45be87f5f3d59112db7d117513...,743719001,0.050831,2,2019,5,18,743719,...,Shoes,F,Menswear,3,Menswear,27,Men Shoes,1020,Shoes,Cotton canvas trainers with a padded edge and ...
1,140,2019-05-01,0083ee250b3845008465de0e938d0ed2ae4f5bfde8b56e...,507431031,0.015237,2,2019,5,18,507431,...,Jersey inactive from s1,F,Menswear,3,Menswear,55,Contemporary Street,1005,Jersey Fancy,T-shirt in hard-washed slub cotton jersey with...
2,141,2019-05-01,0083ee250b3845008465de0e938d0ed2ae4f5bfde8b56e...,378447036,0.05422,2,2019,5,18,378447,...,Knitwear,F,Menswear,3,Menswear,23,Men Suits & Tailoring,1003,Knitwear,Jumper in fine-knit merino wool with a V-neck ...
3,142,2019-05-01,0083ee250b3845008465de0e938d0ed2ae4f5bfde8b56e...,657850001,0.030492,2,2019,5,18,657850,...,Knitwear,F,Menswear,3,Menswear,20,Contemporary Smart,1003,Knitwear,Jumper in a textured-knit wool blend with long...
4,143,2019-05-01,0083ee250b3845008465de0e938d0ed2ae4f5bfde8b56e...,598755015,0.013542,2,2019,5,18,598755,...,Light Basic Jersey,F,Menswear,3,Menswear,26,Men Underwear,1002,Jersey Basic,"Long, round-necked T-shirt in soft jersey with..."


In [3]:
#list every column name of the raw transaction frame
print(transaction_men_raw.columns)

Index(['Unnamed: 0', 't_dat', 'customer_id', 'article_id', 'price',
       'sales_channel_id', 'year', 'month', 'week', 'product_code',
       'prod_name', 'product_type_no', 'product_type_name',
       'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')


In [28]:
#text columns that are glued together (in this order) into one description per row
description_columns = [
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'section_name',
    'garment_group_name',
    'detail_desc',
]

#join the columns with single spaces; a missing value in any column makes the
#whole description missing for that row
details = transaction_men_raw[description_columns[0]]
for column in description_columns[1:]:
    details = details + " " + transaction_men_raw[column]
transaction_men_raw['details_description'] = details
 

In [30]:
#count how many transaction rows share each concatenated description
transaction_men_raw.details_description.value_counts()

T-shirt Garment Upper body Solid White Light White Light Basic Jersey Men Underwear Jersey Basic Round-necked T-shirt in soft cotton jersey.                                                                                                                                                                                                                                                                                                      1871
T-shirt Garment Upper body Front print White Light White Jersey Fancy Contemporary Smart Jersey Fancy T-shirt in printed cotton jersey.                                                                                                                                                                                                                                                                                                           1732
T-shirt Garment Upper body Solid Black Dark Black Light Basic Jersey Men Underwear Jersey Basic Round-necked T-shirt in so

In [5]:
#distinct free-text descriptions (value_counts() leaves out missing values)
content = transaction_men_raw.detail_desc.value_counts().index

#TF-IDF encode every distinct description
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(content)
#NOTE: a bare vectorizer.get_feature_names() call used to sit here. Its result was
#discarded, and the method was removed in scikit-learn 1.2, so it crashed the cell
#on current installs. Use vectorizer.get_feature_names_out() to inspect the vocabulary.

#(number of distinct descriptions, vocabulary size)
X.shape

(1923, 978)

In [6]:
#pairwise cosine similarity between all TF-IDF rows of X
similarity = cosine_similarity(X)

In [8]:
#square matrix: one row and one column per encoded description
similarity.shape

(1923, 1923)

In [31]:
##one row per distinct (article_id, details_description) pair; rows with a
##missing value are dropped and the index is renumbered from 0
article_desc_df = (
    transaction_men_raw[['article_id', 'details_description']]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
article_desc_df

Unnamed: 0,article_id,details_description
0,743719001,Sneakers Shoes Solid Grey Medium Dusty Grey Sh...
1,507431031,T-shirt Garment Upper body Treatment Black Dar...
2,378447036,Sweater Garment Upper body Melange Dark Blue D...
3,657850001,Sweater Garment Upper body Melange Dark Grey D...
4,598755015,T-shirt Garment Upper body Melange Greenish Kh...
...,...,...
5219,652346002,Underwear bottom Underwear Check Light Red Med...
5220,809223001,Trousers Garment Lower body Solid Greenish Kha...
5221,755780003,Sweater Garment Upper body Colour blocking Whi...
5222,786161002,Underwear bottom Underwear Solid White Light W...


In [32]:
#TF-IDF encode the concatenated description of every article
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(article_desc_df['details_description'])
#(number of articles, vocabulary size)
X.shape

(5224, 1081)

In [73]:
#article-by-article cosine similarity; row i / column j follow the row order of article_desc_df
similarity = cosine_similarity(X)
similarity.shape

(5224, 5224)

In [74]:
#similarities of the first article (row 0) to every article, itself included
similarity[0]

array([1.        , 0.08852791, 0.03685904, ..., 0.02976935, 0.0310341 ,
       0.07560741])

In [79]:
#same row sorted ascending, to see the range of similarity values
np.sort(similarity[0])

array([0.        , 0.00281989, 0.00288001, ..., 0.90912136, 0.92070483,
       1.        ])

In [37]:
#for every article (row), the column indices ordered from most to least similar:
#argsort ranks each row ascending, reversing the columns makes it descending
ascending_sim_index = np.argsort(similarity)
sorted_sim_index = ascending_sim_index[:, ::-1]

In [41]:
#neighbour ranking (most similar first) for the article in row 1
sorted_sim_index[1]

array([   1, 4450, 1994, ..., 3901, 1171, 3763], dtype=int64)

In [42]:
#the 10 most similar items to the item(index = 0)
query_index = 0
articleID = article_desc_df.article_id.iloc[query_index]

#remove the query item by its index instead of assuming it is ranked first:
#when other articles tie with it on similarity, the sort order among the tied
#entries is not guaranteed, so slicing [1:11] could return the item itself
ranked_neighbours = sorted_sim_index[query_index]
similar_items_index = ranked_neighbours[ranked_neighbours != query_index][:10]

In [43]:
#article_id of the query item
articleID

743719001

In [44]:
#row positions (in article_desc_df) of the selected neighbours
similar_items_index

array([2668, 2889, 5152,  701, 4957, 4979, 3582, 3635, 4188, 4009],
      dtype=int64)

In [48]:
#translate the neighbour row positions back into article_ids
article_desc_df['article_id'].iloc[similar_items_index]

2668    601728002
2889    601728001
5152    601728027
701     601728013
4957    728836001
4979    766099001
3582    728788002
3635    671809009
4188    601728016
4009    728836003
Name: article_id, dtype: int64

In [50]:
#split by calendar week: week 18 is the training window, week 19 the test window
mask_train = transaction_men_raw['week'].eq(18)
mask_test = transaction_men_raw['week'].eq(19)

#training rows keep every column
X_train = transaction_men_raw[mask_train].reset_index(drop=True)
#test rows only keep who bought what
X_test = transaction_men_raw.loc[mask_test, ['customer_id', 'article_id']].reset_index(drop=True)

In [None]:


#number of nearest neighbours kept for every purchased article
N_SIMILAR = 12

#concatenate all text information
#(a missing value in any column makes the whole description missing for that row)
text_columns = ['product_type_name', 'product_group_name', 'graphical_appearance_name',
                'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name',
                'department_name', 'section_name', 'garment_group_name', 'detail_desc']
details = X_train[text_columns[0]]
for column in text_columns[1:]:
    details = details + " " + X_train[column]
X_train['details_description'] = details

#subset article_id and description column
#dropna() removes articles that have no complete description
article_desc_df = X_train[['article_id','details_description']].drop_duplicates().dropna().reset_index(drop=True)

#TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(article_desc_df['details_description'])

#Calculate cosine similarity between articles
similarity = cosine_similarity(X)

#sort index of similarity (most similar first)
sorted_sim_index = np.fliplr(np.argsort(similarity))

#row position of every article in article_desc_df / similarity, built once so the
#loop below does a dict lookup instead of scanning the whole frame per purchase
catalogue_ids = article_desc_df['article_id'].to_numpy()
article_position = {}
for position, article_id in enumerate(catalogue_ids.tolist()):
    #keep the first row of an article_id, like np.where(...)[0][0] would
    article_position.setdefault(article_id, position)

#find customers in training and their purchases items
Customer_IDs = X_train.customer_id.unique()

#dict to store, per customer and per purchased article, the N_SIMILAR most
#similar articles as (article_id, similarity) pairs
#TODO: decide how to merge the per-article candidates into one ranked list per customer
recommend_items = {}

#one pass over the purchases grouped by customer; sort=False keeps the customers
#in order of first appearance
for customer, purchased in X_train.groupby('customer_id', sort=False)['article_id']:
    recommend_items[customer] = {}

    #for each article, find its 12 most similar items' similarities
    for each in purchased.unique().tolist():
        item_index = article_position.get(each)
        if item_index is None:
            #the article was removed by dropna() above, so it has no similarity row;
            #indexing np.where(...)[0][0] for it would raise IndexError
            continue

        #drop the article itself by index; with tied similarities it is not
        #guaranteed to be ranked first
        ranked = sorted_sim_index[item_index]
        neighbours = ranked[ranked != item_index][:N_SIMILAR]

        recommend_items[customer][each] = list(zip(catalogue_ids[neighbours].tolist(),
                                                   similarity[item_index, neighbours].tolist()))
    
    
        

In [61]:
#pick a single customer from the training rows to walk through the lookup by hand
Customer_IDs = X_train.customer_id.unique()
#index 2 is an arbitrary example: the third distinct customer_id
customer = Customer_IDs[2]
customer

'00be0a263381af38132d31225e8fb12fbc527c654b446484fb672a78118f037d'

In [62]:
#article_ids bought by the selected customer in the training rows
purchased = X_train.loc[X_train['customer_id'] == customer, 'article_id']
purchased

14    589549018
15    685604015
16    732675001
17    669385001
Name: article_id, dtype: int64

In [67]:
#inspect the article_id / description lookup table
article_desc_df

Unnamed: 0,article_id,details_description
0,743719001,Sneakers Shoes Solid Grey Medium Dusty Grey Sh...
1,507431031,T-shirt Garment Upper body Treatment Black Dar...
2,378447036,Sweater Garment Upper body Melange Dark Blue D...
3,657850001,Sweater Garment Upper body Melange Dark Grey D...
4,598755015,T-shirt Garment Upper body Melange Greenish Kh...
...,...,...
5219,652346002,Underwear bottom Underwear Check Light Red Med...
5220,809223001,Trousers Garment Lower body Solid Greenish Kha...
5221,755780003,Sweater Garment Upper body Colour blocking Whi...
5222,786161002,Underwear bottom Underwear Solid White Light W...


In [72]:
#row position of article 598755015 in article_desc_df (first match)
#NOTE: [0][0] raises IndexError when the article_id is not in the table
np.where(article_desc_df.article_id == 598755015)[0][0]

4