# Importing Libraries

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from warnings import filterwarnings
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
filterwarnings('ignore')

# Importing Datasets

The dataset is taken from Kaggle (https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations)

In [2]:
articles = pd.read_csv("articles.csv")
customers = pd.read_csv("customers.csv")
transactions = pd.read_csv("transactions_train.csv")

## Articles Dataset

This dataset contains products and related information about them. Rows with null values were removed from the data set.

In [3]:
articles = articles.dropna()

In [4]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105126 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105126 non-null  int64 
 1   product_code                  105126 non-null  int64 
 2   prod_name                     105126 non-null  object
 3   product_type_no               105126 non-null  int64 
 4   product_type_name             105126 non-null  object
 5   product_group_name            105126 non-null  object
 6   graphical_appearance_no       105126 non-null  int64 
 7   graphical_appearance_name     105126 non-null  object
 8   colour_group_code             105126 non-null  int64 
 9   colour_group_name             105126 non-null  object
 10  perceived_colour_value_id     105126 non-null  int64 
 11  perceived_colour_value_name   105126 non-null  object
 12  perceived_colour_master_id    105126 non-null  int64 
 13 

Only NLP-related variables were selected from the dataset and all those variables containing text were combined in one column with the name "text". Since various numeric values will not be used, they were not selected.

In [5]:
articles["text"] = articles["prod_name"].map(str) + " " + articles["product_type_name"] +" "+ articles["product_group_name"]+ " "+ articles['graphical_appearance_name']+" "+ articles['colour_group_name'] +" "+ articles['perceived_colour_value_name']+ " " + articles["perceived_colour_master_name"] +" "+ articles["department_name"]+ " "+ articles['index_name']+" "+articles['index_group_name'] +" "+articles['section_name']+ " "+ articles['garment_group_name']+" "+articles['detail_desc']
articles.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,text
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,Strap top Vest top Garment Upper body Solid Bl...
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,Strap top Vest top Garment Upper body Solid Wh...


Finally, a dataframe created only includes 'article_id', 'product_code', 'text' columns.

In [6]:
df_all = articles[['article_id', 'product_code', 'text']]

In [7]:
df_all.head()

Unnamed: 0,article_id,product_code,text
0,108775015,108775,Strap top Vest top Garment Upper body Solid Bl...
1,108775044,108775,Strap top Vest top Garment Upper body Solid Wh...
2,108775051,108775,Strap top (1) Vest top Garment Upper body Stri...
3,110065001,110065,OP T-shirt (Idro) Bra Underwear Solid Black Da...
4,110065002,110065,OP T-shirt (Idro) Bra Underwear Solid White Li...


The text variable needs to be cleared for NLP implementation. For this reason, the necessary files have been downloaded.

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/berkeakkaya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/berkeakkaya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/berkeakkaya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/berkeakkaya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Text cleaning function defined and applied on text variable

In [9]:
stop = stopwords.words('english')
stop_words_ = set(stopwords.words('english'))
wn = WordNetLemmatizer()

def black_txt(token):
    return  token not in stop_words_ and token not in list(string.punctuation)  and len(token)>2   
  
def clean_txt(text):
  clean_text = []
  clean_text2 = []
  text = re.sub("'", "",text)
  text=re.sub("(\\d|\\W)+"," ",text) 
  text = text.replace("nbsp", "")
  clean_text = [ wn.lemmatize(word, pos="v") for word in word_tokenize(text.lower()) if black_txt(word)]
  clean_text2 = [word for word in clean_text if black_txt(word)]
  return " ".join(clean_text2)

In [10]:
df_all['text'] = df_all['text'].apply(clean_txt)

In [11]:
df_all.head()

Unnamed: 0,article_id,product_code,text
0,108775015,108775,strap top vest top garment upper body solid bl...
1,108775044,108775,strap top vest top garment upper body solid wh...
2,108775051,108775,strap top vest top garment upper body stripe w...
3,110065001,110065,shirt idro bra underwear solid black dark blac...
4,110065002,110065,shirt idro bra underwear solid white light whi...


Initializing tfidf vectorizer for articles, fitting and transforming the vector

In [12]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_article = tfidf_vectorizer.fit_transform((df_all['text'])) 
tfidf_article

<105126x13313 sparse matrix of type '<class 'numpy.float64'>'
	with 2944800 stored elements in Compressed Sparse Row format>

## Transactions Dataset

In [13]:
transactions = transactions.dropna()

Sorting the dataset by customer id to see all of a customer's purchases

In [14]:
transactions =  transactions.sort_values(by='customer_id')
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
14479579,2019-07-25,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,745232001,0.021169,1
23574622,2020-03-21,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,890498002,0.031763,2
23574621,2020-03-21,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,887593002,0.021593,2
23574620,2020-03-21,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,841260003,0.011508,2
4212358,2018-12-27,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001,0.044051,1


Merging the text&product_id (df_all) dataset with transactions dataset to match article_ids with purchases made by a customer.

In [15]:
merged_df = df_all.merge(transactions, how = 'inner', on = ['article_id'])

The text information of all the products purchased by the user are gathered in the same 'text' variable.

In [16]:
merged_df2 = merged_df.groupby('customer_id', sort=False)['text'].apply(' '.join).reset_index()
merged_df2.head(5)

Unnamed: 0,customer_id,text
0,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,strap top vest top garment upper body solid bl...
1,001ae5408a043f64bccd32beffe2730151414cbdf18a6e...,strap top vest top garment upper body solid bl...
2,001ba9e81e13ce12a2585d9ebde923fe74429e9e12ea59...,strap top vest top garment upper body solid bl...
3,0022a721371d5949d174ecba60346d89a9d6c08c0fba4f...,strap top vest top garment upper body solid bl...
4,002323971cbd38fad4512d5114676e5e17eb262db02320...,strap top vest top garment upper body solid bl...


# **Recommendation**

A random customer_id was chosen to make a reccommendation

In [17]:
u = "000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318" #customer_id
index = np.where(merged_df2['customer_id'] == u)[0][0]
cust_q = merged_df2.iloc[[index]]
cust_q

Unnamed: 0,customer_id,text
317613,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,lazer razer brief swimwear bottom swimwear sol...


### Products user bought before

In [18]:
transactions.loc[transactions['customer_id'] == u]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
24773518,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,858883002,0.030492,2
24773520,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,750424014,0.042356,2
24773521,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,750424014,0.042356,2
24773522,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,870304002,0.033881,2
24773523,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,870304002,0.033881,2
24773524,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,852643001,0.025407,2
24773525,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,852643003,0.025407,2
21953952,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,351484002,0.022017,2
21953951,2020-02-03,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,351484002,0.022017,2
24773519,2020-04-18,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,851400006,0.059305,2


## Define a Reccommendation Function 

Recommendation function includes customer ID, article ID, product code, description and similarity score.

In [19]:
def recommendation_product(top, df_all, scores):
  recommendation = pd.DataFrame(columns = ['customer_id', 'article_id',  'product_code', 'detail_desc', 'score'])
  count = 0
  for i in top:
      recommendation.at[count, 'customer_id'] = u
      recommendation.at[count, 'article_id'] = df_all['article_id'][i]
      recommendation.at[count, 'product_code'] = df_all['product_code'][i]
      recommendation.at[count, 'detail_desc'] = articles['detail_desc'][i]   
      recommendation.at[count, 'score'] =  scores[count]
      count += 1
  return recommendation

## Calculating Cosine Similarity for the User

In [20]:
user_tfidf = tfidf_vectorizer.transform(cust_q['text'])
cos_similarity_tfidf = map(lambda x: cosine_similarity(user_tfidf, x),tfidf_article)

In [21]:
output2 = list(cos_similarity_tfidf)

## Recommendations with TFIDF

In [23]:
top = sorted(range(len(output2)), key=lambda i: output2[i], reverse=True)[:10]
tf_list_scores = [output2[i][0][0] for i in top]
recommendation_product(top, df_all, tf_list_scores)

Unnamed: 0,customer_id,article_id,product_code,detail_desc,score
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,821648004,821648,Quilted top in sturdy sweatshirt fabric with a...,0.557127
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,821648003,821648,Quilted top in sturdy sweatshirt fabric with a...,0.544008
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,662879008,662879,Fancy dress cape in jersey with a concealed ho...,0.536211
3,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,721991003,721991,Long-sleeved top in cotton jersey with a print...,0.535286
4,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,721991004,721991,Long-sleeved top in cotton jersey with a print...,0.523437
5,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,458428031,458428,5-pocket jeans in washed stretch denim with a ...,0.516344
6,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,458428037,458428,5-pocket jeans in washed stretch denim with a ...,0.516344
7,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,865034001,865034,"Short, wide dress in airy, patterned chiffon w...",0.506835
8,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,589832001,589832,"Short, fitted off-the-shoulder dress in stretc...",0.50571
9,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,756859003,756859,Romper suit in soft cotton jersey with a print...,0.505697


In [24]:
tf_idf_score=pd.DataFrame(recommendation_product(top, df_all, tf_list_scores), columns = ['article_id', 'score'])

## Reccomendations with CountVectorizer

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()

count_artid = count_vectorizer.fit_transform((df_all['text'])) #fitting and transforming the vector
count_artid

<105126x13313 sparse matrix of type '<class 'numpy.int64'>'
	with 2944800 stored elements in Compressed Sparse Row format>

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
user_count = count_vectorizer.transform(cust_q['text'])
cos_similarity_countv = map(lambda x: cosine_similarity(user_count, x),count_artid)

In [27]:
output3 = list(cos_similarity_countv)

In [28]:
top = sorted(range(len(output3)), key=lambda i: output3[i], reverse=True)[:10]
list_scores_cv = [output3[i][0][0] for i in top]
recommendation_product(top, df_all, list_scores_cv)

Unnamed: 0,customer_id,article_id,product_code,detail_desc,score
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541491004,541491,Round-necked jersey T-shirt in a cotton blend.,0.658103
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,709666001,709666,Fitted jumper in a soft rib knit with a V-neck...,0.603851
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,840566001,840566,Polo-neck jumper in a soft rib knit containing...,0.601954
3,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,665766002,665766,Chinos in washed stretch cotton twill with an ...,0.600663
4,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,696427006,696427,Top in sweatshirt fabric with a print motif on...,0.594416
5,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,679287003,679287,"Soft, non-wired microfibre bra with lace detai...",0.592057
6,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,767090001,767090,"5-pocket jeans in washed, stretch denim with a...",0.589555
7,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,857545002,857545,Waterproof trainers in mesh with imitation lea...,0.589555
8,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,761414002,761414,Shorts in a cotton weave with broderie anglais...,0.588778
9,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,702842010,702842,T-shirt in cotton jersey with a print motif on...,0.587721


In [29]:
cv_score=pd.DataFrame(recommendation_product(top, df_all, list_scores_cv), columns = ['article_id', 'score'])

## Reccommendations with KNN

In [30]:
from sklearn.neighbors import NearestNeighbors
KNN = NearestNeighbors(n_neighbors=11)
KNN.fit(tfidf_article)
NNs = KNN.kneighbors(user_tfidf, return_distance=True) 

In [31]:
top = NNs[1][0][1:]
index_score = NNs[0][0][1:]
recommendation_product(top, df_all, index_score)

Unnamed: 0,customer_id,article_id,product_code,detail_desc,score
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,821648003,821648,Quilted top in sturdy sweatshirt fabric with a...,0.954979
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,662879008,662879,Fancy dress cape in jersey with a concealed ho...,0.963108
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,721991003,721991,Long-sleeved top in cotton jersey with a print...,0.964068
3,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,721991004,721991,Long-sleeved top in cotton jersey with a print...,0.976282
4,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,458428031,458428,5-pocket jeans in washed stretch denim with a ...,0.98352
5,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,458428037,458428,5-pocket jeans in washed stretch denim with a ...,0.98352
6,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,865034001,865034,"Short, wide dress in airy, patterned chiffon w...",0.993141
7,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,589832001,589832,"Short, fitted off-the-shoulder dress in stretc...",0.994274
8,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,756859003,756859,Romper suit in soft cotton jersey with a print...,0.994287
9,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,721991006,721991,Long-sleeved top in cotton jersey with a print...,0.994805


In [32]:
knn_score=pd.DataFrame(recommendation_product(top, df_all, index_score), columns = ['article_id', 'score'])

# Comparison 

In [33]:
tf_idf_score=tf_idf_score.rename(columns={"score":"tf_idf_score"})
cv_score=cv_score.rename(columns={"score":"cv_score"})
knn_score=knn_score.rename(columns={"score":"knn_score"})

In [34]:
pd.concat([tf_idf_score, cv_score, knn_score], axis=1)

Unnamed: 0,article_id,tf_idf_score,article_id.1,cv_score,article_id.2,knn_score
0,821648004,0.557127,541491004,0.658103,821648003,0.954979
1,821648003,0.544008,709666001,0.603851,662879008,0.963108
2,662879008,0.536211,840566001,0.601954,721991003,0.964068
3,721991003,0.535286,665766002,0.600663,721991004,0.976282
4,721991004,0.523437,696427006,0.594416,458428031,0.98352
5,458428031,0.516344,679287003,0.592057,458428037,0.98352
6,458428037,0.516344,767090001,0.589555,865034001,0.993141
7,865034001,0.506835,857545002,0.589555,589832001,0.994274
8,589832001,0.50571,761414002,0.588778,756859003,0.994287
9,756859003,0.505697,702842010,0.587721,721991006,0.994805


It seems that while knn and tf-idf make **almost** the same recommendations, the system based on countvectorizer makes different recommendations.