## Data Cleaning

In [26]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', 50)


import datetime

Let's try the `surprise` library for a collaborative-filtering recommender system. For this we need three columns:
- customer id
- article id
- a 1/0 flag indicating whether the customer bought the article or not

So, for every customer in the transactions df, we want to know which articles they did and did not buy. Only the articles that appear in the transactions df are considered.

Data Cleaning and EDA again

In [None]:
transactions = pd.read_csv('Data/h-and-m-personalized-fashion-recommendations/transactions_train.csv')

In [None]:
transactions.shape

In [None]:
# Parse the raw `t_dat` strings (YYYY-MM-DD) into datetimes stored as `InvoiceDate`.
transactions['InvoiceDate'] = pd.to_datetime(transactions['t_dat'], format='%Y-%m-%d')

# Keep only the five columns used downstream (the raw `t_dat` column is not kept)
# and collapse rows that are identical across all of them.
# NOTE(review): identical rows are presumably repeat purchases of the same article
# on the same day; confirm that discarding them is intended, because the purchase
# counts and spend totals computed later are based on the de-duplicated rows.
transactions = (
    transactions
    .loc[:, ["InvoiceDate", "customer_id", "article_id", "price", "sales_channel_id"]]
    .drop_duplicates()
)

In [None]:
transactions.shape

Shape of the transactions df reduced from 31,788,324 transactions to 28,813,419.

In [None]:
transactions.head(3)

In [None]:
transactions.info()

In [None]:
# unique customer ids in the transactions df 

# len(transactions['customer_id'].unique())
# unique_customers = transactions['customer_id'].unique().tolist()
# unique_customers

In [None]:
# unique article ids in the transactions df 

# len(transactions['article_id'].unique())
# unique_articles = transactions['article_id'].unique().tolist()
# unique_articles

Now we know there are 1,362,281 (1,371,980 from customers df) customers who have transacted with 104,547 articles (105,542 from articles df). Now, we want to add a column in df where we know if a customer bought an article or not. 

In [None]:
# purchase_df = transactions[['customer_id', 'article_id']]
# purchase_df['purchase'] = 1
# purchase_df.reset_index()
# purchase_df.head(3)


In [None]:
# purchase_df.isna().sum()

In [None]:
# unique_customers

In [None]:
# purchase_df.head(1)

In [None]:

# for customer in unique_customers:
# #     print(customer)
#     reqd_index = purchase_df.index[purchase_df['customer_id'] == customer].tolist()
# #     print(customer)
# #     print(reqd_index)
#     articles_bought = [items for items in purchase_df['article_id'][[index for index in reqd_index]]]
# #     print(len(articles_bought))
# #now add rows for customer id and articles not in the articles bought list and 0 in purchase column
#     articles_not_bought = [item for item in unique_articles if item not in articles_bought]
# #     print(len(articles_not_bought))
#     for article in articles_not_bought:
#         row = {'customer_id': customer, 'article_id': article, 'purchase': 0}
#         purchase_df = purchase_df.append(row, ignore_index = True)
# #         break
# #     break

In [None]:
# purchase_df.tail()

In [None]:
# len(unique_articles)

In [None]:
# purchase_df.loc[[23934158]]

In [None]:
# purchase_df.loc[[0]]

# RFM Analysis - 

RFM analysis is a marketing technique used to quantitatively rank and group customers based on the recency, frequency and monetary total of their recent transactions to identify the best customers and perform targeted marketing campaigns.

In [None]:
# import required libraries for clustering
# import sklearn
# from sklearn.preprocessing import StandardScaler
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# from scipy.cluster.hierarchy import linkage
# from scipy.cluster.hierarchy import dendrogram
# from scipy.cluster.hierarchy import cut_tree
# from sklearn.metrics.pairwise import cosine_similarity




In [None]:
# RFM window: only transactions dated on or after 1 March 2020 are kept.
start_date = datetime.datetime(2020, 3, 1)

# Filter transactions by date.
# `t_dat` is (re)built from the already-parsed `InvoiceDate` column and used as the
# filter key.
transactions["t_dat"] = pd.to_datetime(transactions["InvoiceDate"])
# .copy() makes the filtered result an independent frame, so the column assignments
# in later cells write to a real frame instead of raising SettingWithCopyWarning on
# a slice of the unfiltered data.
transactions = transactions.loc[transactions["t_dat"] >= start_date].copy()

In [None]:
# Reference date for the recency calculation, hard-coded so the scores are reproducible.
# Data-driven alternative (one day after the latest transaction):
#   analysis_date = transactions['InvoiceDate'].max() + datetime.timedelta(days=1)
analysis_date = datetime.datetime(year=2020, month=9, day=23)
print(analysis_date.date())

In [None]:
# Duplicate the purchase date under a second name so that the per-customer
# aggregation can reduce the same dates in two ways (latest date and row count).
transactions['date']=transactions['InvoiceDate']
# Sanity check: whole days between the most recent transaction and analysis_date.
(analysis_date - transactions['InvoiceDate'].max()).days

In [None]:
# Per-customer RFM inputs, computed with built-in reductions only (a Python lambda
# would be called once per customer, which is slow with many customers):
#   InvoiceDate -> latest purchase date (converted to days-ago just below)
#   date        -> number of transaction rows
#   price       -> total spend
rfm = transactions.groupby('customer_id').agg({
    'InvoiceDate': 'max',
    'date': 'count',
    'price': 'sum'})

# Recency: whole days between each customer's latest purchase and analysis_date,
# computed for all customers in one vectorised subtraction.
rfm['InvoiceDate'] = (analysis_date - rfm['InvoiceDate']).dt.days

In [None]:
rfm.head()

In [None]:
# Rename by column name rather than by position: the mapping stays correct if the
# aggregation order changes, and re-running the cell after more columns have been
# added is a harmless no-op instead of a length-mismatch error.
rfm = rfm.rename(columns={"InvoiceDate": "Recency", "date": "Frequency", "price": "Monetary"})

# Keep customers with positive total spend. .copy() makes the result an independent
# frame so the score columns added next are not written to a slice
# (avoids SettingWithCopyWarning).
rfm = rfm[rfm["Monetary"] > 0].copy()

In [None]:
# Quintile scores (1-5) for each RFM dimension, added in a single step.
rfm = rfm.assign(
    # Recency: labels are reversed, so the smallest number of days since the last
    # purchase scores 5 and the largest scores 1.
    recency_score=pd.qcut(rfm['Recency'], 5, labels=[5, 4, 3, 2, 1]),
    # Frequency: ranked with method="first" before binning so ties still yield five
    # distinct bin edges; fewest purchases score 1, most score 5.
    frequency_score=pd.qcut(rfm["Frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5]),
    # Monetary: lowest total spend scores 1, highest scores 5.
    monetary_score=pd.qcut(rfm["Monetary"], 5, labels=[1, 2, 3, 4, 5]),
)
rfm.head()

In [None]:
# RFM_SCORE is a two-character string: the recency score followed by the frequency
# score. The monetary score is not part of it.
rfm["RFM_SCORE"] = rfm["recency_score"].astype(str).str.cat(rfm["frequency_score"].astype(str))

In [None]:
# Map each two-character RFM_SCORE to a named customer segment.
# Every key is a regex over the score string: the first character (or character
# class) matches the recency score and the second the frequency score, each in 1-5.
# Together the ten patterns cover all 25 recency/frequency combinations.
# NOTE(review): 'cant_loose' looks like a misspelling of "can't lose"; it is a
# runtime label, so it is left untouched here.
seg_map = {
    r'[1-2][1-2]': 'hibernating',
    r'[1-2][3-4]': 'at_Risk',
    r'[1-2]5': 'cant_loose',
    r'3[1-2]': 'about_to_sleep',
    r'33': 'need_attention',
    r'[3-4][4-5]': 'loyal_customers',
    r'41': 'promising',
    r'51': 'new_customers',
    r'[4-5][2-3]': 'potential_loyalists',
    r'5[4-5]': 'champions'
}
# regex=True makes replace() treat the dict keys as patterns rather than literals.
rfm['segment'] = rfm['RFM_SCORE'].replace(seg_map, regex=True)
rfm.head()

In [None]:
# Per-segment mean, customer count and maximum of each RFM measure, rounded.
(
    rfm
    .groupby("segment")[["Recency", "Frequency", "Monetary"]]
    .agg(["mean", "count", "max"])
    .round()
)

In [None]:
import plotly.express as px

In [None]:
# Number of customers in each RFM segment.
x = rfm.segment.value_counts()
# One treemap level made of the segment names (the index of x), with the customer
# counts passed as the tile values.
fig = px.treemap(x, path=[x.index], values=x)
fig.update_layout(title_text='Distribution of the RFM Segments', title_x=0.5,
                  title_font=dict(size=20))
# Label each tile with the segment name, its count, and its share of the total.
fig.update_traces(textinfo="label+value+percent root")
fig.show()

### Item-Based Collaborative Filtering - using Probabilistic Matrix Factorization

**Preparing the data.** We need to restrict the data with respect to a minimum transaction date. That way, we reduce the dimensionality of the problem and get rid of transactions that are no longer important in terms of time-decaying popularity.

Also, we are getting rid of articles that have not been bought often enough: an article is kept only if it has more than 10 purchases in the filtered data (the filter below uses `count > 10`).

In [None]:
# Turn the `customer_id` index produced by the groupby into a regular column so it
# can be used as the merge key against `transactions`.
# Note: running this cell a second time would add a spurious `index` column.
rfm=rfm.reset_index()

In [None]:
 transactions.head(1)

In [None]:
rfm.head(1)

In [None]:
# Segments whose customers are kept for training the recommender.
training_segment = ['champions', 'potential_loyalists', 'new_customers','promising','loyal_customers']

# Attach each customer's segment to their transactions (the inner join also drops
# customers that have no row in `rfm`), keep only the training segments, then
# remove the helper column again.
transactions = (
    transactions
    .merge(rfm[["customer_id", "segment"]], how='inner', on='customer_id')
    .loc[lambda frame: frame['segment'].isin(training_segment)]
    .drop(columns='segment')
)

In [None]:
transactions.head(2)

In [None]:
transactions.shape

In [None]:
# Training window: only transactions dated on or after 1 September 2020 are kept.
# Note that this rebinds `start_date`, which held the RFM window start before.
start_date = datetime.datetime(year=2020, month=9, day=1)

# Keep `t_dat` mirroring `InvoiceDate`, then filter the rows by date.
transactions["t_dat"] = transactions["InvoiceDate"]
transactions = transactions[transactions["InvoiceDate"].ge(start_date)]

In [None]:
transactions.shape

In [None]:
# Filter transactions by the number of times an article has been bought.
# `article_bought_count` has one row per article with its transaction count.
article_bought_count = transactions[['article_id', 'InvoiceDate']].groupby('article_id').count().reset_index().rename(columns={'InvoiceDate': 'count'})
# The comparison is strict: an article needs more than 10 transactions to be kept.
most_bought_articles = article_bought_count[article_bought_count['count']>10]['article_id'].values
# .copy() makes the filtered result an independent frame, so adding the `bought`
# column below does not write to a slice (avoids SettingWithCopyWarning).
transactions = transactions[transactions['article_id'].isin(most_bought_articles)].copy()
# Every remaining row is an observed purchase, i.e. a positive sample.
transactions["bought"]=1

In [None]:
transactions.head()

In [None]:
transactions.shape

Because of the large number of items, we cannot consider the whole customer-article matrix for training. Therefore, we need to generate some negative samples: transactions that have never occurred.

In [None]:
# # Generate negative samples
# np.random.seed(0)

# negative_samples = pd.DataFrame({
#     'article_id': np.random.choice(transactions.article_id.unique(), transactions.shape[0]),
#     'customer_id': np.random.choice(transactions.customer_id.unique(), transactions.shape[0]),
#     'bought': np.zeros(transactions.shape[0])
# })

Model will be based on recommendations computed through the time decaying popularity and the most similar items to those items bought the most times by each user. Similarity among items is computed through cosine distance.

In [None]:
# negative_samples.head()

In [None]:
# total_transactions = pd.concat([transactions, negative_samples])
# total_transactions

Save the transactions df to use for modeling.

In [None]:
# Write the filtered transactions to a zipped CSV (without the index) for modeling.
# NOTE(review): archive_name contains a directory component, so the CSV is presumably
# stored as Data/out.csv *inside* Data/out.zip; confirm that nesting is intended.
compression_opts = {'method': 'zip', 'archive_name': 'Data/out.csv'}

transactions.to_csv(
    'Data/out.zip',
    index=False,
    compression=compression_opts,
)

# Meta Data for Content Based Filtering

> We are creating a dataframe to use for Content Based Filtering System. This metadata will have all the articles and we will select some columns from the articles csv to convert them into binary columns for each unique value in that feature column. This dataframe will then be used to calculate the cosine similarity and recommend based on the similarity scores.

In [27]:
# Read in the article metadata. `article_id` stays a regular column here; the
# frame keeps pandas' default integer index.

articles = pd.read_csv('Data/h-and-m-personalized-fashion-recommendations/articles.csv')
articles.head(3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


We can create the meta data for the articles using the following columns - 
- `garment_group_name` 
    - ['Jersey Fancy', 'Accessories', 'Jersey Basic', 'Knitwear',
       'Under-, Nightwear', 'Trousers', 'Blouses', 'Shoes', 'Dresses Ladies',
       'Outdoor', 'Unknown', 'Trousers Denim', 'Swimwear', 'Socks and Tights',
       'Shirts', 'Woven/Jersey/Knitted mix Baby', 'Shorts',
       'Dresses/Skirts girls', 'Skirts', 'Special Offers', 'Dressed']
- `product_group_name`
    - ['Garment Upper body', 'Garment Lower body', 'Garment Full body',
       'Accessories', 'Underwear', 'Shoes', 'Swimwear', 'Socks & Tights',
       'Nightwear', 'Unknown', 'Underwear/nightwear', 'Cosmetic', 'Bags',
       'Items', 'Furniture', 'Garment and Shoe care', 'Stationery',
       'Interior textile', 'Fun']
- `index_group_name`
    - ['Ladieswear', 'Baby/Children', 'Divided', 'Menswear', 'Sport']

We can filter the results using these columns - 
- `colour_group_name`
    - ['Black', 'Dark Blue', 'White', 'Light Pink', 'Grey', 'Light Beige',
       'Blue', 'Red', 'Light Blue', 'Greenish Khaki', 'Dark Grey', 'Off White',
       'Beige', 'Dark Red', 'Dark Green', 'Light Grey', 'Pink', 'Yellow',
       'Light Orange', 'Yellowish Brown', 'Gold', 'Dark Beige',
       'Light Turquoise', 'Light Yellow', 'Dark Orange', 'Dark Pink', 'Green',
       'Orange', 'Other Pink', 'Silver', 'Light Green', 'Dark Yellow',
       'Light Purple', 'Dark Turquoise', 'Turquoise', 'Dark Purple',
       'Light Red', 'Greyish Beige', 'Other Yellow', 'Purple', 'Other Orange',
       'Other Green', 'Other Red', 'Other', 'Bronze/Copper', 'Other Blue',
       'Other Purple', 'Transparent', 'Unknown', 'Other Turquoise']

In [28]:
# Expected number of one-hot columns: the number of distinct values in each of the
# three categorical features, summed. It is computed from the data rather than from
# a hand-copied list of category names, so it cannot drift from the CSV.
# A value that occurs in two features (e.g. 'Accessories', 'Shoes') counts once per
# feature, because each feature gets its own dummy column.
# Must run while the three columns still hold their original categorical values.
int(articles[['product_group_name', 'index_group_name', 'garment_group_name']].nunique().sum())

45

In [29]:
# Keep only the identifier and the three categorical features used as metadata;
# every other column of articles.csv is discarded.
# Selecting the kept columns (instead of dropping the rest with inplace=True) leaves
# the same four columns in the same order, and the cell can be re-run without the
# KeyError that dropping already-removed columns would raise.
articles = articles[['article_id', 'product_group_name', 'index_group_name', 'garment_group_name']].copy()

In [30]:
articles.head(3)

Unnamed: 0,article_id,product_group_name,index_group_name,garment_group_name
0,108775015,Garment Upper body,Ladieswear,Jersey Basic
1,108775044,Garment Upper body,Ladieswear,Jersey Basic
2,108775051,Garment Upper body,Ladieswear,Jersey Basic


In [31]:
# check for nulls 
articles.isnull().sum()

article_id            0
product_group_name    0
index_group_name      0
garment_group_name    0
dtype: int64

In [32]:
# Dummy out the columns to produce binary columns for each 'feature'.
# dtype is pinned to uint8 so the indicator columns are 0/1 integers on every pandas
# version (newer releases default to bool, which would be written as True/False in
# the exported CSV).
articles = pd.get_dummies(
    articles,
    columns=['product_group_name', 'index_group_name', 'garment_group_name'],
    dtype='uint8',
)
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 46 columns):
 #   Column                                            Non-Null Count   Dtype
---  ------                                            --------------   -----
 0   article_id                                        105542 non-null  int64
 1   product_group_name_Accessories                    105542 non-null  uint8
 2   product_group_name_Bags                           105542 non-null  uint8
 3   product_group_name_Cosmetic                       105542 non-null  uint8
 4   product_group_name_Fun                            105542 non-null  uint8
 5   product_group_name_Furniture                      105542 non-null  uint8
 6   product_group_name_Garment Full body              105542 non-null  uint8
 7   product_group_name_Garment Lower body             105542 non-null  uint8
 8   product_group_name_Garment Upper body             105542 non-null  uint8
 9   product_group_name_Garment

In [33]:
articles.head(3)

Unnamed: 0,article_id,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Fun,product_group_name_Furniture,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Garment and Shoe care,product_group_name_Interior textile,product_group_name_Items,product_group_name_Nightwear,product_group_name_Shoes,product_group_name_Socks & Tights,product_group_name_Stationery,product_group_name_Swimwear,product_group_name_Underwear,product_group_name_Underwear/nightwear,product_group_name_Unknown,index_group_name_Baby/Children,index_group_name_Divided,index_group_name_Ladieswear,index_group_name_Menswear,index_group_name_Sport,garment_group_name_Accessories,garment_group_name_Blouses,garment_group_name_Dressed,garment_group_name_Dresses Ladies,garment_group_name_Dresses/Skirts girls,garment_group_name_Jersey Basic,garment_group_name_Jersey Fancy,garment_group_name_Knitwear,garment_group_name_Outdoor,garment_group_name_Shirts,garment_group_name_Shoes,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
0,108775015,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,108775044,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,108775051,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


The columns do not need to be scaled as all the columns are binary. The final dataset is ready which can be used for cosine similarity computation and recommendations based off that. I have saved the dataframe in a csv format to be used in the `H&M-ContentBasedFilteringModeling` notebook.  

In [34]:
# Write the one-hot encoded article metadata to a zipped CSV (without the index).
# NOTE(review): archive_name contains a directory component, so the CSV is presumably
# stored as Data/out_content.csv *inside* Data/out_content.zip; confirm that nesting
# is intended.
compression_articles = {'method': 'zip', 'archive_name': 'Data/out_content.csv'}

articles.to_csv(
    'Data/out_content.zip',
    index=False,
    compression=compression_articles,
)