In [138]:
#%pip install gensim

In [139]:
import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec

# Word Embeddings

In [140]:
df_i = pd.read_pickle('data/cleaned_df_i.pkl')
df_c_train = pd.read_pickle('data/df_c_train.pkl')
df_t_train = pd.read_pickle('data/df_t_train.pkl')
df_c_val = pd.read_pickle('data/df_c_val.pkl')
df_t_val = pd.read_pickle('data/df_t_val.pkl')
df_c_test = pd.read_pickle('data/df_c_test.pkl')
df_t_test = pd.read_pickle('data/df_t_test.pkl')

In [141]:
df_i['structured_desc'] = df_i[['prod_name', 'product_type_name', 'product_group_name', 'perceived_colour_value_name', 'colour_group_name']].apply(lambda x: ', '.join(x), axis=1)

In [142]:
tokenized_descriptions = df_i['structured_desc'].apply(lambda x: x.split())
embedding_dim = 100
word2vec_model = Word2Vec(sentences=tokenized_descriptions, vector_size=embedding_dim, window=3, min_count=1, workers=4)

def generate_item_embedding(description, model):
    word_vectors = [model.wv[word] for word in description if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(embedding_dim)
df_i['embedding'] = tokenized_descriptions.apply(lambda x: generate_item_embedding(x, word2vec_model))

In [143]:
df_i.head()

Unnamed: 0,article_id,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,index_group_name,garment_group_name,detail_desc,structured_desc,embedding
0,108775015,Strap top,253,Vest top,Garment Upper body,Solid,Black,Dark,Black,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top, Vest top, Garment Upper body, Dark,...","[0.1574208, 1.0581949, 0.79257476, -0.23219684..."
1,108775044,Strap top,253,Vest top,Garment Upper body,Solid,White,Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top, Vest top, Garment Upper body, Light...","[0.108548336, 0.95319206, 0.42330232, 0.125976..."
2,108775051,Strap top (1),253,Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top (1), Vest top, Garment Upper body, D...","[0.110866465, 1.2246294, 0.678072, 0.542589, 0..."
3,110065001,OP T-shirt (Idro),306,Bra,Underwear,Solid,Black,Dark,Black,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","OP T-shirt (Idro), Bra, Underwear, Dark, Black","[0.8051066, 0.74185973, -0.34737697, -0.146875..."
4,110065002,OP T-shirt (Idro),306,Bra,Underwear,Solid,White,Light,White,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","OP T-shirt (Idro), Bra, Underwear, Light, White","[0.7422705, 0.60685605, -0.8221559, 0.31363314..."


# Spending Power of Customers

In [144]:
# For Training Set
df_c_train = pd.merge(df_t_train, df_c_train, on='customer_id')
df_c_train = df_c_train.groupby(['customer_id'])['price'].sum().reset_index()
df_c_train.rename(columns={'price': 'total_spent'}, inplace=True)

# For Validation Set
df_c_val = pd.merge(df_t_val, df_c_val, on='customer_id')
df_c_val = df_c_val.groupby(['customer_id'])['price'].sum().reset_index()
df_c_val.rename(columns={'price': 'total_spent'}, inplace=True)

# For Testing Set
df_c_test = pd.merge(df_t_test, df_c_test, on='customer_id')
df_c_test = df_c_test.groupby(['customer_id'])['price'].sum().reset_index()
df_c_test.rename(columns={'price': 'total_spent'}, inplace=True)

In [145]:
quantiles_train = df_c_train['total_spent'].quantile([0, 0.25, 0.75, 1.0])
quantiles_val = df_c_val['total_spent'].quantile([0, 0.25, 0.75, 1.0])
quantiles_test = df_c_test['total_spent'].quantile([0, 0.25, 0.75, 1.0])

def categorize_spending_power(amount, quantiles):
    if amount <= quantiles[0.25]:
        return '0'
    elif amount <= quantiles[0.75]:
        return '1'
    else:
        return '2'

df_c_train['spending_power'] = df_c_train['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_train))
df_c_val['spending_power'] = df_c_val['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_val))
df_c_test['spending_power'] = df_c_test['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_test))

In [146]:
print(df_c_test['spending_power'].value_counts())
print(df_c_val['spending_power'].value_counts())
print(df_c_train['spending_power'].value_counts())

spending_power
1    148271
0     74241
2     74171
Name: count, dtype: int64
spending_power
1    108324
0     54783
2     54369
Name: count, dtype: int64
spending_power
1    234880
0    117452
2    117441
Name: count, dtype: int64


# Preferred Products

In [147]:
merged_df = pd.merge(df_t_train, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_train = pd.merge(df_c_train, preferred_products, on='customer_id', how='left')
df_c_train.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [148]:
merged_df = pd.merge(df_t_val, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_val = pd.merge(df_c_val, preferred_products, on='customer_id', how='left')
df_c_val.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [149]:
merged_df = pd.merge(df_t_test, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_test = pd.merge(df_c_test, preferred_products, on='customer_id', how='left')
df_c_test.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


# Preferred Color

In [150]:
merged_df = pd.merge(df_t_train, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_train = pd.merge(df_c_train, preferred_products, on='customer_id', how='left')
df_c_train.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [151]:
merged_df = pd.merge(df_t_val, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_val = pd.merge(df_c_val, preferred_products, on='customer_id', how='left')
df_c_val.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [152]:
merged_df = pd.merge(df_t_test, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_test = pd.merge(df_c_test, preferred_products, on='customer_id', how='left')
df_c_test.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


# Preferred Embedding
- uses the features generated via Word2Vec and its associated column "embedding" to merge df_c_** and df_i

# Average Timelag between Purchase

In [None]:
df_t_train.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_train['time_diff'] = df_t_train.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_train.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_train = pd.merge(df_c_train, avg_time_diff, on='customer_id', how='left')
df_c_train["hours_between_purchases"] = df_c_train["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0

In [72]:
df_t_val.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_val['time_diff'] = df_t_val.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_val.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_val = pd.merge(df_c_val, avg_time_diff, on='customer_id', how='left')
df_c_val["hours_between_purchases"] = df_c_val["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0

In [73]:
df_t_test.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_test['time_diff'] = df_t_test.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_test.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_test = pd.merge(df_c_test, avg_time_diff, on='customer_id', how='left')
df_c_test["hours_between_purchases"] = df_c_test["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0

In [157]:
df_c_train

Unnamed: 0,customer_id,total_spent,spending_power,preferred_prod,preferred_color,avg_time_diff_btw_purchase,hours_between_purchases
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.111814,1,Garment Upper body,Black,11 days 12:00:00,276.000000
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.782712,2,Garment Upper body,Black,3 days 10:06:18.947368421,82.105263
2,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.060983,1,Underwear,Black,0 days 00:00:00,0.000000
3,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0.042339,0,Swimwear,Dark Green,0 days 00:00:00,0.000000
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,0.772729,2,Garment Upper body,Black,1 days 21:31:02.068965517,45.517241
...,...,...,...,...,...,...,...
858859,ffff61677073258d461e043cc9ed4ed97be5617a920640...,0.088051,1,Garment Upper body,Grey,2 days 06:00:00,54.000000
858860,ffff61677073258d461e043cc9ed4ed97be5617a920640...,0.088051,1,Garment Upper body,Light Blue,2 days 06:00:00,54.000000
858861,ffff61677073258d461e043cc9ed4ed97be5617a920640...,0.088051,1,Garment Upper body,Off White,2 days 06:00:00,54.000000
858862,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0.657237,2,Swimwear,Dark Blue,2 days 09:49:05.454545454,57.818182


In [74]:
df_t_train.to_pickle(os.path.join(os.getcwd(),'data','df_t_train_fe.pkl'))
df_t_val.to_pickle(os.path.join(os.getcwd(),'data', 'df_t_val_fe.pkl'))
df_t_test.to_pickle(os.path.join(os.getcwd(),'data', 'df_t_test_fe.pkl'))
df_c_train.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_train_fe.pkl'))
df_c_val.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_val_fe.pkl'))
df_c_test.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_test_fe.pkl'))
df_i.to_pickle(os.path.join(os.getcwd(),'data', 'df_i_fe.pkl'))