In [88]:
# %pip install gensim

In [89]:
import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec

# Word Embeddings

In [90]:
df_i = pd.read_pickle('data/cleaned_df_i.pkl')
df_c_train = pd.read_pickle('data/df_c_train.pkl')
df_t_train = pd.read_pickle('data/df_t_train.pkl')
df_c_val = pd.read_pickle('data/df_c_val.pkl')
df_t_val = pd.read_pickle('data/df_t_val.pkl')
df_c_test = pd.read_pickle('data/df_c_test.pkl')
df_t_test = pd.read_pickle('data/df_t_test.pkl')

In [91]:
df_i['structured_desc'] = df_i[['prod_name', 'product_type_name', 'product_group_name', 'perceived_colour_value_name', 'colour_group_name']].apply(lambda x: ', '.join(x), axis=1)

In [92]:
tokenized_descriptions = df_i['structured_desc'].apply(lambda x: x.split())
embedding_dim = 100
word2vec_model = Word2Vec(sentences=tokenized_descriptions, vector_size=embedding_dim, window=3, min_count=1, workers=4)

def generate_item_embedding(description, model):
    word_vectors = [model.wv[word] for word in description if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(embedding_dim)
df_i['embedding'] = tokenized_descriptions.apply(lambda x: generate_item_embedding(x, word2vec_model))

In [93]:
df_i.head()

Unnamed: 0,article_id,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,index_group_name,garment_group_name,detail_desc,structured_desc,embedding
0,108775015,Strap top,253,Vest top,Garment Upper body,Solid,Black,Dark,Black,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top, Vest top, Garment Upper body, Dark,...","[-0.39028147, 0.8257141, 1.0631664, -1.0747285..."
1,108775044,Strap top,253,Vest top,Garment Upper body,Solid,White,Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top, Vest top, Garment Upper body, Light...","[-0.30463585, 0.8029622, 0.94353515, -0.794977..."
2,108775051,Strap top (1),253,Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top (1), Vest top, Garment Upper body, D...","[-0.061743766, 0.8820643, 1.0309086, -0.598122..."
3,110065001,OP T-shirt (Idro),306,Bra,Underwear,Solid,Black,Dark,Black,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","OP T-shirt (Idro), Bra, Underwear, Dark, Black","[0.5408851, 0.32797596, -0.18736076, -0.564628..."
4,110065002,OP T-shirt (Idro),306,Bra,Underwear,Solid,White,Light,White,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","OP T-shirt (Idro), Bra, Underwear, Light, White","[0.65100086, 0.29872352, -0.3411723, -0.204949..."


# Spending Power of Customers

In [94]:
# For Training Set
df_c_train = pd.merge(df_t_train, df_c_train, on='customer_id')
df_c_train = df_c_train.groupby(['customer_id'])['price'].sum().reset_index()
df_c_train.rename(columns={'price': 'total_spent'}, inplace=True)

# For Validation Set
df_c_val = pd.merge(df_t_val, df_c_val, on='customer_id')
df_c_val = df_c_val.groupby(['customer_id'])['price'].sum().reset_index()
df_c_val.rename(columns={'price': 'total_spent'}, inplace=True)

# For Testing Set
df_c_test = pd.merge(df_t_test, df_c_test, on='customer_id')
df_c_test = df_c_test.groupby(['customer_id'])['price'].sum().reset_index()
df_c_test.rename(columns={'price': 'total_spent'}, inplace=True)

In [95]:
quantiles_train = df_c_train['total_spent'].quantile([0, 0.25, 0.75, 1.0])
quantiles_val = df_c_val['total_spent'].quantile([0, 0.25, 0.75, 1.0])
quantiles_test = df_c_test['total_spent'].quantile([0, 0.25, 0.75, 1.0])

def categorize_spending_power(amount, quantiles):
    if amount <= quantiles[0.25]:
        return "Low"
    elif amount <= quantiles[0.75]:
        return "Medium"
    else:
        return "High"

df_c_train['spending_power'] = df_c_train['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_train))
df_c_val['spending_power'] = df_c_val['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_val))
df_c_test['spending_power'] = df_c_test['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_test))

In [96]:
print(df_c_test['spending_power'].value_counts())
print(df_c_val['spending_power'].value_counts())
print(df_c_train['spending_power'].value_counts())

spending_power
Medium    148271
Low        74241
High       74171
Name: count, dtype: int64
spending_power
Medium    108324
Low        54783
High       54369
Name: count, dtype: int64
spending_power
Medium    234880
Low       117452
High      117441
Name: count, dtype: int64


# Preferred Products

In [97]:
merged_df = pd.merge(df_t_train, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_train = pd.merge(df_c_train, preferred_products, on='customer_id', how='left')
df_c_train.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [98]:
merged_df = pd.merge(df_t_val, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_val = pd.merge(df_c_val, preferred_products, on='customer_id', how='left')
df_c_val.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [99]:
merged_df = pd.merge(df_t_test, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_test = pd.merge(df_c_test, preferred_products, on='customer_id', how='left')
df_c_test.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


# Preferred Color

In [100]:
merged_df = pd.merge(df_t_train, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_train = pd.merge(df_c_train, preferred_products, on='customer_id', how='left')
df_c_train.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [101]:
merged_df = pd.merge(df_t_val, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_val = pd.merge(df_c_val, preferred_products, on='customer_id', how='left')
df_c_val.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


In [102]:
merged_df = pd.merge(df_t_test, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_test = pd.merge(df_c_test, preferred_products, on='customer_id', how='left')
df_c_test.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

  idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']


# Average Timelag between Purchase

In [103]:
# to simulate customers who did not make a repurchase within the time span
# since we're dealing with transactions between 3 months, a time of 12 months would be appropriate
no_repurchase_hours = 8760

In [104]:
df_t_train.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_train['time_diff'] = df_t_train.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_train.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_train = pd.merge(df_c_train, avg_time_diff, on='customer_id', how='left')
df_c_train["hours_between_purchases"] = (df_c_train["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0).round()
df_c_train["hours_between_purchases"] = df_c_train["hours_between_purchases"].replace(0.00, no_repurchase_hours)

In [105]:
df_t_val.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_val['time_diff'] = df_t_val.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_val.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_val = pd.merge(df_c_val, avg_time_diff, on='customer_id', how='left')
df_c_val["hours_between_purchases"] = (df_c_val["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0).round()
df_c_val["hours_between_purchases"] = df_c_val["hours_between_purchases"].replace(0.00, no_repurchase_hours)

In [106]:
df_t_test.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_test['time_diff'] = df_t_test.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_test.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_test = pd.merge(df_c_test, avg_time_diff, on='customer_id', how='left')
df_c_test["hours_between_purchases"] = (df_c_test["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0).round()
df_c_test["hours_between_purchases"] = df_c_test["hours_between_purchases"].replace(0.00, no_repurchase_hours)

In [107]:
df_c_train["hours_between_purchases"] = df_c_train["hours_between_purchases"].fillna(no_repurchase_hours)
df_c_val["hours_between_purchases"] = df_c_val["hours_between_purchases"].fillna(no_repurchase_hours)
df_c_test["hours_between_purchases"] = df_c_test["hours_between_purchases"].fillna(no_repurchase_hours)

In [108]:
df_t_train.to_pickle(os.path.join(os.getcwd(),'data','df_t_train_fe.pkl'))
df_t_val.to_pickle(os.path.join(os.getcwd(),'data', 'df_t_val_fe.pkl'))
df_t_test.to_pickle(os.path.join(os.getcwd(),'data', 'df_t_test_fe.pkl'))
df_c_train.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_train_fe.pkl'))
df_c_val.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_val_fe.pkl'))
df_c_test.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_test_fe.pkl'))
df_i.to_pickle(os.path.join(os.getcwd(),'data', 'df_i_fe.pkl'))