In [47]:
#%pip install gensim

In [3]:
import pandas as pd
import numpy as np
import os
# from gensim.models import Word2Vec

# Word Embeddings

In [5]:
data_dir = os.path.join(os.getcwd(), 'data', 'cleaned_data')
df_i = pd.read_pickle(os.path.join(data_dir, 'cleaned_df_i.pkl'))
df_c_train = pd.read_pickle(os.path.join(data_dir, 'df_c_train.pkl'))
df_t_train = pd.read_pickle(os.path.join(data_dir, 'df_t_train.pkl'))
df_c_val = pd.read_pickle(os.path.join(data_dir, 'df_c_val.pkl'))
df_t_val = pd.read_pickle(os.path.join(data_dir, 'df_t_val.pkl'))
df_c_test = pd.read_pickle(os.path.join(data_dir, 'df_c_test.pkl'))
df_t_test = pd.read_pickle(os.path.join(data_dir, 'df_t_test.pkl'))
joined_df_train = pd.read_pickle(os.path.join(data_dir, 'joined_df_t_c_i_train.pkl'))

In [50]:
df_i['structured_desc'] = df_i[['prod_name', 'product_type_name', 'product_group_name']].apply(lambda x: ' '.join(x), axis=1)

In [51]:
tokenized_descriptions = df_i['structured_desc'].apply(lambda x: x.split())
embedding_dim = 100
word2vec_model = Word2Vec(sentences=tokenized_descriptions, vector_size=embedding_dim, window=3, min_count=1, workers=4)

def generate_item_embedding(description, model):
    word_vectors = [model.wv[word] for word in description if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(embedding_dim)
df_i['embedding'] = tokenized_descriptions.apply(lambda x: generate_item_embedding(x, word2vec_model))

In [52]:
df_i.head()

Unnamed: 0,article_id,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,index_group_name,garment_group_name,detail_desc,structured_desc,embedding
0,108775015,Strap top,253,Vest top,Garment Upper body,Solid,Black,Dark,Black,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,Strap top Vest top Garment Upper body,"[0.16747095, 0.77091706, 0.45015836, 0.7519173..."
1,108775044,Strap top,253,Vest top,Garment Upper body,Solid,White,Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,Strap top Vest top Garment Upper body,"[0.16747095, 0.77091706, 0.45015836, 0.7519173..."
2,108775051,Strap top (1),253,Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,Strap top (1) Vest top Garment Upper body,"[0.06088616, 0.8013817, 0.3435098, 0.6323141, ..."
3,110065001,OP T-shirt (Idro),306,Bra,Underwear,Solid,Black,Dark,Black,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",OP T-shirt (Idro) Bra Underwear,"[-0.36702323, -0.26560035, -0.9847349, 0.39013..."
4,110065002,OP T-shirt (Idro),306,Bra,Underwear,Solid,White,Light,White,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",OP T-shirt (Idro) Bra Underwear,"[-0.36702323, -0.26560035, -0.9847349, 0.39013..."


# Spending Power of Customers

In [53]:
# For Training Set
df_c_train = pd.merge(df_t_train, df_c_train, on='customer_id')
df_c_train = df_c_train.groupby(['customer_id'])['price'].sum().reset_index()
df_c_train.rename(columns={'price': 'total_spent'}, inplace=True)

# For Validation Set
df_c_val = pd.merge(df_t_val, df_c_val, on='customer_id')
df_c_val = df_c_val.groupby(['customer_id'])['price'].sum().reset_index()
df_c_val.rename(columns={'price': 'total_spent'}, inplace=True)

# For Testing Set
df_c_test = pd.merge(df_t_test, df_c_test, on='customer_id')
df_c_test = df_c_test.groupby(['customer_id'])['price'].sum().reset_index()
df_c_test.rename(columns={'price': 'total_spent'}, inplace=True)

In [54]:
quantiles_train = df_c_train['total_spent'].quantile([0, 0.25, 0.75, 1.0])
quantiles_val = df_c_val['total_spent'].quantile([0, 0.25, 0.75, 1.0])
quantiles_test = df_c_test['total_spent'].quantile([0, 0.25, 0.75, 1.0])

def categorize_spending_power(amount, quantiles):
    if amount <= quantiles[0.25]:
        return 'low'
    elif amount <= quantiles[0.75]:
        return 'medium'
    else:
        return 'high'

df_c_train['spending_power'] = df_c_train['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_train))
df_c_val['spending_power'] = df_c_val['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_val))
df_c_test['spending_power'] = df_c_test['total_spent'].apply(lambda x: categorize_spending_power(x, quantiles_test))

In [55]:
print(df_c_test['spending_power'].value_counts())
print(df_c_val['spending_power'].value_counts())
print(df_c_train['spending_power'].value_counts())

spending_power
medium    148271
low        74241
high       74171
Name: count, dtype: int64
spending_power
medium    108324
low        54783
high       54369
Name: count, dtype: int64
spending_power
medium    234880
low       117452
high      117441
Name: count, dtype: int64


# Preferred Products

In [56]:
merged_df = pd.merge(df_t_train, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_train = pd.merge(df_c_train, preferred_products, on='customer_id', how='left')
df_c_train.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

In [57]:
merged_df = pd.merge(df_t_val, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_val = pd.merge(df_c_val, preferred_products, on='customer_id', how='left')
df_c_val.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

In [58]:
merged_df = pd.merge(df_t_test, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'product_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'product_group_name']]
df_c_test = pd.merge(df_c_test, preferred_products, on='customer_id', how='left')
df_c_test.rename(columns={'product_group_name': 'preferred_prod'}, inplace=True)

In [61]:
df_c_test.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_prod
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.033864,low,Garment Lower body
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.033864,low,Garment Upper body
2,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.316729,high,Swimwear
3,0000b7a134c3ec0d8842fad1fd4ca28517424c14fc4848...,0.059288,medium,Garment Full body
4,0000d6c053fc8f9389d4565051f12402d5774aa4a9d2e5...,0.089763,medium,Garment Upper body


# Preferred Color

In [62]:
merged_df = pd.merge(df_t_train, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_train = pd.merge(df_c_train, preferred_products, on='customer_id', how='left')
df_c_train.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

In [64]:
merged_df = pd.merge(df_t_val, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_val = pd.merge(df_c_val, preferred_products, on='customer_id', how='left')
df_c_val.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

In [65]:
merged_df = pd.merge(df_t_test, df_i, on='article_id', how='left')
grouped_df = merged_df.groupby(['customer_id', 'colour_group_name']).size().reset_index(name='purchase_count')
idx = grouped_df.groupby(['customer_id'])['purchase_count'].transform(max) == grouped_df['purchase_count']
preferred_products = grouped_df[idx][['customer_id', 'colour_group_name']]
df_c_test = pd.merge(df_c_test, preferred_products, on='customer_id', how='left')
df_c_test.rename(columns={'colour_group_name': 'preferred_color'}, inplace=True)

# Average Timelag between Purchase

In [70]:
df_t_train.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_train['time_diff'] = df_t_train.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_train.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_train = pd.merge(df_c_train, avg_time_diff, on='customer_id', how='left')

In [73]:
df_t_val.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_val['time_diff'] = df_t_val.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_val.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_val = pd.merge(df_c_val, avg_time_diff, on='customer_id', how='left')

In [75]:
df_t_test.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_test['time_diff'] = df_t_test.groupby('customer_id')['t_dat'].diff()
avg_time_diff = df_t_test.groupby('customer_id')['time_diff'].mean().reset_index()
avg_time_diff.rename(columns={'time_diff': 'avg_time_diff_btw_purchase'}, inplace=True)
df_c_test = pd.merge(df_c_test, avg_time_diff, on='customer_id', how='left')

In [77]:
df_t_train.to_pickle(os.path.join(os.getcwd(),'data','df_t_train_fe.pkl'))
df_t_val.to_pickle(os.path.join(os.getcwd(),'data', 'df_t_val_fe.pkl'))
df_t_test.to_pickle(os.path.join(os.getcwd(),'data', 'df_t_test_fe.pkl'))
df_c_train.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_train_fe.pkl'))
df_c_val.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_val_fe.pkl'))
df_c_test.to_pickle(os.path.join(os.getcwd(),'data', 'df_c_test_fe.pkl'))
df_i.to_pickle(os.path.join(os.getcwd(),'data', 'df_i_fe.pkl'))

# Style Dictionary

In [9]:
#imports
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from string import punctuation

In [10]:
all_tokens = []
for description in joined_df_train['detail_desc']:
    tokens = word_tokenize(description)
    all_tokens.extend(tokens)

common_words = set(stopwords.words('english') + list(punctuation) + ['the', 'and', 'is', 'are', 'of', 'in', 'on', 'with', 'for', 'to', 'at', 'from', 'as', 'by', 'or'])

filtered_tokens = [word for word in all_tokens if word.lower() not in common_words]

word_counts = Counter(filtered_tokens)

style_dictionary = dict(word_counts.most_common(10))

print("Fashion Style Dictionary:")
print(style_dictionary)

Fashion Style Dictionary:
{'back': 1690196, 'waist': 1422570, 'top': 1131107, 'front': 1087774, 'jersey': 936588, 'sleeves': 874074, 'pockets': 764195, 'hem': 734945, 'straps': 695055, 'weave': 677069}


In [12]:
#convert to list
style_list = list(style_dictionary.keys())
print(style_list)

['back', 'waist', 'top', 'front', 'jersey', 'sleeves', 'pockets', 'hem', 'straps', 'weave']
