In [56]:
# %pip install gensim

In [57]:
import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec

# Word Embeddings

In [58]:
# Load the cleaned item table plus the customer (df_c_*) and transaction
# (df_t_*) frames for each split. The paths are listed in the same order as the
# names they are unpacked into.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files produced by a trusted earlier stage of this pipeline.
pickle_paths = [
    'data/cleaned_df_i.pkl',
    'data/df_c_train.pkl',
    'data/df_t_train.pkl',
    'data/df_c_val.pkl',
    'data/df_t_val.pkl',
    'data/df_c_test.pkl',
    'data/df_t_test.pkl',
]
(
    df_i,
    df_c_train,
    df_t_train,
    df_c_val,
    df_t_val,
    df_c_test,
    df_t_test,
) = map(pd.read_pickle, pickle_paths)

In [59]:
# Build one comma-separated description string per item from its name, type,
# group and colour attributes (in this order).
desc_columns = [
    'prod_name',
    'product_type_name',
    'product_group_name',
    'perceived_colour_value_name',
    'colour_group_name',
]
df_i['structured_desc'] = df_i[desc_columns].apply(', '.join, axis=1)

In [60]:
# Tokenise each item description on whitespace, then strip the ", " field
# separators that the join left glued to the tokens. Without this, "Black," and
# "Black" end up as two unrelated vocabulary entries.
tokenized_descriptions = df_i['structured_desc'].apply(
    lambda text: [tok for tok in (word.strip(',') for word in text.split()) if tok]
)
embedding_dim = 100
# NOTE: gensim documents that training with workers > 1 is not exactly
# reproducible between runs, so embedding values will vary slightly per run.
word2vec_model = Word2Vec(sentences=tokenized_descriptions, vector_size=embedding_dim, window=3, min_count=1, workers=4)

def generate_item_embedding(description, model):
    """Average the word vectors of the tokens in ``description``.

    Parameters
    ----------
    description : iterable of str
        Tokens to embed. Tokens missing from the model vocabulary are skipped.
    model : object exposing ``wv``
        ``model.wv`` must support ``token in wv``, ``wv[token]`` and expose
        ``vector_size`` (as a trained gensim ``Word2Vec`` model does).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or an all-zero vector
        of length ``model.wv.vector_size`` when no token is in the vocabulary.
    """
    keyed_vectors = model.wv
    word_vectors = [keyed_vectors[word] for word in description if word in keyed_vectors]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # Size the fallback from the model itself rather than from a notebook-level
    # global, so it always matches the vectors this model produces.
    return np.zeros(keyed_vectors.vector_size)
# One averaged Word2Vec vector per item, computed from its token list.
df_i['embedding'] = tokenized_descriptions.apply(generate_item_embedding, args=(word2vec_model,))

In [61]:
# Preview the first rows of the item table, which now also carries the
# 'structured_desc' and 'embedding' columns.
df_i.head()

Unnamed: 0,article_id,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,index_group_name,garment_group_name,detail_desc,structured_desc,embedding
0,108775015,Strap top,253,Vest top,Garment Upper body,Solid,Black,Dark,Black,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top, Vest top, Garment Upper body, Dark,...","[0.02884368, 0.7666942, 0.7618172, -0.6059066,..."
1,108775044,Strap top,253,Vest top,Garment Upper body,Solid,White,Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top, Vest top, Garment Upper body, Light...","[-0.063403346, 0.56410563, 0.6233379, -0.26709..."
2,108775051,Strap top (1),253,Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Ladieswear,Jersey Basic,Jersey top with narrow shoulder straps.,"Strap top (1), Vest top, Garment Upper body, D...","[0.16358002, 0.5641659, 0.8133727, 0.1517059, ..."
3,110065001,OP T-shirt (Idro),306,Bra,Underwear,Solid,Black,Dark,Black,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","OP T-shirt (Idro), Bra, Underwear, Dark, Black","[0.80604786, 0.44214705, -0.42288643, -0.02863..."
4,110065002,OP T-shirt (Idro),306,Bra,Underwear,Solid,White,Light,White,Ladieswear,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...","OP T-shirt (Idro), Bra, Underwear, Light, White","[0.6874445, 0.18167599, -0.6009313, 0.4069746,..."


# Spending Power of Customers

In [62]:
# Total spend per customer, computed separately for each split.
# The default inner merge keeps only transactions whose customer_id is present
# in the matching customer frame. Each df_c_* is then rebound to a two-column
# frame (customer_id, total_spent).

# Training split
df_c_train = (
    df_t_train.merge(df_c_train, on='customer_id')
    .groupby(['customer_id'])['price']
    .sum()
    .reset_index()
    .rename(columns={'price': 'total_spent'})
)

# Validation split
df_c_val = (
    df_t_val.merge(df_c_val, on='customer_id')
    .groupby(['customer_id'])['price']
    .sum()
    .reset_index()
    .rename(columns={'price': 'total_spent'})
)

# Test split
df_c_test = (
    df_t_test.merge(df_c_test, on='customer_id')
    .groupby(['customer_id'])['price']
    .sum()
    .reset_index()
    .rename(columns={'price': 'total_spent'})
)

In [63]:
# Min, lower quartile, upper quartile and max of total spend for each split.
# Each split gets its own cut-offs, so the same amount can fall into different
# spending buckets depending on the split it belongs to.
quantile_levels = [0, 0.25, 0.75, 1.0]
quantiles_train = df_c_train['total_spent'].quantile(quantile_levels)
quantiles_val = df_c_val['total_spent'].quantile(quantile_levels)
quantiles_test = df_c_test['total_spent'].quantile(quantile_levels)

def categorize_spending_power(amount, quantiles):
    """Bucket a spend amount into "Low", "Medium" or "High".

    Parameters
    ----------
    amount : float
        Total amount spent by one customer.
    quantiles : mapping or pandas.Series
        Must be indexable with ``0.25`` and ``0.75`` to obtain the lower and
        upper quartile cut-offs.

    Returns
    -------
    str or float
        "Low" if ``amount`` is at or below the 0.25 cut-off, "Medium" if it is
        at or below the 0.75 cut-off, otherwise "High". A missing amount
        (NaN/None) yields NaN instead of a label.
    """
    # NaN fails every comparison below and would otherwise fall through to
    # "High"; keep missing values missing.
    if pd.isna(amount):
        return np.nan
    if amount <= quantiles[0.25]:
        return "Low"
    if amount <= quantiles[0.75]:
        return "Medium"
    return "High"

# Label every customer with a spending bucket using the quartiles of their own split.
df_c_train['spending_power'] = df_c_train['total_spent'].apply(categorize_spending_power, args=(quantiles_train,))
df_c_val['spending_power'] = df_c_val['total_spent'].apply(categorize_spending_power, args=(quantiles_val,))
df_c_test['spending_power'] = df_c_test['total_spent'].apply(categorize_spending_power, args=(quantiles_test,))

In [64]:
# Bucket sizes per split, printed in the order test, validation, train.
for split_frame in (df_c_test, df_c_val, df_c_train):
    print(split_frame['spending_power'].value_counts())

spending_power
Medium    148271
Low        74241
High       74171
Name: count, dtype: int64
spending_power
Medium    108324
Low        54783
High       54369
Name: count, dtype: int64
spending_power
Medium    234880
Low       117452
High      117441
Name: count, dtype: int64


# Preferred Products

In [65]:
# Training split: the (up to) two product groups each customer bought most
# often, counted in transaction rows, stored as a comma-separated string.
merged_df = df_t_train.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'product_group_name'])
    .size()
    .reset_index(name='purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(2)
)
top_products_str = (
    top_products
    .groupby('customer_id')['product_group_name']
    .apply(', '.join)
    .reset_index()
)
df_c_train = (
    df_c_train
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'product_group_name': 'preferred_products'})
)
df_c_train.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.111814,Medium,"Garment Upper body, Garment Full body"
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.782712,High,"Garment Upper body, Garment Lower body"
2,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.060983,Medium,Underwear
3,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0.042339,Low,Swimwear
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,0.772729,High,"Garment Upper body, Garment Lower body"


In [66]:
# Validation split: the (up to) two product groups each customer bought most
# often, counted in transaction rows, stored as a comma-separated string.
merged_df = df_t_val.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'product_group_name'])
    .size()
    .reset_index(name='purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(2)
)
top_products_str = (
    top_products
    .groupby('customer_id')['product_group_name']
    .apply(', '.join)
    .reset_index()
)
df_c_val = (
    df_c_val
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'product_group_name': 'preferred_products'})
)
df_c_val.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.089763,Medium,Swimwear
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,0.399881,High,"Garment Lower body, Garment Upper body"
2,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0.387898,High,"Underwear, Garment Full body"
3,0000c97821eb48d0e590fd309133f0a6c08f7750f64ccc...,0.106661,Medium,"Underwear, Garment Upper body"
4,0000d6c053fc8f9389d4565051f12402d5774aa4a9d2e5...,0.050814,Medium,Swimwear


In [67]:
# Test split: the (up to) two product groups each customer bought most often,
# counted in transaction rows, stored as a comma-separated string.
merged_df = df_t_test.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'product_group_name'])
    .size()
    .reset_index(name='purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(2)
)
top_products_str = (
    top_products
    .groupby('customer_id')['product_group_name']
    .apply(', '.join)
    .reset_index()
)
df_c_test = (
    df_c_test
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'product_group_name': 'preferred_products'})
)
df_c_test.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.033864,Low,"Garment Lower body, Garment Upper body"
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.316729,High,"Swimwear, Garment Upper body"
2,0000b7a134c3ec0d8842fad1fd4ca28517424c14fc4848...,0.059288,Medium,Garment Full body
3,0000d6c053fc8f9389d4565051f12402d5774aa4a9d2e5...,0.089763,Medium,"Garment Upper body, Garment Full body"
4,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0.240508,High,"Swimwear, Garment Lower body"


# Preferred Color

In [68]:
# Training split: the (up to) two colour groups each customer bought most
# often, counted in transaction rows, stored as a comma-separated string.
merged_df = df_t_train.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'colour_group_name'])
    .size()
    .reset_index(name='color_purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'color_purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(2)
)
top_products_str = (
    top_products
    .groupby('customer_id')['colour_group_name']
    .apply(', '.join)
    .reset_index()
)
df_c_train = (
    df_c_train
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'colour_group_name': 'preferred_colors'})
)
df_c_train.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products,preferred_colors
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.111814,Medium,"Garment Upper body, Garment Full body","Black, Light Pink"
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.782712,High,"Garment Upper body, Garment Lower body","Black, Greenish Khaki"
2,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.060983,Medium,Underwear,Black
3,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0.042339,Low,Swimwear,Dark Green
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,0.772729,High,"Garment Upper body, Garment Lower body","Black, Dark Blue"


In [69]:
# Validation split: the (up to) two colour groups each customer bought most
# often, counted in transaction rows, stored as a comma-separated string.
merged_df = df_t_val.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'colour_group_name'])
    .size()
    .reset_index(name='color_purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'color_purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(2)
)
top_products_str = (
    top_products
    .groupby('customer_id')['colour_group_name']
    .apply(', '.join)
    .reset_index()
)
df_c_val = (
    df_c_val
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'colour_group_name': 'preferred_colors'})
)
df_c_val.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products,preferred_colors
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.089763,Medium,Swimwear,"Black, Dark Blue"
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,0.399881,High,"Garment Lower body, Garment Upper body","Black, White"
2,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0.387898,High,"Underwear, Garment Full body","Black, Dark Blue"
3,0000c97821eb48d0e590fd309133f0a6c08f7750f64ccc...,0.106661,Medium,"Underwear, Garment Upper body","Dark Red, Light Orange"
4,0000d6c053fc8f9389d4565051f12402d5774aa4a9d2e5...,0.050814,Medium,Swimwear,"Black, Yellow"


In [70]:
# Test split: the (up to) two colour groups each customer bought most often,
# counted in transaction rows, stored as a comma-separated string.
merged_df = df_t_test.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'colour_group_name'])
    .size()
    .reset_index(name='color_purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'color_purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(2)
)
top_products_str = (
    top_products
    .groupby('customer_id')['colour_group_name']
    .apply(', '.join)
    .reset_index()
)
df_c_test = (
    df_c_test
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'colour_group_name': 'preferred_colors'})
)
df_c_test.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products,preferred_colors
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.033864,Low,"Garment Lower body, Garment Upper body",Black
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.316729,High,"Swimwear, Garment Upper body","Dark Blue, Black"
2,0000b7a134c3ec0d8842fad1fd4ca28517424c14fc4848...,0.059288,Medium,Garment Full body,"Dark Blue, Off White"
3,0000d6c053fc8f9389d4565051f12402d5774aa4a9d2e5...,0.089763,Medium,"Garment Upper body, Garment Full body","Black, Greenish Khaki"
4,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0.240508,High,"Swimwear, Garment Lower body","Black, Light Blue"


# Preferred Perceived Color

In [71]:
# Training split: the single perceived-colour value each customer bought most
# often, counted in transaction rows.
merged_df = df_t_train.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'perceived_colour_value_name'])
    .size()
    .reset_index(name='perceived_color_purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'perceived_color_purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(1)
)
top_products_str = (
    top_products
    .groupby('customer_id')['perceived_colour_value_name']
    .apply(', '.join)
    .reset_index()
)
df_c_train = (
    df_c_train
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'perceived_colour_value_name': 'preferred_perceived_colors'})
)
df_c_train.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products,preferred_colors,preferred_perceived_colors
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.111814,Medium,"Garment Upper body, Garment Full body","Black, Light Pink",Dark
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.782712,High,"Garment Upper body, Garment Lower body","Black, Greenish Khaki",Dark
2,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.060983,Medium,Underwear,Black,Dark
3,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0.042339,Low,Swimwear,Dark Green,Medium Dusty
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,0.772729,High,"Garment Upper body, Garment Lower body","Black, Dark Blue",Dark


In [72]:
# Validation split: the single perceived-colour value each customer bought
# most often, counted in transaction rows.
merged_df = df_t_val.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'perceived_colour_value_name'])
    .size()
    .reset_index(name='perceived_color_purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'perceived_color_purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(1)
)
top_products_str = (
    top_products
    .groupby('customer_id')['perceived_colour_value_name']
    .apply(', '.join)
    .reset_index()
)
df_c_val = (
    df_c_val
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'perceived_colour_value_name': 'preferred_perceived_colors'})
)
df_c_val.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products,preferred_colors,preferred_perceived_colors
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.089763,Medium,Swimwear,"Black, Dark Blue",Dark
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,0.399881,High,"Garment Lower body, Garment Upper body","Black, White",Dark
2,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,0.387898,High,"Underwear, Garment Full body","Black, Dark Blue",Dark
3,0000c97821eb48d0e590fd309133f0a6c08f7750f64ccc...,0.106661,Medium,"Underwear, Garment Upper body","Dark Red, Light Orange",Bright
4,0000d6c053fc8f9389d4565051f12402d5774aa4a9d2e5...,0.050814,Medium,Swimwear,"Black, Yellow",Dark


In [73]:
# Test split: the single perceived-colour value each customer bought most
# often, counted in transaction rows.
merged_df = df_t_test.merge(df_i, on='article_id', how='left')
grouped_df = (
    merged_df
    .groupby(['customer_id', 'perceived_colour_value_name'])
    .size()
    .reset_index(name='perceived_color_purchase_count')
)
top_products = (
    grouped_df
    .sort_values(by=['customer_id', 'perceived_color_purchase_count'], ascending=[True, False])
    .groupby('customer_id')
    .head(1)
)
top_products_str = (
    top_products
    .groupby('customer_id')['perceived_colour_value_name']
    .apply(', '.join)
    .reset_index()
)
df_c_test = (
    df_c_test
    .merge(top_products_str, on='customer_id', how='left')
    .rename(columns={'perceived_colour_value_name': 'preferred_perceived_colors'})
)
df_c_test.head()

Unnamed: 0,customer_id,total_spent,spending_power,preferred_products,preferred_colors,preferred_perceived_colors
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.033864,Low,"Garment Lower body, Garment Upper body",Black,Dark
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.316729,High,"Swimwear, Garment Upper body","Dark Blue, Black",Dark
2,0000b7a134c3ec0d8842fad1fd4ca28517424c14fc4848...,0.059288,Medium,Garment Full body,"Dark Blue, Off White",Dusty Light
3,0000d6c053fc8f9389d4565051f12402d5774aa4a9d2e5...,0.089763,Medium,"Garment Upper body, Garment Full body","Black, Greenish Khaki",Dark
4,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,0.240508,High,"Swimwear, Garment Lower body","Black, Light Blue",Dark


# Merging Preferences into Embeddings

In [74]:
def combine_columns(row):
    """Join the non-missing values of ``row`` into one ", "-separated string.

    Parameters
    ----------
    row : iterable of str
        Typically one DataFrame row holding the preference columns. Entries
        that are missing (NaN/None) are skipped.

    Returns
    -------
    str
        The remaining values joined with ", ". Empty string if every value is
        missing.
    """
    # The preference columns come from left merges, so a customer without a
    # match carries NaN there; str.join would raise TypeError on it.
    return ', '.join(value for value in row if pd.notna(value))
columns_to_combine = ['preferred_products', 'preferred_colors', 'preferred_perceived_colors']

In [75]:
# Concatenate the three preference columns into one text field, for every split.
for split_frame in (df_c_train, df_c_val, df_c_test):
    split_frame['combined_preferences'] = split_frame[columns_to_combine].apply(combine_columns, axis=1)

In [76]:
# Tokenise the training customers' preference text on whitespace, then strip
# the ", " separators left glued to the tokens. Without this, "Black," and
# "Black" end up as two unrelated vocabulary entries.
# This rebinds `tokenized_descriptions` and `word2vec_model`, replacing the
# item-description versions created earlier in the notebook.
tokenized_descriptions = df_c_train['combined_preferences'].apply(
    lambda text: [tok for tok in (word.strip(',') for word in text.split()) if tok]
)
embedding_dim = 100
word2vec_model = Word2Vec(sentences=tokenized_descriptions, vector_size=embedding_dim, window=3, min_count=1, workers=4)

def generate_item_embedding(description, model):
    """Average the word vectors of the tokens in ``description``.

    This re-declares the helper defined earlier in the notebook; from this
    cell onward the definition below is the one in effect.

    Parameters
    ----------
    description : iterable of str
        Tokens to embed. Tokens missing from the model vocabulary are skipped.
    model : object exposing ``wv``
        ``model.wv`` must support ``token in wv``, ``wv[token]`` and expose
        ``vector_size`` (as a trained gensim ``Word2Vec`` model does).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or an all-zero vector
        of length ``model.wv.vector_size`` when no token is in the vocabulary.
    """
    keyed_vectors = model.wv
    word_vectors = [keyed_vectors[word] for word in description if word in keyed_vectors]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # Size the fallback from the model itself rather than from a notebook-level
    # global, so it always matches the vectors this model produces.
    return np.zeros(keyed_vectors.vector_size)
# One averaged Word2Vec vector per training customer, from their preference tokens.
df_c_train['combined_preferences_embeddings'] = tokenized_descriptions.apply(generate_item_embedding, args=(word2vec_model,))

In [77]:
# Tokenise the validation customers' preference text on whitespace, then strip
# the ", " separators left glued to the tokens. Without this, "Black," and
# "Black" end up as two unrelated vocabulary entries.
tokenized_descriptions = df_c_val['combined_preferences'].apply(
    lambda text: [tok for tok in (word.strip(',') for word in text.split()) if tok]
)
embedding_dim = 100
# NOTE(review): this fits a separate Word2Vec model on the validation split, so
# these vectors are not in the same embedding space as the training split's.
# If a downstream model is fitted on the training embeddings and evaluated on
# these, the training-split model should be reused instead. Confirm intent.
word2vec_model = Word2Vec(sentences=tokenized_descriptions, vector_size=embedding_dim, window=3, min_count=1, workers=4)
df_c_val['combined_preferences_embeddings'] = tokenized_descriptions.apply(lambda x: generate_item_embedding(x, word2vec_model))

In [78]:
# Tokenise the test customers' preference text on whitespace, then strip the
# ", " separators left glued to the tokens. Without this, "Black," and "Black"
# end up as two unrelated vocabulary entries.
tokenized_descriptions = df_c_test['combined_preferences'].apply(
    lambda text: [tok for tok in (word.strip(',') for word in text.split()) if tok]
)
embedding_dim = 100
# NOTE(review): this fits a separate Word2Vec model on the test split, so these
# vectors are not in the same embedding space as the training split's. If a
# downstream model is fitted on the training embeddings and evaluated on these,
# the training-split model should be reused instead. Confirm intent.
word2vec_model = Word2Vec(sentences=tokenized_descriptions, vector_size=embedding_dim, window=3, min_count=1, workers=4)
df_c_test['combined_preferences_embeddings'] = tokenized_descriptions.apply(lambda x: generate_item_embedding(x, word2vec_model))

# Average Time Lag Between Purchases

In [79]:
# Stand-in gap, in hours, for customers who made no repurchase inside the
# observed window. The transactions cover roughly 3 months, so a full year
# (365 days x 24 hours = 8760 hours) is used as a value well beyond any
# real observed gap.
no_repurchase_hours = 365 * 24

In [80]:
# Training split: mean gap between a customer's consecutive transaction rows.
# Rows are ordered by customer and date, so diff() yields the time since that
# customer's previous row (NaT for their first row).
df_t_train.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_train['time_diff'] = df_t_train.groupby('customer_id')['t_dat'].diff()
avg_time_diff = (
    df_t_train.groupby('customer_id')['time_diff']
    .mean()
    .reset_index()
    .rename(columns={'time_diff': 'avg_time_diff_btw_purchase'})
)
df_c_train = pd.merge(df_c_train, avg_time_diff, on='customer_id', how='left')
avg_gap_hours = df_c_train["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0
# Apply the "no repurchase" sentinel only where the mean gap is exactly zero
# (every row shares one timestamp), and decide that BEFORE rounding. Rounding
# first would also flag frequent buyers whose positive mean gap rounds down to
# 0 hours. Missing gaps stay NaN here and are filled by a later cell.
df_c_train["hours_between_purchases"] = avg_gap_hours.round().mask(avg_gap_hours == 0, no_repurchase_hours)

In [81]:
# Validation split: mean gap between a customer's consecutive transaction rows.
# Rows are ordered by customer and date, so diff() yields the time since that
# customer's previous row (NaT for their first row).
df_t_val.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_val['time_diff'] = df_t_val.groupby('customer_id')['t_dat'].diff()
avg_time_diff = (
    df_t_val.groupby('customer_id')['time_diff']
    .mean()
    .reset_index()
    .rename(columns={'time_diff': 'avg_time_diff_btw_purchase'})
)
df_c_val = pd.merge(df_c_val, avg_time_diff, on='customer_id', how='left')
avg_gap_hours = df_c_val["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0
# Apply the "no repurchase" sentinel only where the mean gap is exactly zero
# (every row shares one timestamp), and decide that BEFORE rounding. Rounding
# first would also flag frequent buyers whose positive mean gap rounds down to
# 0 hours. Missing gaps stay NaN here and are filled by a later cell.
df_c_val["hours_between_purchases"] = avg_gap_hours.round().mask(avg_gap_hours == 0, no_repurchase_hours)

In [82]:
# Test split: mean gap between a customer's consecutive transaction rows.
# Rows are ordered by customer and date, so diff() yields the time since that
# customer's previous row (NaT for their first row).
df_t_test.sort_values(by=['customer_id', 't_dat'], inplace=True)
df_t_test['time_diff'] = df_t_test.groupby('customer_id')['t_dat'].diff()
avg_time_diff = (
    df_t_test.groupby('customer_id')['time_diff']
    .mean()
    .reset_index()
    .rename(columns={'time_diff': 'avg_time_diff_btw_purchase'})
)
df_c_test = pd.merge(df_c_test, avg_time_diff, on='customer_id', how='left')
avg_gap_hours = df_c_test["avg_time_diff_btw_purchase"].dt.total_seconds() / 3600.0
# Apply the "no repurchase" sentinel only where the mean gap is exactly zero
# (every row shares one timestamp), and decide that BEFORE rounding. Rounding
# first would also flag frequent buyers whose positive mean gap rounds down to
# 0 hours. Missing gaps stay NaN here and are filled by a later cell.
df_c_test["hours_between_purchases"] = avg_gap_hours.round().mask(avg_gap_hours == 0, no_repurchase_hours)

In [83]:
# Customers with no computable gap (NaN) get the "no repurchase" sentinel too.
for split_frame in (df_c_train, df_c_val, df_c_test):
    split_frame["hours_between_purchases"] = split_frame["hours_between_purchases"].fillna(no_repurchase_hours)

In [84]:
# Persist the feature-engineered frames under <current working dir>/data,
# written in the same order as listed here.
output_dir = os.path.join(os.getcwd(), 'data')
frames_to_save = {
    'df_t_train_fe.pkl': df_t_train,
    'df_t_val_fe.pkl': df_t_val,
    'df_t_test_fe.pkl': df_t_test,
    'df_c_train_fe.pkl': df_c_train,
    'df_c_val_fe.pkl': df_c_val,
    'df_c_test_fe.pkl': df_c_test,
    'df_i_fe.pkl': df_i,
}
for file_name, frame in frames_to_save.items():
    frame.to_pickle(os.path.join(output_dir, file_name))