In [11]:
######### TCN

# Install necessary libraries
!pip install numpy pandas scikit-learn matplotlib tensorflow

# Import libraries
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, InputLayer, Flatten, LayerNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping





In [12]:

# Define a TCN layer in TensorFlow
class TCNLayer(tf.keras.layers.Layer):
    def __init__(self, nb_filters, kernel_size, dilations):
        super(TCNLayer, self).__init__()
        self.convs = [Conv1D(nb_filters, kernel_size, dilation_rate=d, padding='causal') for d in dilations]
        self.layernorms = [LayerNormalization() for _ in dilations]

    def call(self, inputs):
        x = inputs
        for conv, norm in zip(self.convs, self.layernorms):
            x = conv(x)
            x = tf.keras.activations.relu(x)
            x = norm(x)
        return x



In [13]:

# Read product catalog data
data = pd.read_csv("product_catalog.csv")
data['categories'] = data['categories'].fillna('[]')
data['categories'] = data['categories'].apply(literal_eval)  # Convert string to list

# Encode categories
mlb = MultiLabelBinarizer()
categories_encoded = mlb.fit_transform(data['categories'])

# Normalize numerical features
attributes = data[['attribute_1', 'attribute_2', 'attribute_3', 'attribute_4', 'attribute_5']].fillna(0)
scaler = StandardScaler()
attributes_normalized = scaler.fit_transform(attributes)

# Combine features
features = np.hstack([attributes_normalized, categories_encoded])

# Apply clustering
kmeans = KMeans(n_clusters=1000, random_state=42)
data['cluster'] = kmeans.fit_predict(features)

In [14]:

# Read transactions data
transactions = pd.read_csv("transactions.csv")
transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])

# Count the number of transactions for each customer
transaction_counts = transactions.groupby('customer_id')['purchase_date'].count().reset_index(name='transaction_count')
max_transactions = transaction_counts['transaction_count'].max()
bins = np.linspace(0, max_transactions, 11)  # Creates 10 bins
labels = range(1, 11)  # Categories from 1 to 10
customer_categories = pd.cut(transaction_counts['transaction_count'], bins=bins, labels=labels, include_lowest=True)
transaction_counts['shopping_tendency'] = customer_categories

transactions = pd.merge(transactions, transaction_counts[['customer_id', 'shopping_tendency']], on='customer_id', how='left')
transactions = transactions.sort_values(by=['customer_id', 'product_id', 'purchase_date'])

transactions

Unnamed: 0,customer_id,product_id,purchase_date,quantity,shopping_tendency
390966,0,1505,2020-09-01,1,1
819667,0,6446,2020-12-11,1,1
978365,0,6446,2021-01-15,1,1
709719,0,9325,2020-11-20,1,1
268197,0,12468,2020-08-03,1,1
...,...,...,...,...,...
995072,46137,22403,2021-01-18,1,1
1069936,46137,22583,2021-01-31,1,1
287156,46137,28343,2020-08-08,1,1
287097,46137,28900,2020-08-08,2,1


In [15]:

transactions['prev_purchase_date'] = transactions.groupby(['customer_id', 'product_id'])['purchase_date'].shift(1)
transactions['days_since_last_purchase'] = (transactions['purchase_date'] - transactions['prev_purchase_date']).dt.days.fillna(0).astype(int)
transactions['last_quantity'] = transactions.groupby(['customer_id', 'product_id'])['quantity'].shift(1, fill_value=0).astype(int)

product_popularity = transactions.groupby('product_id')['customer_id'].count().reset_index(name='product_popularity')
transactions = pd.merge(transactions, product_popularity[['product_id', 'product_popularity']], on='product_id', how='left')

first_purchase = transactions.groupby('customer_id')['purchase_date'].min().reset_index()
first_purchase.columns = ['customer_id', 'first_purchase_date']
first_purchase['days_since_first_purchase'] = (pd.to_datetime('today') - first_purchase['first_purchase_date']).dt.days
transactions = pd.merge(transactions, first_purchase[['customer_id', 'days_since_first_purchase']], on='customer_id', how='left')

transactions['rolling_avg_quantity'] = transactions.groupby('customer_id')['quantity'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
transactions['rolling_avg_days_between'] = transactions.groupby('customer_id')['days_since_last_purchase'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)

total_quantity = transactions.groupby('customer_id')['quantity'].sum().reset_index(name='total_quantity')
average_quantity = transactions.groupby('customer_id')['quantity'].mean().reset_index(name='average_quantity')

customer_features = pd.merge(transaction_counts[['customer_id', 'transaction_count']], total_quantity, on='customer_id')
customer_features = pd.merge(customer_features, average_quantity, on='customer_id')

customer_features

Unnamed: 0,customer_id,transaction_count,total_quantity,average_quantity
0,0,18,20,1.111111
1,1,15,18,1.200000
2,2,21,25,1.190476
3,3,40,54,1.350000
4,4,23,39,1.695652
...,...,...,...,...
46133,46133,25,27,1.080000
46134,46134,29,35,1.206897
46135,46135,7,13,1.857143
46136,46136,35,69,1.971429


In [16]:

scaler = StandardScaler()  # Standardize the features
scaled_features = scaler.fit_transform(customer_features[['transaction_count', 'total_quantity', 'average_quantity']])
kmeans = KMeans(n_clusters=1000, random_state=42)
customer_features['cluster_customer'] = kmeans.fit_predict(scaled_features)
transactions = pd.merge(transactions, customer_features[['customer_id', 'cluster_customer']], on='customer_id', how='left')

# Merge transaction data with product catalog to add cluster information
merged_df = pd.merge(transactions, data[['product_id', 'cluster']], on='product_id', how='left')
merged_df = merged_df.sort_values(by=['customer_id', 'cluster', 'purchase_date'])

merged_df['cumulative_quantity'] = merged_df.groupby(['customer_id', 'cluster'])['quantity'].cumsum()
merged_df['last_cumulative_quantity'] = merged_df.groupby(['customer_id', 'cluster'])['cumulative_quantity'].shift(1, fill_value=0).astype(int)

merged_df


Unnamed: 0,customer_id,product_id,purchase_date,quantity,shopping_tendency,prev_purchase_date,days_since_last_purchase,last_quantity,product_popularity,days_since_first_purchase,rolling_avg_quantity,rolling_avg_days_between,cluster_customer,cluster,cumulative_quantity,last_cumulative_quantity
16,0,31683,2020-12-04,1,1,NaT,0,0,1515,1675,1.2,13.2,361,34,1,0
12,0,28231,2020-11-20,1,1,NaT,0,0,41,1675,1.0,37.8,361,41,1,0
13,0,28231,2020-12-11,1,1,2020-11-20,21,1,41,1675,1.0,42.0,361,41,2,1
14,0,28231,2021-01-25,2,1,2020-12-11,45,1,41,1675,1.2,25.8,361,41,4,2
11,0,26387,2020-10-09,1,1,NaT,0,0,453,1675,1.0,37.8,361,71,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1071533,46137,22403,2021-01-18,1,1,NaT,0,0,74,1611,1.0,0.0,5,322,1,0
1071532,46137,20209,2020-08-08,1,1,NaT,0,0,1101,1611,1.0,0.0,5,356,1,0
1071528,46137,2550,2021-01-18,1,1,NaT,0,0,269,1611,1.0,0.0,5,450,1,0
1071530,46137,11338,2021-01-31,1,1,NaT,0,0,95,1611,1.0,0.0,5,491,1,0


In [17]:

train_df = merged_df[['customer_id', 'product_id', 'cluster', 'last_quantity', 'days_since_last_purchase', 'purchase_date',
                      'quantity', 'cumulative_quantity', 'last_cumulative_quantity', 'shopping_tendency',
                      'cluster_customer', 'product_popularity', 'days_since_first_purchase', 'rolling_avg_quantity',
                      'rolling_avg_days_between']]

# Prepare data for TCN model
X_train = train_df[['cluster', 'last_quantity', 'last_cumulative_quantity', 'shopping_tendency', 'cluster_customer',
                    'product_popularity', 'days_since_first_purchase', 'rolling_avg_quantity', 'rolling_avg_days_between']]
y_train = train_df['days_since_last_purchase']

# Reshape data for TCN
X_train = np.expand_dims(X_train.values, axis=1)  # Add a time dimension
y_train = y_train.values

# Define the TCN model
model = Sequential()
model.add(InputLayer(input_shape=(1, X_train.shape[2])))  # 1 is the time step dimension
model.add(TCNLayer(nb_filters=64, kernel_size=3, dilations=[1, 2, 4]))  # Custom TCN layer with adjusted parameters
model.add(Dropout(0.3))  # Add dropout for regularization
model.add(Flatten())  # Flatten the output of the TCN layer
model.add(Dense(1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mse')
model.summary()

# Train the model with EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])




Epoch 1/10
[1m26789/26789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 2ms/step - loss: 548.7889 - val_loss: 284.5249
Epoch 2/10
[1m26789/26789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 2ms/step - loss: 269.5471 - val_loss: 208.2076
Epoch 3/10
[1m26789/26789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2ms/step - loss: 244.5580 - val_loss: 208.2357
Epoch 4/10
[1m26789/26789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 2ms/step - loss: 233.9265 - val_loss: 259.6197
Epoch 5/10
[1m26789/26789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 3ms/step - loss: 231.7672 - val_loss: 244.4582
Epoch 6/10
[1m26789/26789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 2ms/step - loss: 229.0293 - val_loss: 229.0387
Epoch 7/10
[1m26789/26789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2ms/step - loss: 224.6856 - val_loss: 212.2803


<keras.src.callbacks.history.History at 0x7d1ac1c33760>

In [18]:

# Prepare the test data
test = pd.read_csv("test.csv")
test = pd.merge(test, train_df[['customer_id', 'product_id', 'cluster', 'last_quantity',
                                'cumulative_quantity', 'shopping_tendency', 'cluster_customer',
                                'product_popularity', 'days_since_first_purchase', 'rolling_avg_quantity',
                                'rolling_avg_days_between']], on=['customer_id', 'product_id'], how='left')

test = test.rename(columns={'quantity': 'last_quantity', 'cumulative_quantity': 'last_cumulative_quantity'})
X_test = test[['cluster', 'last_quantity', 'last_cumulative_quantity', 'shopping_tendency', 'cluster_customer',
               'product_popularity', 'days_since_first_purchase', 'rolling_avg_quantity', 'rolling_avg_days_between']]

# Reshape test data for TCN
X_test = np.expand_dims(X_test.values, axis=1)

# Predict the number of days since the last purchase
test['prediction'] = model.predict(X_test).clip(min=0).astype(int)

# Merge the test data with the last purchase data to add the purchase date
test_df_merged = test.merge(train_df[['customer_id', 'product_id', 'purchase_date']], on=['customer_id', 'product_id'], how='left')


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [19]:

# Adjust dates based on prediction and purchase date
def adjust_date(row):
    if row['prediction'] != 0:
        while row['predicted_purchase_date'] < pd.to_datetime('2021-02-01'):
            row['predicted_purchase_date'] += pd.to_timedelta(row['prediction'], unit='D')
    return row['predicted_purchase_date']

# Calculate the initial predicted_purchase_date
test_df_merged['predicted_purchase_date'] = test_df_merged['purchase_date'] + pd.to_timedelta(test_df_merged['prediction'], unit='D')

# Apply the adjustment function to each row
test_df_merged['predicted_purchase_date'] = test_df_merged.apply(adjust_date, axis=1)

# Calculate the number of days between the predicted purchase date and February 1st, 2021
feb_1st_2021 = pd.to_datetime('2021-02-01')
test_df_merged['days_from_feb_1st'] = (test_df_merged['predicted_purchase_date'] - feb_1st_2021).dt.days

# Map day values to target values (weeks)
def map_weeks_to_feb(days):
    if days > 29:
        return 0
    elif 0 <= days <= 8:
        return 1
    elif 8 < days <= 15:
        return 2
    elif 15 < days <= 22:
        return 3
    elif 22 < days <= 29:
        return 4
    else:
        return 0

# Apply the mapping function
test_df_merged['prediction'] = test_df_merged['days_from_feb_1st'].apply(map_weeks_to_feb).astype(int)

# Select relevant columns and ensure unique IDs
df_selected = test_df_merged[['id', 'customer_id', 'product_id', 'prediction']]
df_selected = df_selected.drop_duplicates(subset=['id'])

# Save the result to a CSV file
df_selected.to_csv('submission.csv', index=False)

# Output the distribution of predictions
print(df_selected['prediction'].value_counts())

prediction
0    7263
1    2643
2      72
3      16
4       6
Name: count, dtype: int64
