In [None]:
import pandas as pd

# Allow up to 100 columns to be rendered before pandas elides the middle ones.
pd.options.display.max_columns = 100

In [None]:
# Columns that both transaction files read with the pandas 'string' dtype;
# all remaining columns keep whatever dtype read_csv infers.
transaction_dtypes = {
    column_name: 'string'
    for column_name in (
        'authorized_flag',
        'card_id',
        'category_1',
        'category_3',
        'merchant_id',
        'purchase_date',
    )
}

# Historical transactions; 'purchase_date' is also listed in parse_dates.
df_trans = pd.read_csv(
    '/kaggle/input/elo-merchant-category-recommendation/historical_transactions.csv',
    dtype=transaction_dtypes,
    parse_dates=['purchase_date'],
)

# New-merchant transactions, loaded with the identical options.
df_merch_trans = pd.read_csv(
    '/kaggle/input/elo-merchant-category-recommendation/new_merchant_transactions.csv',
    dtype=transaction_dtypes,
    parse_dates=['purchase_date'],
)

In [None]:
# Stack the new-merchant rows underneath the historical ones. Row labels of
# both inputs are kept as they are (ignore_index is not used).
df_trans = pd.concat(objs=(df_trans, df_merch_trans), axis=0)
# The separate new-merchant frame is no longer needed; drop the name to free memory.
del df_merch_trans

In [None]:
# Preview the combined transaction table (rendered as the cell's output).
df_trans

In [None]:
# Replace missing values in the three columns that are imputed with constants.
#
# The filled Series is assigned back to the frame on purpose: the previous
# `df_trans[col].fillna(..., inplace=True)` form mutates an intermediate object
# obtained by `df_trans[col]`. pandas 2.x warns about that pattern and, with
# Copy-on-Write enabled, it leaves df_trans itself unchanged.
fill_values = {
    'category_3': 'A',
    'category_2': 1.0,
    'merchant_id': 'M_ID_00a6ca8a8a',
}
for fill_column, fill_value in fill_values.items():
    df_trans[fill_column] = df_trans[fill_column].fillna(fill_value)

# Show how many missing values remain in each column.
df_trans.isna().sum()

In [None]:
# Lookup tables for turning the letter codes into integers. A value that is
# not a key of the table maps to None, exactly as dict.get does.
yes_no_codes = {'Y':1, 'N':0}
category_3_codes = {'A':0, 'B':1, 'C':2, 'D': 3}

for flag_column in ('category_1', 'authorized_flag'):
    df_trans[flag_column] = df_trans[flag_column].apply(yes_no_codes.get)
df_trans['category_3'] = df_trans['category_3'].apply(category_3_codes.get)

# Fill any remaining category_2 gaps with the mean of the present values
# (this changes nothing when the column has no missing values left).
category_2_mean = df_trans['category_2'].dropna().mean()
df_trans['category_2'] = df_trans['category_2'].fillna(category_2_mean)

In [None]:
# Derive calendar features from the purchase timestamp.
purchase_ts = df_trans['purchase_date'].dt
df_trans['purchase_year'] = purchase_ts.year.astype('int16')
df_trans['purchase_month'] = purchase_ts.month.astype('int8')
df_trans['purchase_day'] = purchase_ts.day.astype('int8')
df_trans['purchase_weekday'] = purchase_ts.weekday.astype('int8')
# pandas numbers weekdays Monday=0 ... Sunday=6, so the weekend is Saturday (5)
# and Sunday (6). The previous `x > 5` test flagged Sundays only.
df_trans['purchase_on_weekend'] = (df_trans['purchase_weekday'] >= 5).astype('int8')
df_trans['purchase_hour'] = purchase_ts.hour.astype('int8')
# "Night" is any purchase made at hour 23 or during hours 0-7. The comparison is
# done on the whole column instead of a Python-level apply per row.
purchase_hour = df_trans['purchase_hour']
df_trans['purchase_at_night'] = ((purchase_hour > 22) | (purchase_hour < 8)).astype('int8')

In [None]:
# Map the values -1 and 999 in `installments` to 0 (presumably both are
# placeholder codes rather than real instalment counts -- TODO confirm against
# the data dictionary).
#
# The result is assigned back instead of calling `.replace(..., inplace=True)`
# on `df_trans['installments']`: the in-place form works on an intermediate
# object, triggers a chained-assignment warning in pandas 2.x and has no effect
# on df_trans when Copy-on-Write is enabled.
df_trans['installments'] = df_trans['installments'].replace([-1, 999], 0)

In [None]:
# Show the first transaction row to check the columns produced so far.
df_trans.head(1)

In [None]:
# Aggregations computed for every card_id. Each column maps to the list of
# statistics to produce, which yields two-level column labels (column, statistic).
card_aggregations = {
    'card_id': ['count'],
    'category_1': ['max', 'min', 'mean'],
    'category_2': ['max', 'min', 'mean'],
    'category_3': ['max', 'min', 'mean'],
    'city_id': ['count'],
    'installments': ['max', 'min', 'count'],
    'merchant_category_id': ['min', 'max'],
    'month_lag': ['min', 'max', 'mean'],
    'purchase_amount': ['min', 'max', 'mean', 'sum'],
    'purchase_date': ['min', 'max'],
    'purchase_year': ['min', 'max'],
    'purchase_month': ['min', 'max'],
    'purchase_day': ['min', 'max'],
    'purchase_weekday': ['mean'],
    'purchase_on_weekend': ['mean'],
    'purchase_hour': ['mean'],
    'purchase_at_night': ['mean'],
    # Most frequent merchant of the card. This has to remain an anonymous
    # lambda: the resulting column is labelled '<lambda>' and a later cell
    # renames 'merchant_id_<lambda>'.
    'merchant_id': lambda merchant_ids: merchant_ids.value_counts().idxmax(),
}

grouped_by_card = df_trans.groupby('card_id')
df_full = grouped_by_card.agg(card_aggregations).reset_index()
# The raw transactions are not used after this point; release them.
del grouped_by_card, df_trans

In [None]:
# Number of whole days between a card's first and last purchase.
first_purchase = df_full['purchase_date']['min']
last_purchase = df_full['purchase_date']['max']
df_full['active_period'] = (last_purchase - first_purchase).dt.days

In [None]:
# Collapse the two-level column labels into single strings "<column>_<statistic>".
flat_labels = []
for column_name, statistic in df_full.columns:
    flat_labels.append('_'.join((column_name, statistic)))
df_full.columns = flat_labels
# Labels with an empty second level end in a bare underscore; restore the key
# column's plain name so it can be used for merging.
df_full = df_full.rename(columns={'card_id_': 'card_id'})
df_full

In [None]:
# Both files parse the same column as a datetime.
date_columns = ['first_active_month']

df_train = pd.read_csv(
    '/kaggle/input/elo-merchant-category-recommendation/train.csv',
    parse_dates=date_columns,
)
df_test = pd.read_csv(
    '/kaggle/input/elo-merchant-category-recommendation/test.csv',
    parse_dates=date_columns,
)

In [None]:
# Attach the per-card aggregates to every train/test row. A left join keeps all
# rows of the train/test frames even when a card has no aggregate row.
df_train = pd.merge(df_train, df_full, how='left', on=['card_id'])
df_test = pd.merge(df_test, df_full, how='left', on=['card_id'])
# The aggregate table has been merged into both frames; release it.
del df_full

In [None]:
# Fill missing activation dates in the test set with the mean activation date.
# The filled Series is assigned back rather than using
# `df_test[col].fillna(..., inplace=True)`, which mutates an intermediate object:
# pandas 2.x warns about it and under Copy-on-Write df_test would stay unchanged.
mean_first_active = df_test['first_active_month'].mean()
df_test['first_active_month'] = df_test['first_active_month'].fillna(mean_first_active)

In [None]:
# Inspect the first five training rows after merging the card aggregates.
df_train.head(5)

In [None]:
# Inspect the first five test rows after merging the card aggregates.
df_test.head(5)

In [None]:
# The same feature derivation is applied to the train and the test frame.
for frame in (df_train, df_test):
    # Keep a handle on the datetime column: it is overwritten with the month
    # number further down, after everything that needs the full date is done.
    activation = frame['first_active_month']
    # Days from card activation to the first / last recorded purchase.
    frame['delay_min'] = (frame['purchase_date_min'] - activation).dt.days
    frame['delay_max'] = (frame['purchase_date_max'] - activation).dt.days
    # Split the activation date into year and month number.
    frame['first_active_year'] = activation.dt.year
    frame['first_active_month'] = activation.dt.month
    # The raw purchase timestamps are not kept as features.
    frame.drop(columns=['purchase_date_min', 'purchase_date_max'], inplace=True)

In [None]:
# Merchant-level table, read with pandas' default dtype inference.
merchants_csv = '/kaggle/input/elo-merchant-category-recommendation/merchants.csv'
df_merch = pd.read_csv(merchants_csv)
df_merch

In [None]:
# Integer encodings for the letter-coded merchant columns. Values missing from
# a table map to None, as dict.get returns.
yes_no_codes = {'Y':1, 'N':0}
range_codes = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4}
merchant_encodings = (
    ('category_1', yes_no_codes),
    ('category_4', yes_no_codes),
    ('most_recent_sales_range', range_codes),
    ('most_recent_purchases_range', range_codes),
)
for encoded_column, codes in merchant_encodings:
    df_merch[encoded_column] = df_merch[encoded_column].apply(codes.get)

# Mean-impute every float64 column that contains at least one missing value.
for column in df_merch.columns:
    column_values = df_merch[column]
    if column_values.dtype == 'float64' and column_values.isna().any():
        df_merch[column] = column_values.fillna(column_values.mean())

# Keep a single row per merchant_id (the first occurrence).
df_merch = df_merch.drop_duplicates(subset='merchant_id')

In [None]:
# The most-frequent-merchant aggregate was produced by an anonymous lambda and
# therefore carries the label 'merchant_id_<lambda>'; give it its plain name.
merchant_id_rename = {'merchant_id_<lambda>': 'merchant_id'}
df_train = df_train.rename(columns=merchant_id_rename)
df_test = df_test.rename(columns=merchant_id_rename)

In [None]:
# Merchant columns that are joined onto the card rows: everything except the
# identifier-like columns listed here.
merchant_features = df_merch.drop(
    columns=['merchant_group_id', 'merchant_category_id', 'subsector_id', 'city_id', 'state_id']
)

# Left-join on each card's most frequent merchant, then discard the join key.
df_train = (
    df_train
    .merge(merchant_features, on=['merchant_id'], how='left')
    .drop(columns='merchant_id')
)
df_test = (
    df_test
    .merge(merchant_features, on=['merchant_id'], how='left')
    .drop(columns='merchant_id')
)

In [None]:
# Show one training row to check the columns after the merchant merge.
df_train.head(1)

In [None]:
# Columns removed before modelling.
#
# Columns deliberately NOT in this list (they were commented out of it and so
# stay in the frames): avg_purchases_lag12, active_months_lag12, card_id_count,
# avg_sales_lag12, purchase_amount_min, purchase_amount_max,
# purchase_amount_mean, delay_max, category_1_mean, category_2_mean,
# category_3_mean, category_2, category_1, category_4, numerical_2, numerical_1.
unnecessary_cols = [
    'category_1_max',
    'category_1_min',
    'category_2_min',
    'category_2_max',
    'category_3_max',
    'category_3_min',
    'city_id_count',
    'installments_max',
    'installments_min',
    'merchant_category_id_min',
    'merchant_category_id_max',
    'month_lag_min',
    'month_lag_max',
    'month_lag_mean',
    'purchase_year_min',
    'purchase_year_max',
    'purchase_month_min',
    'purchase_month_max',
    'purchase_day_min',
    'purchase_day_max',
    'purchase_weekday_mean',
    'purchase_hour_mean',
    'first_active_year',
    'first_active_month',
    'avg_purchases_lag6',
    'avg_purchases_lag3',
    'avg_sales_lag3',
    'active_months_lag6',
    'most_recent_purchases_range',
    'most_recent_sales_range',
    'installments_count',
    'delay_min',
    'active_period_',
    'active_months_lag3',
    'avg_sales_lag6',
    'purchase_on_weekend_mean',
    'purchase_at_night_mean',
]

# New frames without the listed columns; df_train / df_test stay untouched.
df_train_squeezed = df_train.drop(unnecessary_cols, axis=1)
df_test_squeezed = df_test.drop(unnecessary_cols, axis=1)

In [None]:
# Preview the reduced training frame that feeds the model.
df_train_squeezed

In [None]:
import torch 
import torch.nn as nn

# Feature matrices: every remaining column except the identifier (and, for the
# training frame, the target).
train_features = df_train_squeezed.drop(columns=['card_id', 'target'])
test_features = df_test_squeezed.drop(columns=['card_id'])
# Target as a column vector with one row per training example.
train_target = df_train_squeezed['target'].to_numpy().reshape(-1, 1)

# Convert everything to float32 tensors.
x_train = torch.tensor(train_features.to_numpy()).float()
y_train = torch.tensor(train_target).float()
x_test = torch.tensor(test_features.to_numpy()).float()

In [None]:
import copy
import numpy as np
def train(model, optimizer, loss, x_train, y_train, x_val, y_val, n_epochs=30, batch_size=1000):
    """Train `model` with shuffled mini-batches and track the validation loss.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise. It is moved to the GPU when one is available
        and is left in eval mode after the last epoch.
    optimizer : torch.optim.Optimizer
        Optimiser that was built over `model`'s parameters.
    loss : callable
        Called as ``loss(prediction, target)``; must return a scalar tensor.
    x_train, y_train : torch.Tensor
        Training inputs and targets, indexed along the first dimension.
    x_val, y_val : torch.Tensor
        Validation inputs and targets, evaluated in one pass every epoch.
    n_epochs : int
        Number of passes over the training data.
    batch_size : int
        Number of examples per optimisation step (default 1000, the value that
        used to be hard-coded).

    Returns
    -------
    train_loss_history : list of float
        Mean of the per-batch training losses for every epoch.
    val_loss_history : list of float
        Validation loss for every epoch.
    best_params
        Bound ``parameters`` method of a deep copy of the model taken at the
        epoch with the lowest validation loss (ties keep the later epoch).
        ``best_params()`` iterates over the snapshot's parameters and
        ``best_params.__self__`` is the snapshot module itself. ``None`` when no
        epoch ran or no validation loss compared as <= the running best (NaN).
    """
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    # The validation tensors never change, so move them to the device once.
    x_val = x_val.to(device)
    y_val = y_val.to(device)
    n_samples = x_train.size(0)

    train_loss_history = []
    val_loss_history = []
    best_params = None
    best_val_loss = float('inf')

    for epoch in range(1, n_epochs + 1):
        model.train()
        # Visit the training examples in a fresh random order every epoch.
        order_train = np.random.permutation(n_samples)
        running_loss_train = 0.0
        iters = 0
        for start_ind in range(0, n_samples, batch_size):
            iters += 1
            indexes = order_train[start_ind:start_ind + batch_size]
            x_train_batch = x_train[indexes].to(device)
            y_train_batch = y_train[indexes].to(device)
            optimizer.zero_grad()
            y_train_batch_pred = model(x_train_batch)
            loss_value_train = loss(y_train_batch_pred, y_train_batch)
            # .item() yields a plain float, so no tensor or graph is retained.
            running_loss_train += loss_value_train.item()
            loss_value_train.backward()
            optimizer.step()
        # max(..., 1) avoids a division by zero when the training set is empty.
        train_loss = running_loss_train / max(iters, 1)
        train_loss_history.append(train_loss)

        model.eval()
        # Evaluate the model that was passed in (not a global) and skip autograd
        # bookkeeping, which is not needed for validation.
        with torch.no_grad():
            val_loss = loss(model(x_val), y_val).item()
        val_loss_history.append(val_loss)
        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            # Snapshot the whole module; the bound method of the copy is kept as
            # the return value so existing callers see the same kind of object.
            best_params = copy.deepcopy(model).parameters
        print(f'epoch: {epoch:4d}; train loss: {train_loss:.4f}; val loss: {val_loss:.4f}')
    return train_loss_history, val_loss_history, best_params

In [None]:
class Net(nn.Module):
    """Fully connected regressor: batch norm, three sigmoid hidden layers, one output.

    Layer widths are n -> 100 -> 150 -> 75 -> 1, where n is the number of
    input features.
    """

    def __init__(self, n):
        """Create the layers for inputs with `n` features."""
        super().__init__()
        # Normalises the raw input features before the first linear layer.
        self.bn1 = nn.BatchNorm1d(n)
        self.fc1 = nn.Linear(in_features=n, out_features=100)
        self.act1 = nn.Sigmoid()
        self.fc2 = nn.Linear(in_features=100, out_features=150)
        self.act2 = nn.Sigmoid()
        self.fc3 = nn.Linear(in_features=150, out_features=75)
        self.act3 = nn.Sigmoid()
        # Single output unit without an activation.
        self.fc4 = nn.Linear(in_features=75, out_features=1)

    def forward(self, x):
        """Return the prediction, one output value per input row."""
        stages = (
            self.bn1,
            self.fc1, self.act1,
            self.fc2, self.act2,
            self.fc3, self.act3,
            self.fc4,
        )
        for stage in stages:
            x = stage(x)
        return x
# One input unit per feature column of the training tensor.
n_features = x_train.shape[1]
net = Net(n_features)
# Adam over all of the network's parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
# Mean squared error between prediction and target.
loss = nn.MSELoss()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training examples for validation; the fixed random_state
# makes the split repeatable. Note that x_train / y_train are overwritten with
# the reduced training portion.
split_parts = train_test_split(x_train, y_train, test_size=0.25, random_state=1)
x_train, x_val, y_train, y_val = split_parts

In [None]:
# Train for 100 epochs; keeps the per-epoch loss histories and the best-epoch
# snapshot that train() returns as its third value.
train_loss, val_loss, params = train(
    model=net,
    optimizer=optimizer,
    loss=loss,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    n_epochs=100,
)

In [None]:
import matplotlib.pyplot as plt

# Training (blue) and validation (red) loss per epoch on a single set of axes.
fig, ax = plt.subplots()
ax.plot(train_loss, c='b', label='train')
ax.plot(val_loss, c='r', label='val')
ax.grid()
ax.legend()
ax.set_title('loss history')

In [None]:
# Load the best-epoch weights into `net`.
#
# `params` (third value returned by train) is the bound `parameters` method of
# a deep-copied snapshot of the model, so the snapshot module is reachable as
# `params.__self__`. The previous `net.parameters = copy.deepcopy(params)` only
# replaced the `parameters` attribute on `net`; the weights used by
# `net.forward` stayed those of the last epoch. load_state_dict copies the
# snapshot's parameters and buffers (e.g. batch-norm running statistics) into net.
best_snapshot = getattr(params, '__self__', None)
if best_snapshot is not None:
    net.load_state_dict(best_snapshot.state_dict())

In [None]:
# Predict the target for every test card.
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')
answer = pd.DataFrame(df_test['card_id'])
net = net.to(device)
# Set eval mode explicitly so batch norm uses its running statistics instead of
# depending on the mode that training happened to leave the model in.
net.eval()
# No gradients are needed for inference.
with torch.no_grad():
    y_test = net(x_test.to(device)).cpu()
# The network output has shape (n_rows, 1); a DataFrame column needs a flat,
# one-dimensional array.
answer['target'] = y_test.numpy().ravel()

In [None]:
# Write card_id / target pairs without the row index.
submission_path = '/kaggle/working/submission.csv'
answer.to_csv(submission_path, index=False)