In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score

In [2]:
from google.colab import drive 
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [3]:
ROOT_DIR = '/content/gdrive/MyDrive/Dacon/CreditCard/'

df_train = pd.read_csv(f'{ROOT_DIR}/train.csv')
df_valid = pd.read_csv(f'{ROOT_DIR}/val.csv')

print(df_train.shape)
print(df_valid.shape)

(113842, 31)
(28462, 32)


In [4]:
df_train.head(2)

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,3,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972
1,4,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972


In [5]:
df_valid.head(2)

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,V29,V30,Class
0,10,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,-0.255991,-0.994878,0
1,22,0.962496,0.328461,-0.171479,2.109204,1.129566,1.696038,0.107712,0.521502,-1.191311,...,0.402492,-0.048508,-1.371866,0.390814,0.199964,0.016371,-0.014605,0.168937,-0.994784,0


In [6]:
df_train.drop(columns=['ID'], inplace=True)
df_valid.drop(columns=['ID'], inplace=True)

print(df_train.shape)
print(df_valid.shape)

(113842, 30)
(28462, 31)


In [7]:
df_train.head(2)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972
1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972


In [8]:
def fix_seeds(seed=1234):
    """ fix the seed for reproducibility """
    
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed) 
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [9]:
SEED = 7777
BATCH_SIZE = 2 ** 14
EPOCHS = 300
LR = 0.01

In [10]:
fix_seeds(SEED)

In [11]:
class CreditCardDataset(Dataset):
    def __init__(self, df, mode):
        self.df = df
        self.label = None
        self.mode = mode

        if self.mode == 'valid':
            self.label = self.df['Class'].values
            self.df = self.df.drop(columns=['Class']).values
        else:
            self.df = self.df.values

    def __getitem__(self, index):
        if self.mode == 'valid':
            X = self.df[index]
            y = self.label[index]
            return torch.Tensor(X), y
        else:
            X = self.df[index]
            return torch.Tensor(X)
        
    def __len__(self):
        return len(self.df)

In [34]:
class AutoEncoder(nn.Module):
    def __init__(self, n_features):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.BatchNorm1d(64),
            nn.ELU(),

            nn.Linear(64, 128),
            nn.BatchNorm1d(128),
            nn.ELU(),

            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ELU(),
        )

        self.decoder = nn.Sequential(
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ELU(),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ELU(),

            nn.Linear(64, n_features),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [13]:
train_dataset = CreditCardDataset(df=df_train, mode='train')
valid_dataset = CreditCardDataset(df=df_valid, mode='valid')

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=BATCH_SIZE)

In [14]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


In [15]:
print(df_train.shape)
n_features = df_train.shape[-1]

(113842, 30)


In [24]:
def evaluate(model, device, valid_loader, criterion):
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    preds, trues = [], []
    with torch.no_grad():
        for X, y in valid_loader:
            X = X.float().to(DEVICE)
            X_out = model(X)
            sim = cos(X, X_out).cpu().tolist()
            pred = np.where(np.array(sim) < 0.95, 1, 0).tolist()
            preds += pred
            trues += y.tolist()

    return f1_score(preds, trues, average='macro')

In [35]:
fix_seeds(SEED)
model = AutoEncoder(n_features).to(DEVICE)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

model.train()
train_loss = []
best_score = 0
best_model = None

for epoch in range(EPOCHS):
    for X in train_loader:
        X = X.float().to(DEVICE)
        optimizer.zero_grad()
        X_out = model(X)
        loss = criterion(X, X_out)
        
        train_loss.append(loss.item())
        loss.backward()
        optimizer.step()

        cur_score = evaluate(model, DEVICE, valid_loader, criterion)
        #scheduler.step(cur_score)
        if cur_score > best_score:
            best_score = cur_score
            best_model = model

    if epoch % 30 == 0:
        print(epoch, train_loss[-1], cur_score)

0 0.4516807794570923 0.004386628052415075
30 0.05864838883280754 0.5305413765358314
60 0.049982521682977676 0.5488671189923648
90 0.04407784342765808 0.8202665410912253
120 0.03874872624874115 0.9165787375726882
150 0.03371671959757805 0.9165787375726882
180 0.03146582841873169 0.9165787375726882
210 0.032357390969991684 0.9165787375726882
240 0.0283660888671875 0.9165787375726882
270 0.03306693956255913 0.9165787375726882


In [36]:
print(best_model)

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=30, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ELU(alpha=1.0)
    (3): Linear(in_features=64, out_features=128, bias=True)
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ELU(alpha=1.0)
    (6): Linear(in_features=128, out_features=256, bias=True)
    (7): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ELU(alpha=1.0)
  )
  (decoder): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ELU(alpha=1.0)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ELU(alpha=1.0)
    (6): Linear(in_features=64, out_features=30, bias=True)

In [37]:
print(evaluate(best_model, DEVICE, valid_loader, criterion))

0.9165787375726882


In [38]:
df_test = pd.read_csv(f'{ROOT_DIR}/test.csv')
df_submit = pd.read_csv(f'{ROOT_DIR}/sample_submission.csv')

df_test.head()

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,AAAA0x1,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.783274,-0.994983
1,AAAA0x2,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.269825,-0.994983
2,AAAA0x5,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.670579,-0.99496
3,AAAA0x7,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,-0.237686,-0.994937
4,AAAA0xc,0.384978,0.616109,-0.8743,-0.094019,2.924584,3.317027,0.470455,0.538247,-0.558895,...,0.049924,0.238422,0.00913,0.99671,-0.767315,-0.492208,0.042472,-0.054337,-0.167819,-0.994866


In [39]:
df_test.drop(columns=['ID'], inplace=True)
df_test.head(2)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.783274,-0.994983
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.269825,-0.994983


In [46]:
test_dataset = CreditCardDataset(df=df_test, mode='test')
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [47]:
results = []
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

model.eval()
with torch.no_grad():
    for X in test_loader:
        X = X.float().to(DEVICE)
        X_out = model(X)
        sim = cos(X, X_out).cpu().tolist()
        pred = np.where(np.array(sim) < 0.95, 1, 0).tolist()
        results += pred

print(len(results))
print(df_test.shape)

142503
(142503, 30)


In [48]:
df_submit['Class'] = results
df_submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [49]:
df_submit.to_csv(f'{ROOT_DIR}/submit_0730.csv', index=False)