In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline


from glob import glob
import os 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

import torch
from torch import nn, optim
import timm

import random

#from PIL import image

import albumentations as A
from albumentations.pytorch import ToTensorV2

from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm 

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
train = pd.read_csv('abalone/train.csv')
test = pd.read_csv('abalone/test.csv')
submission = pd.read_csv('abalone/sample_submission.csv')

In [3]:
train

Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.470,0.115,1.1140,0.3925,0.2910,0.3100,15
1,2,I,0.430,0.315,0.095,0.3780,0.1750,0.0800,0.1045,8
2,3,I,0.580,0.490,0.195,1.3165,0.5305,0.2540,0.4100,18
3,4,M,0.535,0.405,0.175,1.2705,0.5480,0.3265,0.3370,13
4,5,I,0.310,0.235,0.090,0.1270,0.0480,0.0310,0.0400,6
...,...,...,...,...,...,...,...,...,...,...
1248,1249,I,0.190,0.145,0.040,0.0380,0.0165,0.0065,0.0150,4
1249,1250,I,0.395,0.310,0.085,0.3170,0.1530,0.0505,0.0935,7
1250,1251,F,0.525,0.410,0.115,0.7745,0.4160,0.1630,0.1800,7
1251,1252,F,0.445,0.335,0.110,0.4355,0.2025,0.1095,0.1195,6


In [4]:
def data_preprocess(df=None, test=None) :
    df = df.copy()
    
  
    # 부위별 무게 / 전체무게
    df['Shell Percent'] = df['Shell Weight']/df['Whole Weight']
    df['Viscra Percent'] = df['Viscra Weight']/df['Whole Weight']
    df['Shucked Percent'] = df['Shucked Weight']/df['Whole Weight']
    
    # (전체무게 - 각 부위무게) / 전체무게
    df['differ_weight'] = abs(df['Shell Weight']+df['Shucked Weight']+df['Viscra Weight']-df['Whole Weight'])/df['Whole Weight']
    
    # 길이 비율
    #df['ratio_1'] = df['Height']/df['Diameter']
    #df['ratio_2'] = df['Lenght']/df['Diameter']
    #df['ratio_3'] = df['Lenght']/df['Height']
    
    # 무게 비율
    df['weight ratio_9'] =  (df['Viscra Weight']+df['Shucked Weight'])/df['Whole Weight']
    
    df = pd.get_dummies(df, columns=['Gender'])
    
    if test == True :         
        df.drop(['id'], axis=1, inplace=True)
        return df
    
    else :
        df = df.drop([382,435,762,1078], axis=0).reset_index()
        df.drop(['id', 'index'], axis=1, inplace=True)
        
        
    
    
        return df

In [5]:
df_train = data_preprocess(train)
df_test = data_preprocess(test, True)

In [6]:
df_train

Unnamed: 0,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target,Shell Percent,Viscra Percent,Shucked Percent,differ_weight,weight ratio_9,Gender_F,Gender_I,Gender_M
0,0.605,0.470,0.115,1.1140,0.3925,0.2910,0.3100,15,0.278276,0.261221,0.352334,0.108169,0.613555,0,0,1
1,0.430,0.315,0.095,0.3780,0.1750,0.0800,0.1045,8,0.276455,0.211640,0.462963,0.048942,0.674603,0,1,0
2,0.580,0.490,0.195,1.3165,0.5305,0.2540,0.4100,18,0.311432,0.192936,0.402962,0.092670,0.595898,0,1,0
3,0.535,0.405,0.175,1.2705,0.5480,0.3265,0.3370,13,0.265250,0.256985,0.431326,0.046438,0.688312,0,0,1
4,0.310,0.235,0.090,0.1270,0.0480,0.0310,0.0400,6,0.314961,0.244094,0.377953,0.062992,0.622047,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1244,0.190,0.145,0.040,0.0380,0.0165,0.0065,0.0150,4,0.394737,0.171053,0.434211,0.000000,0.605263,0,1,0
1245,0.395,0.310,0.085,0.3170,0.1530,0.0505,0.0935,7,0.294953,0.159306,0.482650,0.063091,0.641956,0,1,0
1246,0.525,0.410,0.115,0.7745,0.4160,0.1630,0.1800,7,0.232408,0.210458,0.537121,0.020013,0.747579,1,0,0
1247,0.445,0.335,0.110,0.4355,0.2025,0.1095,0.1195,6,0.274397,0.251435,0.464983,0.009185,0.716418,1,0,0


In [7]:
len(df_test.columns)

15

# seed, loss func

In [8]:
def seed_everything(seed) :
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    

def NMAE(true, pred) : 
    mae = np.mean(np.abs(true-pred))
    score = mae/np.mean(np.abs(true))
    return score

SEED = 42
seed_everything(SEED)

# Global Variable

In [9]:
IMAGE_SIZE = 128
TEST_SIZE = 0.2
N_EPOCH = 1000
LR = 1e-3
BATCH_SIZE = 64
MODEL_NAME = 'densenet121'
NUM_FEATURES = len(df_train.drop('Target', 1).columns)

# Dataset

In [10]:
class train_dataset(Dataset) :
    def __init__(self, train_df, normalize=None) :
        super(train_dataset).__init__()
        self.x = train_df.drop('Target', 1)
        self.y = train_df['Target']
        
        if normalize == True : 
            scaler = StandardScaler()
            self.x = pd.DataFrame(scaler.fit_transform(self.x))
            
        self.x = torch.tensor(self.x.values).float()
        self.y = torch.tensor(self.y).float()
        
    def __len__(self) : 
        return len(self.x)
    
    def __getitem__(self, idx) : 
        x = self.x[idx]
        y = self.y[idx]
        
        return x, y 

class test_dataset(Dataset) :
    def __init__(self, test_df, normalize=None) :
        super(test_dataset).__init__()
        self.x = test_df
        
        if normalize == True : 
            scaler = StandardScaler()
            self.x = pd.DataFrame(scaler.fit_transform(self.x))
            
        self.x = torch.tensor(self.x.values).float()
                
    def __len__(self) : 
        return len(self.x)
    
    def __getitem__(self, idx) : 
        x = self.x[idx]
                
        return x

In [11]:
train_dataset = train_dataset(df_train)
test_dataset = test_dataset(df_test)

# Dataloader, transform

In [12]:
len(df_train)

1249

In [13]:
len(df_test)

2924

In [14]:
# abalone
train_loader = DataLoader(train_dataset, batch_size =BATCH_SIZE, shuffle=True)
test_loader =  DataLoader(test_dataset, batch_size =BATCH_SIZE, shuffle=True)

In [15]:
x = next(iter(test_loader))
x.shape

torch.Size([64, 15])

# Model

In [16]:
# 직접 model 구성

import torch.nn.functional as F

class Net(nn.Module) : 
    def __init__(self, num_features) : 
        super(Net, self).__init__()
        self.fc1 = nn.Linear(num_features, 32)
        self.fc2 = nn.Linear(32, 8)
        self.fc3 = nn.Linear(8, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x) : 
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        x = F.relu(self.fc3(x))
        
        
        return x 

In [17]:
model = Net(NUM_FEATURES).to(device)
model

Net(
  (fc1): Linear(in_features=15, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [18]:
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = LR)

# Train/Validation

In [None]:
timm.list_models()

**Train**

In [22]:
losses = [] 
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'max', patience = 5, min_lr = 2e-4)

for epoch in range(N_EPOCH) : 
    avg_cost = 0
    
    for data, target in train_loader : 
        data = data.to(device)
        target = target.to(device)
        # target = target[..., None].to(device)
        optimizer.zero_grad()
        hypothesis = model(data) # 모델을 forward pass로 결과 저장 
        cost = criterion(hypothesis, target) # output과 target loss계산
        cost.backward() # backward해서 gradient계산
        optimizer.step() # 모델 파라미터 갱신 
        avg_cost += cost / len(train_loader)  # 평균 loss 계산 
    
    losses.append(avg_cost)
    print(f'[Epoch: {epoch+1:>4}] cost = {avg_cost:>.9}')
    
    scheduler.step(avg_cost)


[Epoch:    1] cost = 108.296806
[Epoch:    2] cost = 108.112595
[Epoch:    3] cost = 108.289467
[Epoch:    4] cost = 107.912224
[Epoch:    5] cost = 108.16983
[Epoch:    6] cost = 107.891693
[Epoch:    7] cost = 108.441383
[Epoch:    8] cost = 108.114052
[Epoch:    9] cost = 108.303406
[Epoch:   10] cost = 108.673302
[Epoch:   11] cost = 108.306351
[Epoch:   12] cost = 107.568771
[Epoch:   13] cost = 108.001045
[Epoch:   14] cost = 108.746689
[Epoch:   15] cost = 107.968018
[Epoch:   16] cost = 108.177917
[Epoch:   17] cost = 107.833717
[Epoch:   18] cost = 107.802887
[Epoch:   19] cost = 108.472206
[Epoch:   20] cost = 107.619415
[Epoch:   21] cost = 107.667854
[Epoch:   22] cost = 108.073692
[Epoch:   23] cost = 107.760315
[Epoch:   24] cost = 107.703079
[Epoch:   25] cost = 108.053879
[Epoch:   26] cost = 107.86161
[Epoch:   27] cost = 108.737885
[Epoch:   28] cost = 107.90123
[Epoch:   29] cost = 108.505974
[Epoch:   30] cost = 108.268188
[Epoch:   31] cost = 108.427437
[Epoch:   3

[Epoch:  258] cost = 107.921036
[Epoch:  259] cost = 108.370926
[Epoch:  260] cost = 108.188927
[Epoch:  261] cost = 108.249817
[Epoch:  262] cost = 108.462662
[Epoch:  263] cost = 107.931328
[Epoch:  264] cost = 107.957001
[Epoch:  265] cost = 108.03186
[Epoch:  266] cost = 108.373871
[Epoch:  267] cost = 107.836647
[Epoch:  268] cost = 108.248367
[Epoch:  269] cost = 108.120667
[Epoch:  270] cost = 108.023796
[Epoch:  271] cost = 108.626328
[Epoch:  272] cost = 108.180847
[Epoch:  273] cost = 108.012779
[Epoch:  274] cost = 107.69574
[Epoch:  275] cost = 108.500832
[Epoch:  276] cost = 108.307823
[Epoch:  277] cost = 108.104523
[Epoch:  278] cost = 108.475876
[Epoch:  279] cost = 108.249832
[Epoch:  280] cost = 108.161034
[Epoch:  281] cost = 107.948204
[Epoch:  282] cost = 108.263039
[Epoch:  283] cost = 107.990044
[Epoch:  284] cost = 108.399559
[Epoch:  285] cost = 108.278442
[Epoch:  286] cost = 107.987099
[Epoch:  287] cost = 108.1493
[Epoch:  288] cost = 108.636604
[Epoch:  289

[Epoch:  518] cost = 108.301949
[Epoch:  519] cost = 108.196259
[Epoch:  520] cost = 108.352577
[Epoch:  521] cost = 107.999573
[Epoch:  522] cost = 107.642166
[Epoch:  523] cost = 107.813171
[Epoch:  524] cost = 108.354782
[Epoch:  525] cost = 108.73494
[Epoch:  526] cost = 108.017181
[Epoch:  527] cost = 108.046539
[Epoch:  528] cost = 108.028191
[Epoch:  529] cost = 108.111122
[Epoch:  530] cost = 108.668167
[Epoch:  531] cost = 107.946747
[Epoch:  532] cost = 109.212715
[Epoch:  533] cost = 108.056816
[Epoch:  534] cost = 108.698257
[Epoch:  535] cost = 107.799957
[Epoch:  536] cost = 108.119194
[Epoch:  537] cost = 108.185982
[Epoch:  538] cost = 108.018661
[Epoch:  539] cost = 108.562477
[Epoch:  540] cost = 108.621201
[Epoch:  541] cost = 107.593727
[Epoch:  542] cost = 107.915916
[Epoch:  543] cost = 107.681786
[Epoch:  544] cost = 108.097191
[Epoch:  545] cost = 107.776466
[Epoch:  546] cost = 108.213135
[Epoch:  547] cost = 108.024521
[Epoch:  548] cost = 108.161041
[Epoch:  

[Epoch:  779] cost = 107.921036
[Epoch:  780] cost = 108.28138
[Epoch:  781] cost = 107.811691
[Epoch:  782] cost = 107.961411
[Epoch:  783] cost = 108.095711
[Epoch:  784] cost = 107.930588
[Epoch:  785] cost = 108.222679
[Epoch:  786] cost = 108.323959
[Epoch:  787] cost = 107.888023
[Epoch:  788] cost = 108.177185
[Epoch:  789] cost = 107.546021
[Epoch:  790] cost = 108.361382
[Epoch:  791] cost = 107.699409
[Epoch:  792] cost = 107.979012
[Epoch:  793] cost = 108.161766
[Epoch:  794] cost = 108.174248
[Epoch:  795] cost = 108.061951
[Epoch:  796] cost = 108.345238
[Epoch:  797] cost = 108.444321
[Epoch:  798] cost = 107.639954
[Epoch:  799] cost = 107.847664
[Epoch:  800] cost = 108.668884
[Epoch:  801] cost = 108.188179
[Epoch:  802] cost = 107.896103
[Epoch:  803] cost = 108.484673
[Epoch:  804] cost = 108.310745
[Epoch:  805] cost = 108.482491
[Epoch:  806] cost = 107.868202
[Epoch:  807] cost = 107.595924
[Epoch:  808] cost = 108.280655
[Epoch:  809] cost = 108.216805
[Epoch:  

In [None]:
plt.plot()

In [None]:
    

model.eval()
with torch.no_grad() : # grad 해제 
    correct = 0
    total = 0

    for data, target in test_loader : 
        data = data.to(device)
        target = target.to(device) 
        # target = target[..., None].to(device)
        out = model(data)

        preds = torch.max(out.data, 1)[1]
        total += len(target) 
        correct += (preds==target).sum().item()

        print('TEST ACCURACY :  ', 100.*correct/total, '%')
            

**Validation**

In [None]:
model.eval()
with torch.no_grad() : # grad 해제 
    correct = 0
    total = 0
    
    for data, target in val_loader : 
        data = data.to(device)
        target = target.to(device) 
        # target = target[..., None].to(device)
        out = model(data)
        
        preds = torch.max(out.data, 1)[1]
        total += len(target) 
        correct += (preds==target).sum().item()
        
        print('TEST ACCURACY :  ', 100.*correct/total, '%')

# Inference

In [None]:
preds = []

model.eval()

with torch.no_grad() : # grad 해제 
    for data in test_loader : 
        data = data.to(device)
        pred = model(data)
        # preds = torch.max(out.data, 1)[1]  # 분류 클래스 반환
        pred = pred.cpu().numpy()
        preds.append(pred)
        
preds = np.concatenate(np.array(preds))

In [None]:
preds[0:30]