In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import pandas as pd
from collections import Counter
import seaborn as sns
import numpy 
import scipy

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer,f1_score
from sklearn.linear_model import LogisticRegression
from sys import getsizeof

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

from sklearn.model_selection import train_test_split
from rfpimp import *

scorer = make_scorer(f1_score, needs_proba=False,
                     greater_is_better=True,average='micro')

In [3]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F


In [4]:
df = pd.read_csv('../input/train_values.csv')
train_target = pd.read_csv('../input/train_labels.csv')

print(Counter(train_target.damage_grade))

y = train_target.damage_grade

Counter({2: 148259, 3: 87218, 1: 25124})


In [5]:
def create_X(df):
    X = df.copy()
    
#     X['disc'] = X['ground_floor_type'].astype(str) + X['other_floor_type'].astype(str)  # new feature
    
    cat_cols = X.columns[X.dtypes == 'object']
    id_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
    cat_cols = id_cols + cat_cols.tolist()
    num_cols = [i for i in X.columns if i not in cat_cols]
    
    x1 = X[cat_cols].apply(LabelEncoder().fit_transform)
    
    X = pd.concat([X[num_cols], x1], axis=1)
    
    return X, num_cols, cat_cols



In [6]:


class TabularDataset(Dataset):
    
    def __init__(self, data, cat_cols=None, output_col=None):
        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y =  np.zeros((self.n, 1))

        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns
                          if col not in self.cat_cols + [output_col]]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))

        if self.cat_cols:
            self.cat_X = data[cat_cols].astype(np.int64).values
        else:
            self.cat_X =  np.zeros((self.n, 1))

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [7]:


class FeedForwardNN(nn.Module):

  def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
               output_size, emb_dropout, lin_layer_dropouts):

    """
    Parameters
    ----------

    emb_dims: List of two element tuples
      This list will contain a two element tuple for each
      categorical feature. The first element of a tuple will
      denote the number of unique values of the categorical
      feature. The second element will denote the embedding
      dimension to be used for that feature.

    no_of_cont: Integer
      The number of continuous features in the data.

    lin_layer_sizes: List of integers.
      The size of each linear layer. The length will be equal
      to the total number
      of linear layers in the network.

    output_size: Integer
      The size of the final output.

    emb_dropout: Float
      The dropout to be used after the embedding layers.

    lin_layer_dropouts: List of floats
      The dropouts to be used after each linear layer.
    """

    super().__init__()

    # Embedding layers
    self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                     for x, y in emb_dims])

    no_of_embs = sum([y for x, y in emb_dims])
    self.no_of_embs = no_of_embs
    self.no_of_cont = no_of_cont

    # Linear Layers
    first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                lin_layer_sizes[0])

    self.lin_layers =\
     nn.ModuleList([first_lin_layer] +\
          [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
           for i in range(len(lin_layer_sizes) - 1)])
    
    for lin_layer in self.lin_layers:
      nn.init.kaiming_normal_(lin_layer.weight.data)

    # Output Layer
    self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                  output_size)
    nn.init.kaiming_normal_(self.output_layer.weight.data)

    # Batch Norm Layers
    self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
    self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                    for size in lin_layer_sizes])

    # Dropout Layers
    self.emb_dropout_layer = nn.Dropout(emb_dropout)
    self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                  for size in lin_layer_dropouts])

  def forward(self, cont_data, cat_data):

    if self.no_of_embs != 0:
      x = [emb_layer(cat_data[:, i])
           for i,emb_layer in enumerate(self.emb_layers)]
      x = torch.cat(x, 1)
      x = self.emb_dropout_layer(x)

    if self.no_of_cont != 0:
      normalized_cont_data = self.first_bn_layer(cont_data)

      if self.no_of_embs != 0:
        x = torch.cat([x, normalized_cont_data], 1) 
      else:
        x = normalized_cont_data

    for lin_layer, dropout_layer, bn_layer in\
        zip(self.lin_layers, self.droput_layers, self.bn_layers):
      
      x = F.relu(lin_layer(x))
      x = bn_layer(x)
      x = dropout_layer(x)

    x = self.output_layer(x)

    return x

In [13]:
X, num_cols, cat_cols  = create_X(df)


X = pd.merge(X,train_target, on='building_id')
print(X.shape, train_target.shape)

X['damage_grade'] = X['damage_grade'] - 1

# X.drop(columns=['building_id'], axis=1, inplace=True)

cat_dims = [int(X[col].nunique()) for col in cat_cols]
print(cat_dims)
# [15, 5, 2, 4, 112]
emb_dims = [(x, min(128, (x + 1) // 2)) for x in cat_dims]
print(emb_dims)
# [(15, 8), (5, 3), (2, 1), (4, 2), (112, 50)]

(260601, 40) (260601, 2)
[31, 1414, 11595, 3, 5, 3, 5, 4, 4, 10, 4]
[(31, 16), (1414, 128), (11595, 128), (3, 2), (5, 3), (3, 2), (5, 3), (4, 2), (4, 2), (10, 5), (4, 2)]


In [9]:
from sklearn.preprocessing import StandardScaler

In [14]:
X[num_cols] = StandardScaler().fit_transform(X[num_cols])

In [19]:
for i in num_cols:
    X[i+'_sq'] = X[i]**2

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = 'cpu'
                      

In [26]:
## no of count
X.shape[1]-len(cat_dims)

57

In [27]:
model = FeedForwardNN(emb_dims, no_of_cont=57, lin_layer_sizes=[100, 50],
                          output_size=3, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)

In [12]:
## tying up everything

In [28]:
def score_tensors(y, preds):
    preds = preds.argmax(1)
    preds = preds.reshape(-1)
    y = y.reshape(-1)

    preds = preds.cpu().numpy()
    y = y.cpu().numpy()
    
    score = f1_score(y, preds, average='micro')
    
    return score
 

In [14]:
def f1_micro(true:torch.Tensor, predicted:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Calculate F1 score. Can work with gpu tensors
    
    The original implmentation is written by Michal Haltuf on Kaggle.
    
    Returns
    -------
    torch.Tensor
        `ndim` == 1. 0 <= val <= 1
    
    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    
    '''
    assert true.ndim == 1
    assert predicted.ndim == 1 or predicted.ndim == 2
    
    if predicted.ndim == 2:
        predicted = predicted.argmax(dim=1)
    
    tp_all = []
    tn_all = []
    fp_all = []
    fn_all = []
    
    for c in true.unique():
        y_true = true.clone()
        y_pred = predicted.clone()
        
        b1 = y_true==c
        b0 = y_true!=c
        y_true[b1] = 1
        y_true[b0] = 0
        
        b1 = y_pred==c
        b0 = y_pred!=c
        y_pred[b1] = 1
        y_pred[b0] = 0
        
        
        tp = (y_true * y_pred).sum().to(torch.float32)
        tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
        fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
        fn = (y_true * (1 - y_pred)).sum().to(torch.float32)
        
        tp_all.append(tp); tn_all.append(tn)
        fp_all.append(fp); fn_all.append(fn)
    
    tp_all = torch.stack(tp_all).sum() ; tn_all = torch.stack(tn_all).sum()
    fp_all = torch.stack(fp_all).sum() ; fn_all = torch.stack(fn_all).sum()
    epsilon = 1e-7

    precision = tp_all / (tp_all + fp_all + epsilon)
    recall = tp_all / (tp_all + fn_all + epsilon)

    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1.requires_grad = is_training
    return -f1

In [15]:
def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Calculate F1 score. Can work with gpu tensors
    
    The original implmentation is written by Michal Haltuf on Kaggle.
    
    Returns
    -------
    torch.Tensor
        `ndim` == 1. 0 <= val <= 1
    
    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    
    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2
    
    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)
        
    
    tp = (y_true * y_pred).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum().to(torch.float32)
    
    epsilon = 1e-7
    
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    
    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1.requires_grad = is_training
    return f1

In [29]:
from torch.utils.data.sampler import SubsetRandomSampler


In [30]:
train_indices, val_indices = train_test_split(X.index.values, train_size=0.8, random_state=99)

In [31]:
train_dataset = TabularDataset(data=X.iloc[train_indices], cat_cols=cat_cols,
                             output_col='damage_grade')
val_dataset = TabularDataset(data=X.iloc[val_indices], cat_cols=cat_cols,
                             output_col='damage_grade')

In [32]:
batchsize = 64
train_dataloader = DataLoader(train_dataset, batchsize, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batchsize, shuffle=True, num_workers=4)

In [33]:
len(train_dataloader), len(val_dataloader)

(3258, 815)

In [34]:
ids, _ = train_test_split(X.index.values, train_size=0.001, random_state=99)
baby_dataset = TabularDataset(data=X.iloc[ids], cat_cols=cat_cols,
                             output_col='damage_grade')
baby_dataloader = DataLoader(baby_dataset, batchsize, shuffle=True, num_workers=4)


In [35]:
def validate(model, dataloader):
    
    predictions = []
    actuals = []
    with torch.no_grad():
        for y, cont_x, cat_x in dataloader:
#             print(cat_x.device)
            cat_x = cat_x.to(device)
            cont_x = cont_x.to(device)
            y  = y.to(device)
            batch_preds = model(cont_x, cat_x)

            predictions.append(batch_preds)
            actuals.append(y)
        
    predictions = torch.cat(predictions, 0)
    actuals = torch.cat(actuals, 0)
    
    return score_tensors(actuals, predictions)
        
        

In [36]:
for y, cont_x, cat_x in train_dataloader:
    cat_x = cat_x.to(device)
    cont_x = cont_x.to(device)
    y  = y.to(device)
    break


In [24]:
preds = model(cont_x, cat_x)

In [39]:
model = FeedForwardNN(emb_dims, no_of_cont=56, lin_layer_sizes=[100, 50],
                          output_size=3, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)

In [40]:
no_of_epochs = 50
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in baby_dataloader:

        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
#         print(y.ndim,y.shape
        loss = criterion(target=y.long().reshape(-1), input=preds)
        
        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    val_score = validate(model, baby_dataloader) 
    print(f"epoch = {epoch} f1-score = {val_score}")
    

epoch = 0 f1-score = 0.5807692307692308
epoch = 1 f1-score = 0.7961538461538461
epoch = 2 f1-score = 0.9423076923076923
epoch = 3 f1-score = 0.9615384615384616
epoch = 4 f1-score = 0.9423076923076923
epoch = 5 f1-score = 0.9884615384615385
epoch = 6 f1-score = 0.9884615384615385
epoch = 7 f1-score = 0.9846153846153847
epoch = 8 f1-score = 0.9923076923076923
epoch = 9 f1-score = 0.9846153846153847
epoch = 10 f1-score = 0.9653846153846154
epoch = 11 f1-score = 0.9576923076923077
epoch = 12 f1-score = 0.9653846153846154


KeyboardInterrupt: 

In [2]:
batchsize = 1024
train_dataloader = DataLoader(train_dataset, batchsize, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batchsize, shuffle=True, num_workers=4)

NameError: name 'DataLoader' is not defined

[(31, 31), (1414, 64), (11595, 64), (3, 3), (5, 5), (3, 3), (5, 5), (4, 4), (4, 4), (10, 10), (4, 4)]


In [196]:
X.damage_grade.value_counts(normalize=True)

1    0.568912
2    0.334680
0    0.096408
Name: damage_grade, dtype: float64

In [207]:
weights_tensor = torch.Tensor([0.1,0.55,0.34])

In [208]:
weights_tensor = weights_tensor.to(device)

In [209]:
weights_tensor = 1/weights_tensor

In [1]:
weights_tensor = weights_tensor.double()

NameError: name 'weights_tensor' is not defined

In [43]:
no_of_epochs = 30
criterion = nn.CrossEntropyLoss(weight=weights_tensor,)
optimizer = torch.optim.Adam(model.parameters(), lr= 0.03)

NameError: name 'weights_tensor' is not defined

In [234]:
emb_dims = [(x, min(512, 2*x)) for x in cat_dims]
print(emb_dims)

[(31, 62), (1414, 512), (11595, 512), (3, 6), (5, 10), (3, 6), (5, 10), (4, 8), (4, 8), (10, 20), (4, 8)]


In [41]:
model = FeedForwardNN(emb_dims, no_of_cont=56, lin_layer_sizes=[100, 50],
                          output_size=3, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)

In [1]:
for epoch in range(30):
    train_loss = 0
    for y, cont_x, cat_x in train_dataloader:

        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(target=y.long().reshape(-1), input=preds)
        train_loss += loss.item()
        
        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    val_score = validate(model, val_dataloader) 
    print(f"epoch = {epoch} f1-score = {val_score}")
    print(f"train loss = {train_loss}")
    

NameError: name 'train_dataloader' is not defined

In [None]:
no_of_epochs = 30
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in train_dataloader:

        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y.long().reshape(-1))
        
        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    val_score = validate(model, val_dataloader) 
    print(f"epoch = {epoch} f1-score = {val_score}")
    

In [None]:
batchsize = 2048
train_dataloader = DataLoader(train_dataset, batchsize, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batchsize, shuffle=True, num_workers=4)

In [None]:
model = FeedForwardNN(emb_dims, no_of_cont=27, lin_layer_sizes=[100, 50],
                          output_size=3, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)

In [None]:
no_of_epochs = 50
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in train_dataloader:

        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y.long().reshape(-1))
        
        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    val_score = validate(model, val_dataloader) 
    print(f"epoch = {epoch} f1-score = {val_score}")
    

In [None]:
model = FeedForwardNN(emb_dims, no_of_cont=27, lin_layer_sizes=[100, 50],
                          output_size=3, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)

In [None]:
no_of_epochs = 50
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in train_dataloader:

        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y.long().reshape(-1))
        
        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    val_score = validate(model, val_dataloader) 
    print(f"epoch = {epoch} f1-score = {val_score}")
    

In [None]:
model = FeedForwardNN(emb_dims, no_of_cont=27, lin_layer_sizes=[100, 50],
                          output_size=3, emb_dropout=0.04,
                          lin_layer_dropouts=[0.1,0.1]).to(device)

In [86]:
batchsize = 1024
train_dataloader = DataLoader(train_dataset, batchsize, shuffle=True, num_workers=1)
val_dataloader = DataLoader(val_dataset, batchsize, shuffle=True, num_workers=1)

In [87]:
no_of_epochs = 50
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for epoch in range(no_of_epochs):
    for y, cont_x, cat_x in train_dataloader:

        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y  = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y.long().reshape(-1))
        
        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    val_score = validate(model, val_dataloader) 
    print(f"epoch = {epoch} f1-score = {val_score}")
    

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fd24b82d3b0>
Traceback (most recent call last):
  File "/data/nithish/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 926, in __del__
    self._shutdown_workers()
  File "/data/nithish/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 906, in _shutdown_workers
    w.join()
  File "/data/nithish/anaconda3/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fd24b82d3b0>
Traceback (most recent call last):
  File "/data/nithish/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 926, in __del__
    self._shutdown_workers()
  File "/data/nithish/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 906, in _shutd

KeyboardInterrupt: 