In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model,load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

In [2]:
def auc(y_true, y_pred):
    def fallback_auc(y_true, y_pred):
        try:
#             return metrics.roc_auc_score(y_true, y_pred)
            return metrics.f1_score(y_true, y_pred)
        except:
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)
#     return metrics.roc_auc_score(y_true, y_pred)

In [3]:
def create_model(data, catcols):
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values+1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
        
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    y = layers.Dense(4, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=y)
    
    return model
    
    

In [None]:
# experiment
model = create_model(data, features)
model

In [None]:
model.summary()

In [4]:
data_dir = '/home/anubhav/Downloads/Kaggle_Runs/HackerEarth_Airplane/'

In [5]:
pd.set_option('display.max_columns', 30)

In [6]:
train = pd.read_csv(f'{data_dir}train.csv')
test = pd.read_csv(f'{data_dir}test.csv')
sample = pd.read_csv(f'{data_dir}sample_submission.csv')

In [7]:
train.Severity.unique()

array(['Minor_Damage_And_Injuries', 'Significant_Damage_And_Fatalities',
       'Significant_Damage_And_Serious_Injuries',
       'Highly_Fatal_And_Damaging'], dtype=object)

In [8]:
train.Severity.replace(to_replace = ['Minor_Damage_And_Injuries', 'Significant_Damage_And_Fatalities',
                                     'Significant_Damage_And_Serious_Injuries','Highly_Fatal_And_Damaging'],
                       value = [0, 1, 2, 3], inplace = True)

# test.Severity.replace(to_replace = ['Minor_Damage_And_Injuries', 'Significant_Damage_And_Fatalities',
#                                      'Significant_Damage_And_Serious_Injuries','Highly_Fatal_And_Damaging'],
#                        value = [0, 1, 2, 3], inplace = True)

## A Bit of EDA

In [None]:
train.head()

In [None]:
train['Severity'].unique()

In [None]:
len(train)

In [None]:
train['Total_Safety_Complaints'].nunique()

In [None]:
len(sample) == len(test)

## Back to training loop

In [9]:
test.head()

Unnamed: 0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
0,19.497717,16,6,72.151322,0.388959,78.32,4,37949.724386,2,0.069692,1
1,58.173516,15,3,64.585232,0.250841,78.6,7,30194.805567,2,0.002777,10
2,33.287671,15,3,64.721969,0.336669,86.96,6,17572.925484,1,0.004316,14
3,3.287671,21,5,66.362808,0.421775,80.86,3,40209.186341,2,0.19999,17
4,10.86758,18,2,56.107566,0.313228,79.22,2,35495.525408,2,0.483696,21


In [10]:
test['Severity'] = -1
data = pd.concat([train, test]).reset_index(drop=True)

# features = [x for x in train.columns if x not in ['target']]
features = ['Days_Since_Inspection', 'Total_Safety_Complaints', 'Accident_Type_Code', 'Violations']
for feat in features:
    label_enc = preprocessing.LabelEncoder()
    data[feat] = label_enc.fit_transform(data[feat].fillna('-1').
                                         astype(str).values)

In [11]:
data

Unnamed: 0,Accident_ID,Accident_Type_Code,Adverse_Weather_Metric,Cabin_Temperature,Control_Metric,Days_Since_Inspection,Max_Elevation,Safety_Score,Severity,Total_Safety_Complaints,Turbulence_In_gforces,Violations
0,7570,1,0.424352,78.04,71.285324,5,31335.476824,49.223744,0,15,0.272118,3
1,12128,1,0.352350,84.54,72.288058,1,26024.711057,62.465753,0,20,0.423939,2
2,2181,6,0.003364,78.86,66.362808,4,39269.053927,63.059361,1,8,0.322604,3
3,5946,2,0.211728,81.79,74.703737,2,42771.499200,48.082192,2,43,0.337029,1
4,9054,2,0.176883,77.16,47.948952,4,35509.228515,26.484018,1,18,0.541140,2
5,10947,1,0.394505,78.63,73.336372,6,29288.448105,43.333333,0,0,0.432954,1
6,4717,3,0.058872,88.04,62.853236,6,32342.109345,22.009132,3,2,0.314671,0
7,8008,3,0.040419,83.48,60.802188,9,22547.195242,24.703196,0,34,0.548353,2
8,8179,1,0.619186,78.24,69.234275,18,45751.983413,54.840183,3,35,0.335587,0
9,6324,0,1.389295,79.56,57.429353,19,37933.399001,52.237443,2,34,0.465410,1


In [12]:
features

['Days_Since_Inspection',
 'Total_Safety_Complaints',
 'Accident_Type_Code',
 'Violations']

In [13]:
len(train.columns)

12

In [14]:
len(test.columns)

12

In [15]:
len(data.columns)

12

In [16]:
len(train), len(test), len(data)

(10000, 2500, 12500)

In [17]:
train = data[data.Severity != -1].reset_index(drop=True)
test = data[data.Severity == -1].reset_index(drop=True)

test_data = [test.loc[:, features].values[:, k] for k in 
             range(test.loc[:, features].values.shape[1])]

In [18]:
# test.loc[:, features]

In [19]:
((test.loc[:, features]).values[:, 0])

array([7, 6, 6, ..., 5, 4, 2])

In [20]:
(test.loc[:, features].values.shape)

(2500, 4)

In [21]:
# (train.target.values)

In [22]:
features

['Days_Since_Inspection',
 'Total_Safety_Complaints',
 'Accident_Type_Code',
 'Violations']

In [23]:
train

Unnamed: 0,Accident_ID,Accident_Type_Code,Adverse_Weather_Metric,Cabin_Temperature,Control_Metric,Days_Since_Inspection,Max_Elevation,Safety_Score,Severity,Total_Safety_Complaints,Turbulence_In_gforces,Violations
0,7570,1,0.424352,78.04,71.285324,5,31335.476824,49.223744,0,15,0.272118,3
1,12128,1,0.352350,84.54,72.288058,1,26024.711057,62.465753,0,20,0.423939,2
2,2181,6,0.003364,78.86,66.362808,4,39269.053927,63.059361,1,8,0.322604,3
3,5946,2,0.211728,81.79,74.703737,2,42771.499200,48.082192,2,43,0.337029,1
4,9054,2,0.176883,77.16,47.948952,4,35509.228515,26.484018,1,18,0.541140,2
5,10947,1,0.394505,78.63,73.336372,6,29288.448105,43.333333,0,0,0.432954,1
6,4717,3,0.058872,88.04,62.853236,6,32342.109345,22.009132,3,2,0.314671,0
7,8008,3,0.040419,83.48,60.802188,9,22547.195242,24.703196,0,34,0.548353,2
8,8179,1,0.619186,78.24,69.234275,18,45751.983413,54.840183,3,35,0.335587,0
9,6324,0,1.389295,79.56,57.429353,19,37933.399001,52.237443,2,34,0.465410,1


In [24]:
test.index.values

array([   0,    1,    2, ..., 2497, 2498, 2499])

In [25]:
train.loc[:, features]#.values[:, 0]

Unnamed: 0,Days_Since_Inspection,Total_Safety_Complaints,Accident_Type_Code,Violations
0,5,15,1,3
1,1,20,1,2
2,4,8,6,3
3,2,43,2,1
4,4,18,2,2
5,6,0,1,1
6,6,2,3,0
7,9,34,3,2
8,18,35,1,0
9,19,34,0,1


In [26]:
cols = ['Accident_Type_Code', 'Adverse_Weather_Metric',
       'Cabin_Temperature', 'Control_Metric', 'Days_Since_Inspection',
       'Max_Elevation', 'Safety_Score', 'Severity', 'Total_Safety_Complaints',
       'Turbulence_In_gforces', 'Violations']

In [None]:
cols, len(cols)

In [29]:
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

skf = StratifiedKFold(n_splits=5)
count = 0
for train_index, test_index in skf.split(train, train.Severity.values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train, y_test = X_train.Severity.values, X_test.Severity.values
    
    model = create_model(data, features)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics = [auc])
    
    X_train = [X_train.loc[:, features].values[:, k] for k in range(X_train.loc[:, features].values.shape[1])]
    
    X_test = [X_test.loc[:, features].values[:, k] for k in range(X_test.loc[:, features].values.shape[1])]
    
    
    es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001,
                                 patience=5, verbose=1, mode='max',
                                baseline=None, restore_best_weights=True)
    rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                    patience=3, min_lr=1e-06, mode='max',
                                    verbose=1)
    
    model.fit(X_train, utils.to_categorical(y_train, num_classes=4), 
             validation_data=(X_test, utils.to_categorical(y_test, num_classes=4)),
             verbose=1, batch_size=1024, callbacks=[es, rlr],
             epochs = 20)
    
    vaild_fold_preds = model.predict(X_test)[:, 1]
    test_fold_preds = model.predict(test_data)[:, 1]
    print(model.predict(test_data), len(model.predict(test_data)))
    
    oof_preds[test_index] = vaild_fold_preds.ravel()
    
    test_preds += test_fold_preds.ravel()
    
#     print(metrics.roc_auc_score(y_test, vaild_fold_preds))
    K.clear_session()
#     count +=1
#     if count==3:
#         break

Train on 7999 samples, validate on 2001 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping
[[0.25566018 0.22219132 0.25140297 0.27074552]
 [0.2546141  0.22592756 0.25912237 0.260336  ]
 [0.25654906 0.22128236 0.25878006 0.26338857]
 ...
 [0.25504372 0.22373827 0.25664237 0.2645757 ]
 [0.2533656  0.22583291 0.25462347 0.26617804]
 [0.2553205  0.22166488 0.25469494 0.26831973]] 2500
Train on 7999 samples, validate on 2001 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping
[[0.25337282 0.2261414  0.25836167 0.26212415]
 [0.25631928 0.22742812 0.25895387 0.25729877]
 [0.2573198  0.22584607 0.25951716 0.25731698]
 ...
 [0.2550468  0.22605334 0.25965673 0.25924313]
 [0.25463423 0.22632946 0.26120803 0.2578283 ]
 [0.25620386 0.22665294

In [None]:
print('Overall AUC = {}'.format(metrics.f1_score(train.Severity.values,
                                                     oof_preds)))

In [None]:
test_preds/5

In [None]:
test_preds /= 5
test_ids = test.Accident_ID.values
print("saving submisssion file")

submission = pd.DataFrame.from_dict({'Accident_ID': test_ids,
                                    'Severity': test_preds})

submission.to_csv('submission.csv', index=False)