In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [20]:
def scale(x_train, x_test):
    """Standardise both frames using statistics learned from ``x_train`` only.

    A ``StandardScaler`` is fitted on the training frame and then applied to
    the training and the test frame, so the test frame never influences the
    fitted statistics.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train, x_test)`` after scaling; each keeps its own index and
        column labels.
    """
    fitted = StandardScaler().fit(x_train)

    def _rescale(frame):
        # Re-attach the frame's index and column labels to the transformed values.
        return pd.DataFrame(fitted.transform(frame),
                            index=frame.index,
                            columns=frame.columns)

    return _rescale(x_train), _rescale(x_test)

def transform_label(y):
    """Convert 5-class ``damage_grade`` strings into zero-based integer labels.

    Each value is split on ``'_'``; the second piece is parsed as an integer
    and shifted down by one, so a value ending in ``_1`` becomes ``0``.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame with a string ``damage_grade`` column.

    Returns
    -------
    numpy.ndarray
        1-D ``int64`` array with one label per row of ``y`` (empty for an
        empty frame).
    """
    # Vectorised string ops instead of a row-wise ``apply(..., axis=1)``:
    # the lambda was called once per row, which is slow on large frames.
    grade_number = y['damage_grade'].str.split('_').str[1].astype('int64')
    return (grade_number - 1).to_numpy()

def transform_3label(y):
    """Convert 3-class ``damage_grade`` strings into zero-based integer labels.

    The second character of each value is parsed as an integer and shifted
    down by one.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame with a string ``damage_grade`` column whose second character is
        a digit.

    Returns
    -------
    numpy.ndarray
        1-D ``int64`` array with one label per row of ``y`` (empty for an
        empty frame).
    """
    # Vectorised string indexing instead of a row-wise ``apply(..., axis=1)``:
    # the lambda was called once per row, which is slow on large frames.
    grade_digit = y['damage_grade'].str[1].astype('int64')
    return (grade_digit - 1).to_numpy()

def create_nn(lr, end_neurons=5, input_dim=80):
    """Build and compile a fully-connected softmax classifier.

    The network stacks ReLU ``Dense`` layers of 500, 200, 100, 50, 20 and 10
    units, followed by a softmax output layer, and is compiled with plain SGD
    and sparse categorical cross-entropy (integer class labels).

    Parameters
    ----------
    lr : float
        Learning rate for the SGD optimizer.
    end_neurons : int, default 5
        Number of units in the softmax output layer (number of classes).
    input_dim : int, default 80
        Number of input features per sample.

    Returns
    -------
    keras.Sequential
        The compiled, untrained model.
    """
    hidden_units = (500, 200, 100, 50, 20, 10)

    model = keras.Sequential()
    # Only the first layer declares the input shape.
    model.add(keras.layers.Dense(hidden_units[0], input_shape=(input_dim,), activation='relu'))
    for units in hidden_units[1:]:
        model.add(keras.layers.Dense(units, activation='relu'))
    model.add(keras.layers.Dense(end_neurons, activation='softmax'))

    # Stochastic gradient descent. ``learning_rate`` is the supported keyword;
    # the old ``lr`` alias is deprecated and rejected by newer Keras releases.
    sgd = keras.optimizers.SGD(learning_rate=lr)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

In [4]:
# Seed NumPy and TensorFlow so that weight initialisation and training are
# repeatable between kernel restarts (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Hyper-parameters shared by every experiment in this notebook.
lr = 0.05
epochs = 15
batch_size = 60

# Network for the 5-class task (create_nn defaults to 5 output units).
nn_model = create_nn(lr)

## 1. 5-class classification:

### 1.1. Original set

In [4]:
# Load the preprocessed features and the 5-class labels, both indexed by building_id.
x = pd.read_csv(
    '../Project_data/processed_data/x_post_preproc.csv',
    index_col='building_id',
)
y = pd.read_csv(
    '../Project_data/processed_data/y_post_preproc.csv',
    index_col='building_id',
)

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)

# Look up each subset's labels by index and encode them as zero-based integers.
y_train = transform_label(y.loc[x_train.index])
y_test = transform_label(y.loc[x_test.index])

In [9]:
# Standardise the features; the scaler is fitted on the training split only.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [10]:
# Train on the original training split, validating on the held-out split after every epoch.
nn_model.fit(
    x=x_train_norm,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test_norm, y_test),
)

Train on 609675 samples, validate on 152419 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x25513c8d7c8>

In [20]:
# The softmax output layer already yields per-class probabilities, so use
# predict(): Sequential.predict_proba is deprecated and absent from newer
# TensorFlow releases, and every other experiment below calls predict().
preds = nn_model.predict(x_test_norm)

In [22]:
# Persist the test-set predictions of the model trained on the original training set.
pd.DataFrame(data=preds).to_csv(path_or_buf='../Project_data/results/nn_preds_orig.csv')

### 1.2 Resampled set:

#### 1.2.1 Over-sampling with SMOTENC

In [23]:
# SMOTENC-resampled training set. The stored index column is read and then
# dropped in favour of a fresh 0..n-1 index.
x_train = (
    pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc.csv', index_col=0)
    .reset_index(drop=True)
)
y_train = (
    pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc.csv', index_col=0)
    .reset_index(drop=True)
)

  mask |= (ar1 == a)


In [24]:
# Encode the resampled labels as zero-based integers.
# NOTE: y_train is rebound from a DataFrame to an array here, so re-run the
# loading cell before executing this cell a second time.
y_train = transform_label(y=y_train)

In [25]:
# Re-fit the scaler on the resampled training set; x_test is still the
# held-out split created in section 1.1.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [26]:
# nn_model still carries the weights learned in the previous experiment.
# Rebuild it so this run starts from freshly initialised weights; otherwise
# the result mixes in earlier training and is not comparable across datasets.
nn_model = create_nn(lr)
nn_model.fit(x_train_norm, y_train,
             epochs=epochs, batch_size=batch_size,
             validation_data=(x_test_norm, y_test))

Train on 1104100 samples, validate on 152419 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2551369f688>

In [27]:
# Per-class softmax outputs for the held-out test split.
preds = nn_model.predict(x=x_test_norm)

In [28]:
# Persist the test-set predictions of the SMOTENC experiment.
pd.DataFrame(data=preds).to_csv(path_or_buf='../Project_data/results/nn_preds_smotenc.csv')

### 1.2.2 Under-sampling: cleaning oversampled dataset

#### 1.2.2.1 Tomek

In [29]:
# SMOTENC + Tomek training set. The stored index column is read and then
# dropped in favour of a fresh 0..n-1 index.
x_train = (
    pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_tmk.csv', index_col=0)
    .reset_index(drop=True)
)
y_train = (
    pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_tmk.csv', index_col=0)
    .reset_index(drop=True)
)

In [30]:
# Encode the resampled labels as zero-based integers.
# NOTE: y_train is rebound from a DataFrame to an array here, so re-run the
# loading cell before executing this cell a second time.
y_train = transform_label(y=y_train)

In [31]:
# Re-fit the scaler on the Tomek-cleaned training set; x_test is still the
# held-out split created in section 1.1.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [32]:
# nn_model still carries the weights learned in the previous experiment.
# Rebuild it so this run starts from freshly initialised weights; otherwise
# the result mixes in earlier training and is not comparable across datasets.
nn_model = create_nn(lr)
nn_model.fit(x_train_norm, y_train,
             epochs=epochs, batch_size=batch_size,
             validation_data=(x_test_norm, y_test))

Train on 897616 samples, validate on 152419 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2551370b5c8>

In [33]:
# Per-class softmax outputs for the held-out test split.
preds = nn_model.predict(x=x_test_norm)

In [35]:
# Persist the test-set predictions of the SMOTENC + Tomek experiment.
pd.DataFrame(data=preds).to_csv(path_or_buf='../Project_data/results/nn_preds_smotenc_tmk.csv')

#### 1.2.2.2 ENN

In [36]:
# SMOTENC + ENN training set. The stored index column is read and then
# dropped in favour of a fresh 0..n-1 index.
x_train = (
    pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_enn.csv', index_col=0)
    .reset_index(drop=True)
)
y_train = (
    pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_enn.csv', index_col=0)
    .reset_index(drop=True)
)

In [37]:
# Encode the resampled labels as zero-based integers.
# NOTE: y_train is rebound from a DataFrame to an array here, so re-run the
# loading cell before executing this cell a second time.
y_train = transform_label(y=y_train)

In [38]:
# Re-fit the scaler on the ENN-cleaned training set; x_test is still the
# held-out split created in section 1.1.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [39]:
# nn_model still carries the weights learned in the previous experiment.
# Rebuild it so this run starts from freshly initialised weights; otherwise
# the result mixes in earlier training and is not comparable across datasets.
nn_model = create_nn(lr)
nn_model.fit(x_train_norm, y_train,
             epochs=epochs, batch_size=batch_size,
             validation_data=(x_test_norm, y_test))

Train on 526650 samples, validate on 152419 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x25513f9c0c8>

In [40]:
# Per-class softmax outputs for the held-out test split.
preds = nn_model.predict(x=x_test_norm)

In [41]:
# Persist the test-set predictions of the SMOTENC + ENN experiment.
pd.DataFrame(data=preds).to_csv(path_or_buf='../Project_data/results/nn_preds_smotenc_enn.csv')

## 2. 3-class classification:

In [21]:
# Fresh network whose softmax output has 3 units for the 3-class task.
nn_model = create_nn(lr=lr, end_neurons=3)

In [5]:
# Same preprocessed features as before, paired with the 3-label target file;
# both are indexed by building_id.
x = pd.read_csv(
    '../Project_data/processed_data/x_post_preproc.csv',
    index_col='building_id',
)
y = pd.read_csv(
    '../Project_data/processed_data/y_post_preproc_3lab.csv',
    index_col='building_id',
)

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)

# Look up each subset's labels by index and encode them with the 3-label encoder.
y_train = transform_3label(y.loc[x_train.index])
y_test = transform_3label(y.loc[x_test.index])

In [16]:
# Standardise the features; the scaler is fitted on the training split only.
x_train_norm, x_test_norm = scale(x_train=x_train, x_test=x_test)

In [24]:
# Train the 3-class network, validating on the held-out split after every epoch.
nn_model.fit(
    x=x_train_norm,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test_norm, y_test),
)

Train on 609675 samples, validate on 152419 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x27102671708>

In [25]:
# Per-class softmax outputs for the held-out test split.
preds = nn_model.predict(x=x_test_norm)

In [28]:
# Write next to the other prediction files: every other experiment in this
# notebook saves under ../Project_data/results/, whereas the bare 'results/'
# path pointed at a different directory relative to the notebook.
pd.DataFrame(preds).to_csv('../Project_data/results/nn_preds_3cls.csv')