In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE, ADASYN

%matplotlib inline

In [2]:
train_values = pd.read_csv('./dataset_modified/train_values.csv')
train_labels = pd.read_csv('./dataset_modified/train_labels.csv')
test_values = pd.read_csv('./dataset_modified/test_values.csv')

In [39]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(train_values, train_labels['damage_grade'], test_size=0.3, random_state=10) # 70% training and 30% test
X_train = X_train.drop(columns = 'building_id')
X_test  = X_test.drop (columns = 'building_id')

In [10]:
def model(X_train, y_train, X_test):
    # XGBoost Model

    # xgboost classifier accepts classes only in the range [0, num_class)
    y_train_xgb = y_train - 1
    y_test_xgb = y_test - 1

    # transforming data into DMatrix for it to be usable by XGBoost
    D_train = xgb.DMatrix(X_train, label = y_train_xgb)
    D_test = xgb.DMatrix(X_test, label = y_test_xgb)

    param = {
        'eta': 0.2,
        'subsample': 0.7, 
        'min_child_weight': 5, 
        'max_depth': 12, 
        'gamma': 0.4, 
        'colsample_bytree': 0.7,
        'objective': 'multi:softmax',
        'num_class': 3
    }

    steps = 100

    # training the model
    clf = xgb.train(param, D_train, steps)

    # predicting damage_grade for the test values
    y_pred = clf.predict(D_test)
    y_pred += 1
    
    return y_pred

#     # XGBoost Model Accuracy
#     print("Accuracy of XGBoost: ", accuracy_score(y_test,y_pred))
#     print(classification_report(y_test, y_pred))

In [11]:
def evaluate(y_test, y_pred):
    # XGBoost Model Accuracy
    print("Accuracy of XGBoost: ", accuracy_score(y_test,y_pred))
    print(classification_report(y_test, y_pred))

In [29]:
y_pred = model(X_train, y_train, X_test)

In [30]:
evaluate(y_pred, y_test)

Accuracy of XGBoost:  0.743901971067
              precision    recall  f1-score   support

         1.0       0.51      0.68      0.59      5605
         2.0       0.85      0.74      0.79     50799
         3.0       0.63      0.76      0.69     21777

    accuracy                           0.74     78181
   macro avg       0.66      0.73      0.69     78181
weighted avg       0.76      0.74      0.75     78181



## Log transformation

In [22]:
X_train['age'] = np.log(X_train['age'])
X_test['age'] = np.log(X_test['age'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [23]:
y_pred = model(X_train, y_train, X_test)

In [24]:
evaluate(y_pred, y_test)

Accuracy of XGBoost:  0.727683196685
              precision    recall  f1-score   support

         1.0       0.43      0.66      0.52      4932
         2.0       0.85      0.72      0.78     52155
         3.0       0.60      0.75      0.67     21094

    accuracy                           0.73     78181
   macro avg       0.63      0.71      0.66     78181
weighted avg       0.76      0.73      0.74     78181



## Without dropping building_id

In [25]:
X_train, X_test, y_train, y_test = train_test_split(train_values, train_labels['damage_grade'], test_size=0.3, random_state=10) # 70% training and 30% test

In [26]:
y_pred = model(X_train, y_train, X_test)

In [27]:
evaluate(y_pred, y_test)

Accuracy of XGBoost:  0.742136836316
              precision    recall  f1-score   support

         1.0       0.50      0.69      0.58      5474
         2.0       0.85      0.74      0.79     51227
         3.0       0.62      0.76      0.69     21480

    accuracy                           0.74     78181
   macro avg       0.66      0.73      0.69     78181
weighted avg       0.77      0.74      0.75     78181



## Dropping socio-economic related columns (F1 score = 0.7455)

In [51]:
X_train = train_values
X_test = test_values
y_train = train_labels['damage_grade']

In [52]:
X_train = X_train.drop(['building_id','has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other'], axis=1)
X_test = X_test.drop(['building_id','has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other'], axis=1)

In [46]:
y_pred = model(X_train, y_train, X_test)

In [47]:
result = pd.read_csv('./dataset/submission_format.csv')

In [48]:
result['damage_grade'] = y_pred

In [49]:
result['damage_grade'] = result['damage_grade'].astype(int)

In [50]:
result.to_csv('file4.csv', index=False)

#Resulting Macro F1 Score = 0.7455

In [59]:
X_train = train_values
X_test = test_values
y_train = train_labels['damage_grade']

In [60]:
X_train = X_train.drop(['building_id','has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other', 'count_families', 'has_secondary_use'], axis=1)
X_test = X_test.drop(['building_id','has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other', 'count_families', 'has_secondary_use'], axis=1)

In [61]:
y_pred = model(X_train, y_train, X_test)

In [62]:
result = pd.read_csv('./dataset/submission_format.csv')
result['damage_grade'] = y_pred
result['damage_grade'] = result['damage_grade'].astype(int)
result.to_csv('file5.csv', index=False)

In [63]:
## Resulting F1 score = 0.7444

## Normalizing the numeric columns