In [None]:
import numpy as np
import pandas as pd

# from sklearn.tree import DecisionTreeClassifier
# from sklearn.tree import DecisionTreeRegressor

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestRegressor

# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

from catboost import CatBoostClassifier
from catboost import Pool

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Feature table and label table (the labels carry the damage_grade column used later).
train_val = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")

# The geo file is read without a header row; the generated names below
# (geo_1 .. geo_<num_columns>) are meant to be attached to it afterwards.
base_string = "geo_"
num_columns = 16  # how many geo_* column names to generate
column_list = [f"{base_string}{idx}" for idx in range(1, num_columns + 1)]
geo_raw_data = pd.read_csv("Geo_Data_Train.csv", header=None)

# Quick sanity check of the three inputs' dimensions.
for frame in (train_val, train_labels, geo_raw_data):
    print(frame.shape)

In [None]:
# Attach the generated geo_* names to the headerless geo frame (in place).
# pandas raises a length-mismatch error if the number of names differs from the
# number of columns read from the file.
geo_raw_data.columns = column_list
# Print dtypes / non-null counts of the renamed frame.
geo_raw_data.info()

In [None]:
# Columns with at most two distinct values are cast to pandas categoricals.
# nunique() scans the whole frame, so it is evaluated once and the resulting
# column selection is reused instead of being recomputed on both sides of an
# assignment.
low_cardinality_cols = train_val.columns[train_val.nunique() <= 2]
train_val = train_val.astype({col: "category" for col in low_cardinality_cols})

# Remove the three geo-level id columns and the building identifier in one pass
# so they are not used as model features.
train_val = train_val.drop(
    columns=["geo_level_1_id", "geo_level_2_id", "geo_level_3_id", "building_id"]
)

In [None]:
# Feature names handed to CatBoost as categorical features.
cat_cols = [
    # building / site descriptors
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    # superstructure flags
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    # ownership
    'legal_ownership_status',
    # secondary-use flags
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

In [None]:
# One-hot encoding is disabled (kept below for reference); the categorical
# columns are passed through as-is.
#train_final = pd.get_dummies(train_val)
# NOTE: this binds a second name to the same DataFrame object -- no copy is made.
train_final = train_val
# Print dtypes / non-null counts of the feature frame.
train_final.info()

In [None]:
# pd.concat(axis=1) aligns rows on the index. If the two frames do not share an
# identical index (e.g. different row counts), the result is silently padded
# with NaN rows instead of failing, so verify the alignment explicitly first.
if not train_val.index.equals(geo_raw_data.index):
    raise ValueError(
        "train_val and geo_raw_data have different indexes "
        f"({len(train_val)} vs {len(geo_raw_data)} rows); cannot merge column-wise"
    )

# Append the geo_* columns to the right of the building features.
train_merged = pd.concat([train_val, geo_raw_data], ignore_index=False, axis=1)
train_merged.shape

In [None]:
# Drop names that are no longer needed now that train_merged exists.
# train_final was bound to the same object as train_val, so removing it only
# removes the name; train_val itself is still alive.
del train_final, geo_raw_data

# Print dtypes / non-null counts of the merged feature frame.
train_merged.info()

In [None]:
# Binary threshold targets for the ordinal decomposition:
#   target_1 is True when damage_grade >= 2
#   target_2 is True when damage_grade >= 3
# The comparison already yields a boolean column, so wrapping it in
# np.where(cond, True, False) is unnecessary.
train_labels["target_1"] = train_labels["damage_grade"] >= 2
train_labels["target_2"] = train_labels["damage_grade"] >= 3

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(train_merged,train_labels, random_state=31, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

In [None]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,Y_train, random_state=31, test_size=0.15)
print(X_train.shape)
print(X_valid.shape)
print(Y_train.shape)
print(Y_valid.shape)

train_data = Pool(data=X_train,
                  label=Y_train["damage_grade"],
                  cat_features=cat_cols
                 )

valid_data = Pool(data=X_valid,
                  label=Y_valid["damage_grade"],
                  cat_features=cat_cols
                 )

# Classification

In [None]:
# Multiclass model on damage_grade, fitted on the training Pool and monitored
# on the validation Pool.
clf = CatBoostClassifier(cat_features=cat_cols)
clf.fit(train_data, eval_set=valid_data)

In [None]:
# Hard class predictions of the multiclass model on the training and test rows.
pred_cat_boost_class = clf.predict(X_train)
pred_cat_boost_class_test = clf.predict(X_test)

# (true labels, predictions) pairs, training split first and test split second.
_splits = (
    (Y_train["damage_grade"], pred_cat_boost_class),
    (Y_test["damage_grade"], pred_cat_boost_class_test),
)

# Print each metric for train then test: confusion matrices, accuracies, and
# finally the per-class reports.
# (f1_score(..., average="micro") could be added to this tuple via a small wrapper.)
for metric in (confusion_matrix, accuracy_score, classification_report):
    for y_true, y_pred in _splits:
        print(metric(y_true, y_pred))

# Ordinal Classification

In [None]:
def _binary_pool(features, labels, target):
    """Build a CatBoost Pool from `features` using column `target` of `labels` as the label."""
    return Pool(data=features, label=labels[target], cat_features=cat_cols)


# Pools for the "damage_grade >= 2" model (label column target_1).
train_data_2 = _binary_pool(X_train, Y_train, "target_1")
valid_data_2 = _binary_pool(X_valid, Y_valid, "target_1")

# Pools for the "damage_grade >= 3" model (label column target_2).
train_data_3 = _binary_pool(X_train, Y_train, "target_2")
valid_data_3 = _binary_pool(X_valid, Y_valid, "target_2")

In [None]:
# Binary model for label column target_1, validated on the matching Pool.
clf_2 = CatBoostClassifier(cat_features=cat_cols)
clf_2.fit(train_data_2, eval_set=valid_data_2)

In [None]:
# Binary model for label column target_2, validated on the matching Pool.
clf_3 = CatBoostClassifier(cat_features=cat_cols)
clf_3.fit(train_data_3, eval_set=valid_data_3)

In [None]:
# Class-probability outputs of the two binary models on the training and test rows.
# prob_23* comes from the target_1 model, prob_3* from the target_2 model.
prob_23, prob_23_test = (clf_2.predict_proba(part) for part in (X_train, X_test))
prob_3, prob_3_test = (clf_3.predict_proba(part) for part in (X_train, X_test))

In [None]:

def ordinal_grade_predictions(prob_ge2, prob_ge3):
    """Combine two binary threshold models into 3-grade probabilities and labels.

    Parameters
    ----------
    prob_ge2 : ndarray of shape (n, 2)
        predict_proba output of the "damage_grade >= 2" model.
    prob_ge3 : ndarray of shape (n, 2)
        predict_proba output of the "damage_grade >= 3" model.

    Column 0 / column 1 are read as P(False) / P(True), exactly as the inline
    code this helper replaces did -- assumes that class order; TODO confirm
    against the fitted models' classes_.

    Returns
    -------
    (grade_probs, pred) where grade_probs has shape (n, 3) with columns for
    grades 1, 2, 3, and pred holds the predicted grade (1, 2 or 3) per row.
    """
    raw = np.column_stack([
        prob_ge2[:, 0],                    # P(grade 1) = P(not >= 2)
        prob_ge2[:, 1] - prob_ge3[:, 1],   # P(grade 2) = P(>= 2) - P(>= 3)
        prob_ge3[:, 1],                    # P(grade 3) = P(>= 3)
    ])
    # The two models are fitted independently, so P(>= 3) can exceed P(>= 2)
    # and make the middle term negative. Clip at zero and renormalise so each
    # row is a valid distribution; a negative entry is never the row maximum
    # and the rescaling is by a positive factor, so the argmax is unaffected.
    clipped = np.clip(raw, 0.0, None)
    grade_probs = clipped / clipped.sum(axis=1, keepdims=True)
    # argmax gives a 0-based column index; grades are 1-based.
    return grade_probs, np.argmax(grade_probs, axis=1) + 1


# Same combination applied to the training and the test predictions.
grade_probs, pred_ord = ordinal_grade_predictions(prob_23, prob_3)
grade_probs_test, pred_ord_test = ordinal_grade_predictions(prob_23_test, prob_3_test)

In [None]:
# (true labels, predictions) pairs for the ordinal approach, train first, test second.
_splits = (
    (Y_train["damage_grade"], pred_ord),
    (Y_test["damage_grade"], pred_ord_test),
)

# Print each metric for train then test: confusion matrices, accuracies, and
# finally the per-class reports.
# (f1_score(..., average="micro") could be added to this tuple via a small wrapper.)
for metric in (confusion_matrix, accuracy_score, classification_report):
    for y_true, y_pred in _splits:
        print(metric(y_true, y_pred))