In [None]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

from sys import getsizeof

In [None]:
# Read the feature table, the label table and the header-less geo feature file.
train_val = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")
geo_raw_data = pd.read_csv("Geo_Data_Train.csv", header=None)

# Labels for the geo columns: geo_1, geo_2, ..., geo_<num_columns>.
base_string = "geo_"
num_columns = 16  # how many geo_* labels to generate
column_list = [f"{base_string}{idx}" for idx in range(1, num_columns + 1)]

# One shape per line, in the order: features, labels, geo data.
print(train_val.shape, train_labels.shape, geo_raw_data.shape, sep="\n")

In [None]:
# Attach the generated geo_* labels to the geo frame (it was read with
# header=None, so until now its columns carry positional labels only).
geo_raw_data.columns = column_list
# Print the column / dtype / memory summary of the geo frame.
geo_raw_data.info()

In [None]:
# Every column with at most two distinct values is stored as a pandas categorical.
binary_cols = train_val.columns[train_val.nunique() <= 2]
train_val[binary_cols] = train_val[binary_cols].astype("category")

# Remove the three geo-level id columns and the building identifier from the features.
train_val = train_val.drop(
    columns=["geo_level_1_id", "geo_level_2_id", "geo_level_3_id", "building_id"]
)

In [None]:
# Names of the columns treated as categorical, one per line.
cat_cols = [
    # structural descriptors
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    # superstructure flags
    "has_superstructure_adobe_mud",
    "has_superstructure_mud_mortar_stone",
    "has_superstructure_stone_flag",
    "has_superstructure_cement_mortar_stone",
    "has_superstructure_mud_mortar_brick",
    "has_superstructure_cement_mortar_brick",
    "has_superstructure_timber",
    "has_superstructure_bamboo",
    "has_superstructure_rc_non_engineered",
    "has_superstructure_rc_engineered",
    "has_superstructure_other",
    # ownership
    "legal_ownership_status",
    # secondary-use flags
    "has_secondary_use",
    "has_secondary_use_agriculture",
    "has_secondary_use_hotel",
    "has_secondary_use_rental",
    "has_secondary_use_institution",
    "has_secondary_use_school",
    "has_secondary_use_industry",
    "has_secondary_use_health_post",
    "has_secondary_use_gov_office",
    "has_secondary_use_use_police",
    "has_secondary_use_other",
]

In [None]:
# One-hot encode train_val with pandas' default column selection
# (object / string / category columns are expanded, the rest pass through).
train_final = pd.get_dummies(train_val)
# Print the column / dtype / memory summary of the encoded frame.
train_final.info()

In [None]:
# Join the one-hot encoded building features with the geo features column-wise.
# concat(axis=1) aligns rows on the index, so a row-count mismatch would be
# padded with NaN instead of failing; check it explicitly first.
if len(train_final) != len(geo_raw_data):
    raise ValueError(
        f"Row count mismatch: train_final has {len(train_final)} rows, "
        f"geo_raw_data has {len(geo_raw_data)} rows"
    )

# ignore_index is intentionally not passed: combined with axis=1 it relabels
# every column 0..n-1, which discards the feature names needed to interpret
# the fitted trees.
train_merged = pd.concat([train_final, geo_raw_data], axis=1)
train_merged.shape

In [None]:
# Drop the names of the two source frames, then print the column / dtype /
# memory summary of the merged frame.
del train_final, geo_raw_data
train_merged.info()

In [None]:
# Boolean threshold targets derived from damage_grade:
#   target_1 -> True where damage_grade >= 2
#   target_2 -> True where damage_grade >= 3
train_labels["target_1"] = train_labels["damage_grade"].ge(2)
train_labels["target_2"] = train_labels["damage_grade"].ge(3)

In [None]:
# Split features and labels together so the rows stay paired; the seed fixes
# the shuffle. The split proportions are scikit-learn's defaults.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_merged,
    train_labels,
    random_state=31,
)

# One shape per line: X_train, X_test, Y_train, Y_test.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep="\n")

# Classification

In [None]:
# Baseline classifier: a single depth-limited tree fitted on the damage_grade column.
dec_tree_class_single = DecisionTreeClassifier(max_depth=20, random_state=24).fit(
    X_train, Y_train["damage_grade"]
)

# Predictions for the training rows and for the held-out rows.
pred_tree_class_single = dec_tree_class_single.predict(X_train)
pred_tree_class_test = dec_tree_class_single.predict(X_test)

In [None]:
y_true_train = Y_train["damage_grade"]
y_true_test = Y_test["damage_grade"]

# Confusion matrix for the training rows only.
print(confusion_matrix(y_true_train, pred_tree_class_single))

# Accuracy: train, then test.
print(accuracy_score(y_true_train, pred_tree_class_single))
print(accuracy_score(y_true_test, pred_tree_class_test))

# Micro-averaged F1: train, then test.
print(f1_score(y_true_train, pred_tree_class_single, average="micro"))
print(f1_score(y_true_test, pred_tree_class_test, average="micro"))

In [None]:
# Grid: 5 depths x 5 minimum split sizes, with the tree seed pinned to 24.
params = [
    {
        "max_depth": [10, 20, 30, 40, 50],
        "min_samples_split": [2, 10, 20, 30, 50],
        "random_state": [24],
    }
]

# 5-fold cross-validated search on two worker processes, with progress output.
cv = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    cv=5,
    n_jobs=2,
    verbose=2,
)
cv.fit(X_train, Y_train["damage_grade"])

In [None]:
print(cv.best_params_)

# Table with one row per parameter combination: the parameter values followed
# by the mean cross-validated score in a column labelled "Accuracy".
param_frame = pd.DataFrame(cv.cv_results_["params"])
score_frame = pd.DataFrame(cv.cv_results_["mean_test_score"], columns=["Accuracy"])
cv_res = pd.concat([param_frame, score_frame], axis=1)

# Written with the default row index as the first CSV column.
cv_res.to_csv("DecTree_Class.csv")

In [None]:
# Display the complete cross-validation results of the classifier grid search.
cv.cv_results_

In [None]:
# Classifier with max_depth=30 and min_samples_split=50, fitted on the training rows.
dec_tree_class_best = DecisionTreeClassifier(
    max_depth=30,
    min_samples_split=50,
    random_state=24,
).fit(X_train, Y_train["damage_grade"])

pred_tree_class_best = dec_tree_class_best.predict(X_train)
pred_tree_class_best_test = dec_tree_class_best.predict(X_test)

# Confusion matrix, accuracy and per-class report, each printed for the
# training rows first and the held-out rows second.
# Micro-averaged F1 is not printed here.
for report in (confusion_matrix, accuracy_score, classification_report):
    print(report(Y_train["damage_grade"], pred_tree_class_best))
    print(report(Y_test["damage_grade"], pred_tree_class_best_test))

# Regression

In [None]:
# Baseline regressor: damage_grade is treated as a numeric target.
dec_tree_reg_single = DecisionTreeRegressor(max_depth=10, random_state=24).fit(
    X_train, Y_train["damage_grade"]
)
pred_tree_reg_single = dec_tree_reg_single.predict(X_train)
pred_tree_reg_single_test = dec_tree_reg_single.predict(X_test)

# Round the continuous outputs to whole numbers so they can be compared with
# the grade labels (NumPy rounds exact halves to the even neighbour).
pred_tree_reg_single_round = pred_tree_reg_single.round()
pred_tree_reg_single_round_test = pred_tree_reg_single_test.round()

In [None]:
y_true_train = Y_train["damage_grade"]
y_true_test = Y_test["damage_grade"]

# Confusion matrix for the training rows only (rounded regression outputs).
print(confusion_matrix(y_true_train, pred_tree_reg_single_round))

# Accuracy: train, then test.
print(accuracy_score(y_true_train, pred_tree_reg_single_round))
print(accuracy_score(y_true_test, pred_tree_reg_single_round_test))

# Micro-averaged F1: train, then test.
print(f1_score(y_true_train, pred_tree_reg_single_round, average="micro"))
print(f1_score(y_true_test, pred_tree_reg_single_round_test, average="micro"))

In [None]:
# Alternative, depth-only grid:
#params = [{"max_depth": [10,20,30,40,50]}]
params = [
    {
        "max_depth": [10, 20, 30, 40, 50],
        "min_samples_split": [2, 10, 20, 30, 50],
    }
]

# 5-fold cross-validated search on two worker processes; the tree seed is set
# on the estimator itself rather than through the grid.
cv_reg = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=24),
    param_grid=params,
    cv=5,
    n_jobs=2,
)
cv_reg.fit(X_train, Y_train["damage_grade"])

In [None]:
print(cv_reg.best_params_)

# Table with one row per parameter combination and its mean cross-validated score.
# NOTE(review): the score column is labelled "Accuracy", but cv_reg was built
# without a `scoring` argument, so the values should be the regressor's default
# score (R^2) rather than a classification accuracy - confirm before reporting.
param_frame = pd.DataFrame(cv_reg.cv_results_["params"])
score_frame = pd.DataFrame(cv_reg.cv_results_["mean_test_score"], columns=["Accuracy"])
cv_res2 = pd.concat([param_frame, score_frame], axis=1)

# Written with the default row index as the first CSV column.
cv_res2.to_csv("DecTree_Reg.csv")

In [None]:
# Display the complete cross-validation results of the regressor grid search.
cv_reg.cv_results_

In [None]:
# Final regression tree, configured from the grid search result (cv_reg).
# The search grid covers max_depth and min_samples_split only, so exactly those
# two tuned values are applied here; min_samples_leaf was never part of the
# search and is therefore left at its default instead of being set by hand.
# random_state is not in the regressor grid, so passing it explicitly cannot
# clash with the unpacked best_params_.
dec_tree_reg_best = DecisionTreeRegressor(random_state=24, **cv_reg.best_params_)
dec_tree_reg_best.fit(X_train, Y_train["damage_grade"])
pred_tree_reg_best = dec_tree_reg_best.predict(X_train)
pred_tree_reg_best_test = dec_tree_reg_best.predict(X_test)

# Round the continuous outputs to whole numbers so they can be compared with
# the grade labels.
pred_tree_reg_best_round = np.round(pred_tree_reg_best)
pred_tree_reg_best_round_test = np.round(pred_tree_reg_best_test)

# Confusion matrix, accuracy and per-class report: train first, then test.
print(confusion_matrix(Y_train["damage_grade"], pred_tree_reg_best_round))
print(confusion_matrix(Y_test["damage_grade"], pred_tree_reg_best_round_test))
print(accuracy_score(Y_train["damage_grade"], pred_tree_reg_best_round))
print(accuracy_score(Y_test["damage_grade"], pred_tree_reg_best_round_test))
# print(f1_score(Y_train["damage_grade"],pred_tree_reg_best_round,average="micro"))
# print(f1_score(Y_test["damage_grade"],pred_tree_reg_best_round_test,average="micro"))
print(classification_report(Y_train["damage_grade"], pred_tree_reg_best_round))
print(classification_report(Y_test["damage_grade"], pred_tree_reg_best_round_test))

# Ordinal Classification

In [None]:
# First binary model of the ordinal scheme, fitted on target_1.
dec_tree_ord_1 = DecisionTreeClassifier(
    max_depth=30,
    min_samples_split=50,
    random_state=24,
).fit(X_train, Y_train["target_1"])

# Hard predictions for train / test.
pred_tree_ord_1 = dec_tree_ord_1.predict(X_train)
pred_tree_ord_test_1 = dec_tree_ord_1.predict(X_test)

# Class-probability estimates for train / test (one column per class).
prob_23 = dec_tree_ord_1.predict_proba(X_train)
prob_23_test = dec_tree_ord_1.predict_proba(X_test)

In [None]:
# Second binary model of the ordinal scheme, fitted on target_2.
dec_tree_ord_2 = DecisionTreeClassifier(
    max_depth=30,
    min_samples_split=50,
    random_state=24,
).fit(X_train, Y_train["target_2"])

# Hard predictions for train / test.
pred_tree_ord_2 = dec_tree_ord_2.predict(X_train)
pred_tree_ord_test_2 = dec_tree_ord_2.predict(X_test)

# Class-probability estimates for train / test (one column per class).
prob_3 = dec_tree_ord_2.predict_proba(X_train)
prob_3_test = dec_tree_ord_2.predict_proba(X_test)

In [None]:
# Turn the two binary models' outputs into one score per grade (training rows).
# Assumes column 0 / column 1 of each predict_proba result are the False / True
# classes - confirm against the fitted models' classes_.
grade_3 = prob_3[:, 1]                 # second column of the target_2 model
grade_1 = prob_23[:, 0]                # first column of the target_1 model
grade_2 = prob_23[:, 1] - grade_3      # difference of two separately fitted models
sum_prob = grade_1 + grade_2 + grade_3

In [None]:
# Stack the three score vectors as columns (rows = samples) and pick, per row,
# the column with the highest score; +1 maps column positions 0..2 to 1..3.
grade_probs = np.array([grade_1, grade_2, grade_3]).T
pred_ord = grade_probs.argmax(axis=1) + 1

In [None]:
# Per-grade scores for the held-out rows, built from the two binary models.
# Assumes column 0 / column 1 of each predict_proba result are the False / True
# classes - confirm against the fitted models' classes_.
grade_3_test = prob_3_test[:, 1]
grade_1_test = prob_23_test[:, 0]
grade_2_test = prob_23_test[:, 1] - grade_3_test
sum_prob = grade_1_test + grade_2_test + grade_3_test

# Columns = grades; the highest-scoring column per row gives the prediction,
# with +1 mapping column positions 0..2 to 1..3.
grade_probs_test = np.array([grade_1_test, grade_2_test, grade_3_test]).T
pred_ord_test = grade_probs_test.argmax(axis=1) + 1

In [None]:
# Score the ordinal predictions against damage_grade: confusion matrix,
# accuracy and per-class report, each printed for train first and test second.
# Micro-averaged F1 is not printed here.
for report in (confusion_matrix, accuracy_score, classification_report):
    print(report(Y_train["damage_grade"], pred_ord))
    print(report(Y_test["damage_grade"], pred_ord_test))