In [None]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)


In [None]:
# Import libraries here
import sqlite3
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#from category_encoders import OneHotEncoder,OrdinalEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# Load the `sql` IPython extension and point it at the local SQLite file so the
# `%%sql` cells below can query the database directly.
%load_ext sql
%sql sqlite:////home/jovyan/nepal.sqlite

In [None]:
%%sql

select distinct(district_id) from id_map

In [None]:
%%sql
select count(*) from id_map where district_id = 1


In [None]:
%%sql
select distinct(i.building_id) as b_id,b.*,bd.damage_grade 
from id_map as i
join building_structure as b
on b.building_id = i.building_id
join building_damage as bd
on bd.building_id = b.building_id
where district_id = 3
LIMIT 5

In [None]:
# Build your `wrangle` function here
def wrangle(db_path, district_id=3):
    """Load one district's buildings from SQLite and prepare them for modeling.

    Joins ``id_map``, ``building_structure`` and ``building_damage`` on
    ``building_id``, keeps the rows whose ``id_map.district_id`` equals
    ``district_id``, and derives a binary ``severe_damage`` target.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.
    district_id : int, default 3
        District to select. The default reproduces the previously hard-coded
        filter, so ``wrangle(db_path)`` returns the same frame as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``b_id`` (the building id). Contains the remaining
        ``building_structure`` columns plus ``severe_damage``, which is 1 when
        the trailing digit of ``damage_grade`` is greater than 3 and 0 otherwise.
    """
    query = """
            select distinct(i.building_id) as b_id, b.*, bd.damage_grade
            from id_map as i
            join building_structure as b
            on b.building_id = i.building_id
            join building_damage as bd
            on bd.building_id = b.building_id
            where i.district_id = ?
    """

    # `with sqlite3.connect(...)` only manages transactions, it does not close
    # the connection, so close it explicitly even if the read fails.
    conn = sqlite3.connect(db_path)
    try:
        # The district is bound as a query parameter rather than formatted
        # into the SQL text.
        df = pd.read_sql(query, conn, index_col="b_id", params=(district_id,))
    finally:
        conn.close()

    # The last character of `damage_grade` is parsed as the numeric grade;
    # grades above 3 count as severe damage.
    grade = df["damage_grade"].str[-1].astype(int)
    df["severe_damage"] = (grade > 3).astype(int)

    # Columns measured after the earthquake would leak the outcome.
    drop_cols = [col for col in df.columns if "_post_eq" in col]
    # `damage_grade` is the source of the target, and `building_id` duplicates
    # the index. `count_floors_pre_eq` was dropped by the original code too,
    # presumably for multicollinearity with another column -- TODO confirm.
    drop_cols += ["count_floors_pre_eq", "damage_grade", "building_id"]
    return df.drop(columns=drop_cols)

In [None]:
# Build the modeling frame from the local SQLite database and preview its first rows.
df = wrangle("/home/jovyan/nepal.sqlite")
df.head()

In [None]:
# Number of distinct values in each column, sorted ascending.
df.nunique().sort_values()

In [None]:
# Bar chart of the relative frequency of each `"severe_damage"` class
class_balance = df["severe_damage"].value_counts(normalize=True)
ax = class_balance.plot(kind="bar")
ax.set_xlabel("Severe Damage")
ax.set_ylabel("Relative Frequency")
ax.set_title("Kavrepalanchok, Class Balance")

In [None]:
# Distribution of plinth area for each damage class
ax = sns.boxplot(data=df, x="severe_damage", y="plinth_area_sq_ft")
ax.set_xlabel("Severe Damage")
ax.set_ylabel("Plinth Area [sq. ft.]")
ax.set_title("Kavrepalanchok, Plinth Area vs Building Damage")

In [None]:
# Mean of `severe_damage` (i.e. the share of severely damaged buildings) for
# each roof type, sorted ascending. The aggregation is named by string because
# passing the NumPy callable `np.mean` is deprecated in recent pandas and
# emits a FutureWarning.
roof_pivot = pd.pivot_table(
    df, index="roof_type", values="severe_damage", aggfunc="mean"
).sort_values(by="severe_damage")
roof_pivot

In [None]:
# Separate the target vector from the feature matrix
target = "severe_damage"
y = df[target]
X = df.drop(columns=[target])
print(
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
    sep="\n",
)

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_val shape: {X_val.shape}",
    f"y_val shape: {y_val.shape}",
    sep="\n",
)

In [None]:
# Baseline = accuracy of always predicting the majority class. It is computed
# from the training labels only, so the validation rows do not influence the
# number the models are compared against.
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline, 2))

In [None]:
# scikit-learn's OneHotEncoder treats every column it receives as categorical
# and, by default, raises on categories it did not see during fit. Restrict it
# to the non-numeric columns, pass numeric columns through unchanged, and
# ignore unseen categories so scoring on the validation set cannot fail.
encoder_lr = make_column_transformer(
    (
        OneHotEncoder(handle_unknown="ignore"),
        make_column_selector(dtype_exclude="number"),
    ),
    remainder="passthrough",
)
# max_iter is raised from the default of 100 because the numeric features are
# unscaled, which typically prevents the solver from converging in time.
model_lr = make_pipeline(
    encoder_lr, LogisticRegression(max_iter=1000, random_state=42)
)
model_lr.fit(X_train, y_train)


In [None]:
# Accuracy of the fitted logistic-regression pipeline on each split
lr_train_acc = accuracy_score(y_train, model_lr.predict(X_train))
lr_val_acc = accuracy_score(y_val, model_lr.predict(X_val))

print(
    f"Logistic Regression, Training Accuracy Score: {lr_train_acc}",
    f"Logistic Regression, Validation Accuracy Score: {lr_val_acc}",
    sep="\n",
)

In [None]:
# Fit one decision tree per candidate depth and record accuracy on both splits.
depth_hyperparams = range(1, 16)
training_acc = []
validation_acc = []
for d in depth_hyperparams:
    # Ordinal-encode only the non-numeric columns and pass numeric columns
    # through unchanged. Categories unseen during fit are mapped to -1 rather
    # than raising when the validation set is scored.
    encoder_dt = make_column_transformer(
        (
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            make_column_selector(dtype_exclude="number"),
        ),
        remainder="passthrough",
    )
    model_dt = make_pipeline(
        encoder_dt, DecisionTreeClassifier(max_depth=d, random_state=42)
    )
    model_dt.fit(X_train, y_train)
    training_acc.append(model_dt.score(X_train, y_train))
    validation_acc.append(model_dt.score(X_val, y_val))
print("Training Accuracy Scores:", training_acc)
print("Validation Accuracy Scores:", validation_acc)

In [None]:
# Plot accuracy against the actual depth values. Plotting the bare lists via
# pd.Series would use a 0-based positional index, shifting every point one
# step left of its true max_depth.
plt.plot(depth_hyperparams, training_acc, color="red", label="training")
plt.plot(depth_hyperparams, validation_acc, color="blue", label="validation")
plt.xlabel("Max Depth")
plt.ylabel("Accuracy Score")
plt.title("Validation Curve, Decision Tree Model")
plt.legend()


In [None]:
final_model_dt =make_pipeline(OrdinalEncoder(),DecisionTreeClassifier(max_depth=10,random_state=42))
final_model_dt.fit(X_train, y_train)
final_training_acc = final_model_dt.score(X_train, y_train)
final_validation_acc = final_model_dt.score(X_val,y_val)
print("Training Accuracy Score:", final_training_acc)
print("Validation Accuracy Score:", final_validation_acc)

In [None]:
X_test = pd.read_csv("data/kavrepalanchok-test-features.csv", index_col="b_id")
y_test_pred = final_model_dt.predict(X_test)
y_test_pred[:5]

In [None]:
features = X_train.columns
importance=final_model_dt.named_steps["decisiontreeclassifier"].feature_importances_
feat_imp = pd.Series(importance,index=features).sort_values()
feat_imp.head()

In [None]:
# Horizontal bar chart of the tree's feature importances
ax = feat_imp.plot(kind="barh")
ax.set_xlabel("Gini Importance")
ax.set_ylabel("Label")
ax.set_title("Kavrepalanchok Decision Tree, Feature Importance")