Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.linear_model import LogisticRegressionCV
import xgboost as xgb 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from math import log

%matplotlib inline

ImportError: No module named 'matplotlib'

Importing the dataset

In [None]:
# Read the four dataset CSVs in one pass; the names on the left are bound in
# the same order as the paths are listed.
train_values, train_labels, test_values, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/train_values.csv',
        './dataset/train_labels.csv',
        './dataset/test_values.csv',
        './dataset/submission_format.csv',
    )
)

Understanding the dataset

In [None]:
# (rows, columns) of the training feature table.
train_values.shape

In [None]:
# (rows, columns) of the test feature table, for comparison with the training table.
test_values.shape

In [None]:
# Preview the first rows transposed, so each feature is shown on its own line.
train_values.head().T

In [None]:
# Per-column dtypes of the training features.
train_values.dtypes

Dataset Preprocessing

Check for missing (NaN) values

In [None]:
# A notebook cell only displays its LAST expression, so evaluating the two
# checks on separate lines silently discarded the result for train_values.
# Collect both into one labelled mapping so each result is visible.
{
    'train_values': bool(train_values.isnull().values.any()),
    'train_labels': bool(train_labels.isnull().values.any()),
}

#The dataset contains no missing values.

Check for Outliers

In [None]:
# Box plot of the `age` column of the training features.
ax = sns.boxplot(x=train_values["age"])

Although the boxplot of building age shows outliers, other sources indicate that some buildings in Nepal really do date back thousands of years, so these extreme ages are plausible values rather than data errors.

In [None]:
# Side-by-side box plots of `area_percentage` for the train and test tables.
data = pd.DataFrame({
    label: frame["area_percentage"]
    for label, frame in (('Train', train_values), ('Test', test_values))
})
ax = sns.boxplot(data=data)

In [None]:
# Side-by-side box plots of `height_percentage` for the train and test tables.
data = pd.DataFrame({
    label: frame["height_percentage"]
    for label, frame in (('Train', train_values), ('Test', test_values))
})
ax = sns.boxplot(data=data)

We cannot simply remove the outliers from the training data, because the test data contains similar outliers and the model must be able to handle them.

Converting categorical variables into Numeric form

In [None]:
#Columns with categorical data
# Use the public select_dtypes() API instead of the private _get_numeric_data();
# numeric and boolean columns are treated as "numeric", everything else as categorical.
num_cols = train_values.select_dtypes(include=['number', 'bool']).columns
# Keep the frame's own column order: the previous list(set(...) - set(...)) produced
# a different ordering on every kernel start (string hashing is randomized).
categories = [col for col in train_values.columns if col not in num_cols]
categories

In [None]:
# Print each categorical column's name followed by the set of distinct values it takes.
for i in categories:
    print(i, set(train_values[i]))

In [None]:
#One hot Encodings
train_values_new = pd.get_dummies(train_values)
# Encoding train and test independently yields different column sets whenever a
# category level appears in only one of them. Re-index the test dummies onto the
# training columns: levels missing from test become all-zero columns, and levels
# unseen in training are dropped, so both frames share one feature layout.
test_values_new = pd.get_dummies(test_values).reindex(
    columns=train_values_new.columns, fill_value=0
)

print('Training dataset :',train_values_new.shape)
print()
print('Test dataset :',test_values_new.shape)

In [None]:
#Factors Method
# Replace every categorical column with integer codes.
#
# NOTE: `df` and `df_test` are aliases, not copies, so this cell overwrites the
# categorical columns of `train_values` / `test_values` in place. Later cells read
# `train_values` after this encoding, so the aliasing is kept deliberately.

#Preparing Training Data
df = train_values
category_levels = {}  # column name -> category levels learned from the training data
for col_name in categories:
    df[col_name] = df[col_name].astype('category')
    category_levels[col_name] = df[col_name].cat.categories
    df[col_name] = df[col_name].cat.codes
print("Train Data Shape: ", df.shape)

#Preparing Test Data
# Encode the test columns with the TRAINING levels. Deriving codes from the test
# set's own categories would assign different integers to the same label whenever
# the two sets do not contain exactly the same levels. Labels never seen in
# training are encoded as -1.
df_test = test_values
for col_name in categories:
    df_test[col_name] = pd.Categorical(
        df_test[col_name], categories=category_levels[col_name]
    ).codes
print("Test Data Shape: ", df_test.shape)

Basic Summary Statistics

In [None]:
# Summary statistics, transposed so that each feature occupies one row.
df.describe().T

Exploratory Data Analysis

In [None]:
# On plotting a bar plot for each damage grade, we observe that the data is imbalanced.
# Countplot of 'damage_grade' column ( Counts of different damage grades )

sns.set(rc={'figure.figsize':(6,7)})
# Pass the column explicitly as `x=`: newer seaborn releases treat the first
# positional argument as `data`, so a positional Series no longer yields one bar per grade.
sns.countplot(x=train_labels['damage_grade']).set_title("Number of Buildings with Each Damage Grade")
plt.show()
damage_grade_count = train_labels['damage_grade'].value_counts()
print(damage_grade_count)
print()
# Share of grade-2 buildings, in percent (label-based lookup on the value_counts index).
moderate_share = damage_grade_count.loc[2] / damage_grade_count.sum() * 100
print('It can be clearly seen that most (about %.2f%%) of the buildings have been moderately (damage_grade = 2) damaged by the earthquake'
      % moderate_share)

In [None]:
# Pairwise correlation between the columns of train_values, drawn as a heatmap.
data_corr = train_values.corr()
plt.figure(figsize=(13, 11))
sns.heatmap(data_corr)
plt.show()

In [None]:
# Pairplot of a few selected columns

print('The below plot shows relationship between a few selected columns')
selected_features = [
    'age',
    'area_percentage',
    'height_percentage',
    'foundation_type',
    'count_families',
    'has_secondary_use',
    'damage_grade',
]

# merge() without `on=` joins on the columns the two frames have in common;
# only the selected columns are kept for plotting.
df_merged = train_values.merge(train_labels)[selected_features]

sns.pairplot(data=df_merged, hue="damage_grade", diag_kind="hist", kind="scatter")
plt.show()

In [None]:
# Histogram of 'age' column ( Distribution of age of the buildings )

sns.set(rc={'figure.figsize':(16,8)})
sns.distplot(df.age, bins = 200, kde = True)
plt.title('Distribution of age of the buildings')
plt.show()

# log transformation applied on age column for values greater than 0
# sns.set(rc={'figure.figsize':(16,8)})
# sns.distplot(df.age[df.age>0].apply(log), bins = 40, kde = True)
# plt.title('Distribution of age of the buildings after log transformation')
# plt.show()

# Count the zero ages with a vectorized comparison instead of materializing the
# whole column as a Python list first.
num_of_zeros = int((df['age'] == 0).sum())
print('Number of zeros(in age column): ',num_of_zeros)
# Report the computed count rather than a hard-coded "26k", so the message stays
# correct if the data changes.
print('%d zeros can imply two things, all those building were built in the past year(pretty unlikely) or that there were some unknown values and they have been filled with 0' % num_of_zeros)
print()
print('From the above histogram we can infer that most of the buildings that were destroyed during the earthquake were new or recently build')

In [None]:
# Countplot of 'has_secondary_use' column ( Number of buildings having secondary use )

sns.set(rc={'figure.figsize':(8,6)})
# Pass the column explicitly as `x=`: newer seaborn releases treat the first
# positional argument as `data`, so a positional Series no longer yields one bar per value.
sns.countplot(x=df['has_secondary_use'])
plt.title('Number of buildings having secondary use')
plt.xlabel('Secondary Use')
plt.xticks(np.arange(2), ('No', 'Yes'))
plt.show()
secondary_use_count = df['has_secondary_use'].value_counts()
print(secondary_use_count)
print()
# Percentage of rows whose has_secondary_use value is 1 (label-based lookup).
secondary_use_share = secondary_use_count.loc[1] / len(df) * 100
print('Only a small percentage (%.2f%%) of the buildings had a secondary use' % secondary_use_share)

In [None]:
# Countplot for 'count_families' column ( frequency of number of families )

sns.set(rc={'figure.figsize':(12,7)})
# Pass the column explicitly as `x=`: newer seaborn releases treat the first
# positional argument as `data`, so a positional Series no longer yields one bar per value.
sns.countplot(x=df['count_families'])
plt.title('Frequency of number of families in a building')
plt.xlabel('Number of families')
plt.ylabel('Frequency')
plt.show()
families_count = df['count_families'].value_counts()
print(families_count)
print()
# Percentage of rows whose count_families value is 1 (label-based lookup).
single_family_share = families_count.loc[1] / len(df) * 100
print('Majority of the buildings (about %.2f%%) had only 1 family living in them' % single_family_share)

In [None]:
# Histogram of 'area_percentage' column   ( Distribution of area percentage of the buildings )

print('area_percentage represents the normalized area of the building footprint')
sns.set(rc={'figure.figsize':(16,8)})
sns.distplot(df.area_percentage)
plt.title('Distribution of area percentage of the buildings')
plt.show()
print('The above distribution of area percentage shows it is positively(right) skewed')
print('So, we can use log transformation to make the distribution more normal')
print()
# Vectorized natural log over the whole column, replacing a per-element Python
# math.log call through .apply(). NOTE(review): assumes every area_percentage
# value is strictly positive - confirm against the data.
sns.distplot(np.log(df.area_percentage))
plt.title('Distribution of area percentage of the buildings after log transformation')
plt.xlabel('log (area_percentage)')
plt.show()


In [None]:
# Histogram of 'height_percentage' column  ( Distribution of height percentage of the buildings )

print('height_percentage represents the normalized height of the building footprint')
sns.set(rc={'figure.figsize': (12, 7)})
# Distribution plot of the height_percentage column of the encoded training frame.
sns.distplot(df['height_percentage'])
plt.title('Distribution of height percentage of the buildings')
plt.show()


In [None]:
# for i in df.columns:
#     #df.hist(column=i, bins=10)
#     sns.distplot(df[i])
#     plt.show()

In [None]:
#sns.pairplot(df);

Basic Models:
1. Logistic Regression
2. Random Forest

In [None]:
# Split dataset into training set and test set (70% training and 30% test).
# train_test_split is already imported in the notebook's first cell.
#
# random_state pins the shuffle so the split - and every score computed from it -
# is reproducible across kernel restarts; stratify keeps the (imbalanced)
# damage_grade proportions equal in both parts.
# NOTE(review): this assumes the rows of `df` and `train_labels` are in the same
# order, and `df` is passed with all of its columns (including any identifier
# column) - confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    train_labels['damage_grade'],
    test_size=0.3,
    random_state=1,
    stratify=train_labels['damage_grade'],
)

In [None]:
def print_confusion_matrix(y_train, y_pred):
    """Draw a confusion-matrix heatmap of true versus predicted labels.

    Parameters
    ----------
    y_train : array-like
        Ground-truth labels. Despite the parameter name, callers in this
        notebook pass the held-out labels.
    y_pred : array-like
        Predicted labels, one per entry of ``y_train``.

    Returns
    -------
    None
        The heatmap is drawn on a new matplotlib figure.
    """
    # Label the axes with every class present in EITHER vector and pass the same
    # list to confusion_matrix, so the matrix shape always matches the labels
    # (labels taken from y_train alone break when a prediction falls outside them).
    labels = np.union1d(y_train, y_pred)
    cm = confusion_matrix(y_train, y_pred, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (10,7))
    sns.set(font_scale=1.4)#for label size
    # fmt="d" prints the integer counts in full instead of scientific notation.
    ax = sns.heatmap(df_cm, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16})# font size
    # Pin the y-limits to the full grid, sized by the number of classes rather than
    # a hard-coded 3 (presumably a workaround for heatmap rows being cut off in
    # some matplotlib releases - TODO confirm it is still needed).
    ax.set_ylim(len(labels), 0)

Logistic Regression

In [None]:
# Multinomial logistic regression (lbfgs solver, fixed seed) fitted on the training split.
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)
# fit() returns the estimator itself, so `clf` ends up bound to the fitted model.
clf = clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split with the fitted logistic-regression model.
predicted_labels = clf.predict(X_test)

# Cell output: the estimator's score (mean accuracy for classifiers) on the held-out split.
clf.score(X_test, y_test)

In [None]:
# Confusion matrix of the logistic-regression predictions against the held-out labels.
print_confusion_matrix(y_test, predicted_labels)

Random Forest

In [None]:
# Create a random-forest classifier with 100 trees. The seed is fixed so that the
# fitted forest, and the accuracy reported for it, is reproducible across runs.
# Note: this rebinds `clf`, which previously held the logistic-regression model.
clf=RandomForestClassifier(n_estimators=100, random_state=1)

# Train the model on the training split
clf.fit(X_train, y_train)

# Predict labels for the held-out split
pred_labels = clf.predict(X_test)

In [None]:
# Model Accuracy
# Accuracy of the stored held-out predictions; this is the same quantity that
# clf.score(X_test, y_test) computes by predicting again.
print("Accuracy of Random Forest: ",accuracy_score(y_test, pred_labels))

In [None]:
# Confusion matrix of the random-forest predictions against the held-out labels.
print_confusion_matrix(y_test, pred_labels)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the training split only, then apply the same
# transformation to both splits. X_train / X_test are rebound to the scaled
# arrays, so cells that run after this one see scaled features.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier fitted on the current X_train / y_train.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# Show the fitted estimator as the cell output.
classifier

In [None]:
# Predict labels for the held-out split with the fitted k-nearest-neighbours classifier.
y_pred = classifier.predict(X_test)