In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [3]:
extended_train_features = pd.read_csv('extended_train_features.csv')
extended_train_labels = pd.read_csv('extended_train_labels.csv')

In [4]:
# Remove building id
exclude = ['building_id']
features = extended_train_features.loc[:,[i for i in list(extended_train_features.columns) if i not in exclude]]
features = np.asarray(features)#[index]

labels = extended_train_labels['damage_grade']
labels = np.asarray(labels)#[index]

In [5]:
print(extended_train_features.shape)
print(features.shape)

(258477, 69)
(258477, 68)


### Principal Component Analysis

In [6]:
from sklearn.decomposition import PCA

In [7]:
# Apply PCA
pca = PCA(n_components=40)
principalComponents = pca.fit_transform(features)
print (np.sum(pca.explained_variance_ratio_)) 
print (principalComponents.shape) 
print (labels.shape) 

0.992446150473015
(258477, 40)
(258477,)


### Split data

In [8]:
train_features, test_features, train_labels, test_labels = train_test_split(principalComponents, labels, test_size = 0.3, random_state = 5, shuffle=True)
print('Shape of data')
print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)
print(test_labels.shape)
print("")

Shape of data
(180933, 40)
(180933,)
(77544, 40)
(77544,)



### Random Forest

In [138]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [139]:
#Training the model

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [140]:
# Predict with trained model
predictions = random_forest.predict(test_features)

In [142]:
f1 = f1_score(test_labels, predictions, average = 'micro')
print(f1)

# With 68 features: 0.7112735995047973
# With 68 principal components: 0.6772928917775715
# With 40 principal components: 0.6712447126792531

0.6712447126792531


## Clustering

In [11]:
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering, SpectralClustering, KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, v_measure_score

#### Silhouette

In [None]:
silhouette = []
component_range = list(range(2, 10))

for k in component_range:
    kmeans = KMeans(n_clusters = k).fit(train_features)
    score = silhouette_score(train_features, kmeans.predict(train_features), metric = 'euclidean')
    silhouette.append(score)
    
plt.figure(figsize=(15, 6))
plt.plot(list_k_sil, silhouette, '-o')
plt.title("Silhouette method")
plt.xlabel(r'Number of clusters K')
plt.ylabel('Average silhouette width')