In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import classification_report, confusion_matrix #for getting CM after testing
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from transforms import add_labels, Preprocessing, BalanceClasses, save_and_drop_ids
from imblearn.over_sampling import SMOTE
from attributes import Attributes

In [2]:
# Load the raw city CSV, drop the first character of every assessor_id,
# then hand the frame to the project's labelling helper.
df = (
    pd.read_csv('../data/city.csv', low_memory=False)
    .assign(assessor_id=lambda frame: frame['assessor_id'].str[1:])
    .pipe(add_labels)
)

In [3]:
# Run the project's Preprocessing transformer over the labelled frame.
# NOTE(review): the individual cleaning steps live in transforms.Preprocessing
# and are not visible from this notebook.
clean = Preprocessing()
df = clean.transform(df)

In [4]:
# Display (rows, columns) of the preprocessed frame as a quick sanity check.
df.shape

(17351, 207)

In [5]:
# Save and drop identifying info.
# `data` is the frame used for modelling from here on; `identity_df` is the
# second value returned by the helper (presumably the identifier columns that
# were removed — confirm against transforms.save_and_drop_ids).
data, identity_df = save_and_drop_ids(df)

In [7]:
# Display (rows, columns) of the modelling frame after the ID columns were split off.
data.shape

(17351, 203)

In [8]:
# Scale numerical features
cols_to_scale = Attributes().get_num_attribs()
scaler = RobustScaler()
data[cols_to_scale] = scaler.fit_transform(data[cols_to_scale])

In [10]:
# Split the data
y = data.pop('labels')
X = data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [12]:
# Balance classes before training: configure SMOTE to bring class 1 up to
# 6972 rows, using all available cores and a fixed seed.
sm = SMOTE(
    ratio={1: 6972},
    random_state=42,
    n_jobs=-1,
)

In [13]:
# Oversample the training split only; the test split is left untouched.
X_train_res, y_train_res = sm.fit_sample(X=X_train, y=y_train)

In [14]:
# Compare target and feature shapes: full data, training split, resampled training split.
print(*(target.shape for target in (y, y_train, y_train_res)))
print(*(features.shape for features in (X, X_train, X_train_res)))

(17351,) (12145,) (17981,)
(17351, 202) (12145, 202) (17981, 202)


In [15]:
# Gradient-boosted classifier with fixed hyper-parameters.
# subsample < 1 and max_features < n_features both make fitting stochastic,
# so a seed is set to make every fit (and every CV score) reproducible.
model = GradientBoostingClassifier(
        subsample=0.95,
        n_estimators=200,
        min_weight_fraction_leaf=0.01,
        min_samples_split=15,
        min_samples_leaf=30,
        min_impurity_decrease=0.01,
        max_leaf_nodes=10,
        max_features=15,
        max_depth=12,
        learning_rate=0.05,
        random_state=42
)

In [17]:
# 4-fold stratified CV. The seed only takes effect when shuffle=True; with
# shuffle=False it is ignored (and newer scikit-learn versions reject the
# combination with a ValueError), so shuffling is enabled to honour the seed.
cv_folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [18]:
# Cross-validate once and read both metrics from the same fitted folds,
# instead of re-fitting the model separately for each metric.
# NOTE(review): the folds are drawn from the already-oversampled training set,
# so validation folds contain synthetic rows; these scores are likely
# optimistic compared with the untouched test split.
cv_results = cross_validate(
    model,
    X_train_res,
    y_train_res,
    cv=cv_folds,
    scoring=['f1_weighted', 'accuracy'],
    return_train_score=False,  # only the validation scores are used below
)

# Mean validation score per metric, rounded to two decimals.
f1_score = round(cv_results['test_f1_weighted'].mean(), 2)
accuracy = round(cv_results['test_accuracy'].mean(), 2)

In [19]:
# Report the mean cross-validated accuracy and weighted F1, one per line.
print("accuracy: {}".format(accuracy), "f1: {}".format(f1_score), sep="\n")

accuracy: 0.91
f1: 0.9


In [23]:
# Fit on the full resampled training set and predict on those same rows.
# `y_pred` therefore holds *training-set* predictions; the reports built from
# it describe fit quality, not generalisation to X_test / y_test.
# (The previous bare model.score(...) call was dropped: its return value was
# discarded and never displayed, so it only cost an extra prediction pass.)
model.fit(X_train_res, y_train_res)
y_pred = model.predict(X_train_res)

In [24]:
# Per-class precision / recall / F1 of the predictions against the resampled training labels.
print(classification_report(y_true=y_train_res, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.89      0.99      0.94     11009
          1       0.98      0.80      0.88      6972

avg / total       0.92      0.92      0.92     17981



In [29]:
# Unpack the 2x2 confusion matrix row by row: first row is (tn, fp), second row is (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_train_res, y_pred)

In [30]:
# Print the four confusion-matrix counts, one per line.
print(
    "TP: {}".format(tp),
    "FP: {}".format(fp),
    "FN: {}".format(fn),
    "TN: {}".format(tn),
    sep="\n",
)

TP: 5594
FP: 105
FN: 1378
TN: 10904


In [32]:
# Pair each feature's importance with its column name as (score, name) and
# rank from most to least important. Sorting ascending and then reversing
# keeps ties in the same order as a sort() followed by reverse().
importances = sorted(
    zip(model.feature_importances_, X_train.columns),
    key=lambda pair: pair[0],
)[::-1]

In [33]:
# Show the 11 highest-ranked (score, name) pairs.
print(importances[:11])

[(0.08925136982662595, 'permits'), (0.08026366265663112, 'roof_cover_type_UNKNOWN'), (0.07528485489334469, 'half_bath_count'), (0.07282404785177449, 'bedroom_count'), (0.05029752088666872, 'room_count'), (0.04967402143113683, 'owner_occupied_Owner Occupied'), (0.04912068489368925, 'full_bath_count'), (0.03737806785453802, 'owner_occupied_Standard Rental'), (0.025613505069961948, 'exterior_wall_type_FRAME WOOD/SHAKE'), (0.02499953031537345, 'exterior_wall_type_BRICK VENEER'), (0.024260196167402007, 'heating_type_HOT WATER')]
