In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import confusion_matrix, classification_report #for getting CM after testing
from sklearn.ensemble import GradientBoostingClassifier
from transforms import add_labels, Preprocessing, BalanceClasses, save_and_drop_ids, DFselector
from pipeline import pipe 
from attributes import Attributes
from imblearn.over_sampling import SMOTE

In [2]:
#load and label the data
df = pd.read_csv('../data/city.csv', low_memory=False)
df['assessor_id'] = df['assessor_id'].str[1:]
df = add_labels(df)

In [3]:
clean = Preprocessing()

In [4]:
df = clean.transform(df)

In [5]:
df.shape

(17351, 207)

In [6]:
df.isnull().sum().sum() #was 32,986

0

In [None]:
# Balance classes:

In [None]:
# neg = df['labels'].value_counts()[0]
# neg

In [None]:
# pos = df['labels'].value_counts()[1]
# pos

In [None]:
# pos / (df.shape[0])

In [None]:
# balance = BalanceClasses(method='downsample', pos_percent=0.45)

In [None]:
# print(balance.pos_percent)
# print(balance.method)

In [None]:
# data = balance.transform(df)
# data = df

In [None]:
# df.shape
# data.shape

In [None]:
# pos / (data.shape[0])

In [None]:
# print(balance.pos_num)
# print(balance.neg_num)
# print(balance.num_to_drop)

In [7]:
# Save and drop identifying info
data, identity_df = save_and_drop_ids(df)

In [8]:
data.shape

(17351, 203)

In [9]:
#Split data

In [10]:
y = data.pop('labels')
X = data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [11]:
# sm = SMOTE(random_state=42, ratio={1:6972}, n_jobs=-1)

In [12]:
# X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [13]:
# print(y.shape, y_train.shape, y_train_res.shape)
# print(X.shape, X_train.shape, X_train_res.shape)

(17351,) (12145,) (17955,)
(17351, 202) (12145, 202) (17955, 202)


In [None]:
# Fit and Score model
# model = pipe.fit(X_train, y_train)

In [15]:
cv_folds = StratifiedKFold(n_splits=4, random_state=42, shuffle=False) #so I can set a seed

f1_score = round(cross_val_score(pipe, X_train, y_train, cv=cv_folds, \
scoring='f1_weighted').mean(), 2)

accuracy = round(cross_val_score(pipe, X_train, y_train, cv=cv_folds, \
scoring='accuracy').mean(), 2)

In [16]:
f1_score

0.86

In [17]:
accuracy

0.9

In [18]:
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

0.9716755866611775

In [19]:
y_pred = pipe.predict(X_train)

In [20]:
print(classification_report(y_train, y_pred))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98     10983
          1       1.00      0.70      0.83      1162

avg / total       0.97      0.97      0.97     12145



In [29]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()

In [32]:
print("TP: {}".format(tp))
print("FP: {}".format(fp))
print("FN: {}".format(fn))
print("TN: {}".format(tn))

TP: 819
FP: 1
FN: 343
TN: 10982


In [21]:
importances = [(score, name) for name, score in zip(X_train.columns, pipe.steps[1][1].feature_importances_)]

importances.sort(key=lambda tup: tup[0])
importances.reverse()

In [23]:
print(list(importances)[0:11])

[(0.05798878415977072, 'pv_net_zero_monthly_payments_1'), (0.057441255489871716, 'pv_heat_only_monthly_savings_1'), (0.05022658094622727, 'gas_usage_therm_improved_1'), (0.04920250085194109, 'pv_net_zero_monthly_cash_flow_1'), (0.04658566184312079, 'pv_net_zero_monthly_savings_1'), (0.04652130417088047, 'buidling_condition'), (0.044802965082558543, 'census_income_median'), (0.041387602814683684, 'pv_potential_kwhr_yr'), (0.03455686122490781, 'total_building_value'), (0.0274997567450404, 'census_average_household_size'), (0.023052441034260348, 'num_upgrades_parcel')]
