In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import classification_report #for getting CM after testing
from sklearn.ensemble import GradientBoostingClassifier
from transforms import add_labels, Preprocessing, BalanceClasses, save_and_drop_ids, DFselector
from pipeline import pipe 
from attributes import Attributes

from imblearn.over_sampling import SMOTE



In [2]:
# Load the raw city CSV, drop the first character of every assessor_id,
# and pass the frame through add_labels (presumably what creates the
# 'labels' column read further down -- confirm in transforms.add_labels).
df = add_labels(
    pd.read_csv('../data/city.csv', low_memory=False)
    .assign(assessor_id=lambda frame: frame['assessor_id'].str.slice(start=1))
)

In [3]:
# Build the project's Preprocessing transformer (imported from transforms),
# constructed with no arguments.
clean = Preprocessing()

In [4]:
# Run the preprocessing transform and rebind `df` to its result.
# NOTE(review): because `df` is overwritten, re-running this cell feeds the
# already-transformed frame back into the transformer.
df = clean.transform(df)

In [5]:
# (rows, columns) of the frame after preprocessing.
df.shape

(17351, 207)

In [6]:
# Total number of missing cells across the whole frame
# (author's note: this was 32,986 previously).
df.isnull().values.sum()

0

In [7]:
# Balance classes:

In [8]:
# Number of rows whose label is 0 (the negative class).
label_counts = df['labels'].value_counts()
neg = label_counts[0]
neg

15720

In [9]:
# Number of rows whose label is 1 (the positive class).
label_counts = df['labels'].value_counts()
pos = label_counts[1]
pos

1631

In [10]:
# Fraction of positive rows in the frame before any class balancing.
pos / len(df)

0.09400034580139473

In [11]:
# Configure the project's class balancer in 'downsample' mode.
# NOTE(review): pos_percent presumably is the target positive share (0.5 = 50%)
# -- confirm against transforms.BalanceClasses.
balance = BalanceClasses(
    method='downsample',
    pos_percent=0.5,
)

In [12]:
# Echo the balancer's configuration, one value per line.
print(balance.pos_percent, balance.method, sep='\n')

0.5
downsample


In [13]:
# Apply the configured balancer to the preprocessed frame and keep the result
# under a separate name.
data = balance.transform(df)
# Alternative left by the author (presumably to skip balancing): data = df

In [14]:
# (rows, columns) of the balanced frame; the author's alternative was df.shape.
data.shape

(3262, 207)

In [15]:
# Fraction of positive rows relative to the balanced frame's row count.
pos / len(data)

0.5

In [16]:
# Show the counts the balancer recorded during transform, one per line:
# positives, negatives, and how many rows it dropped.
print(balance.pos_num, balance.neg_num, balance.num_to_drop, sep='\n')

1631
15720
14089


In [17]:
# Save and drop identifying info.
# Feed the class-balanced frame `data` (not the full `df`) so the downsampling
# done by balance.transform is what reaches the model; passing `df` here would
# discard the balancing unless the transformer mutates `df` in place.
# identity_df holds whatever save_and_drop_ids split off.
data, identity_df = save_and_drop_ids(data)

In [18]:
# (rows, columns) after the identifying columns were split off.
data.shape

(3262, 203)

In [19]:
#Split data

In [20]:
# Separate the target from the features without mutating `data`:
# data.pop('labels') removed the column in place, so re-running the cell
# raised KeyError. Selecting/dropping keeps the cell idempotent.
y = data['labels']
X = data.drop('labels', axis=1)
# Hold out 33% of the rows. A fixed random_state makes the partition (and every
# downstream score) reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [21]:
# Sanity check: target and feature matrix shapes, one per line.
print(y.shape, X.shape, sep='\n')

(3262,)
(3262, 202)


In [22]:
# num_attribs, cat_attribs = Attributes().get_attribs()

# num_pipeline = Pipeline([
#         ('selector', DFselector(num_attribs)),
#         ('std_scaler', StandardScaler())
#     ])

# cat_pipeline = Pipeline([
#         ('selector', DFselector(cat_attribs)),
#     ])


# transform_pipeline = FeatureUnion(transformer_list=[
#         ('num_pipeline', num_pipeline),
#         ('cat_pipeline', cat_pipeline),
#     ])

In [23]:
# print(len(num_attribs))
# print(len(cat_attribs))
# total = len(num_attribs) + len(cat_attribs)
# total

In [24]:
# Fit and Score model
# model = pipe.fit(X_train, y_train)

In [25]:
# Seeded, stratified 4-fold splitter. random_state only takes effect when
# shuffle=True; with shuffle=False the seed is ignored by older scikit-learn
# and rejected with a ValueError by current releases.
cv_folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Score both metrics in a single cross-validation pass so each fold is fitted
# once instead of once per metric.
cv_scores = cross_validate(pipe, X_train, y_train, cv=cv_folds,
                           scoring=['f1_weighted', 'accuracy'])

# Mean over the folds, rounded to two decimals.
f1_score = round(cv_scores['test_f1_weighted'].mean(), 2)
accuracy = round(cv_scores['test_accuracy'].mean(), 2)

In [26]:
# Mean cross-validated weighted F1 on the training split, rounded to 2 decimals.
f1_score

0.62

In [27]:
# Mean cross-validated accuracy on the training split, rounded to 2 decimals.
accuracy

0.62

In [28]:
# Fit the pipeline on the training split and display its score on those same
# rows -- an in-sample figure, not a held-out estimate.
pipe.fit(X_train, y_train).score(X_train, y_train)

0.6549199084668192

In [29]:
# Predictions for the training rows (in-sample).
# NOTE(review): X_test / y_test from the split are not used in the visible cells.
y_pred = pipe.predict(X_train)

In [30]:
# Per-class precision / recall / F1 / support of y_pred against the training labels.
print(classification_report(y_train, y_pred))

             precision    recall  f1-score   support

          0       0.67      0.65      0.66      1122
          1       0.64      0.66      0.65      1063

avg / total       0.66      0.65      0.65      2185



In [31]:
# Pair each feature-importance score from the pipeline's second step with the
# matching training column name as (score, name), ordered from highest score to
# lowest. Sorting ascending and then reversing keeps the original tie order.
# NOTE(review): assumes the step at index 1 is the fitted estimator and that the
# columns reach it unchanged in number and order -- confirm in pipeline.pipe.
importances = sorted(
    zip(pipe.steps[1][1].feature_importances_, X_train.columns),
    key=lambda pair: pair[0],
)[::-1]