In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from transforms import add_labels, Preprocessing, BalanceClasses, DFselector
from pipeline import pipe 
from attributes import Attributes

In [30]:
# Load the raw city extract, drop the first character of every assessor_id,
# and hand the frame to the project's add_labels helper to attach labels.
df = pd.read_csv('../data/city.csv', low_memory=False)
df = add_labels(df.assign(assessor_id=df['assessor_id'].str[1:]))

In [31]:
# Show (rows, columns) of the freshly loaded and labelled frame.
df.shape

(18406, 239)

In [5]:
# Instantiate the project's Preprocessing transformer (imported from transforms)
# with its default arguments.
clean = Preprocessing()

In [6]:
# Run the preprocessing transform and rebind `df` to its result.
# NOTE(review): because `df` is overwritten, re-running this cell alone feeds
# already-cleaned data back into the transform — re-run from the load cell.
df = clean.transform(df)

In [7]:
# Show (rows, columns) again to see what preprocessing removed.
df.shape

(17351, 207)

In [8]:
# Total number of missing cells in the frame: per-column null counts, summed.
# The author's earlier reading of this figure was 32,986.
df.isnull().sum(axis=0).sum()

0

In [9]:
# Balance classes:

In [10]:
# Row count for the entry keyed 0 in the label frequency table.
# NOTE(review): value_counts() is indexed by label value, so [0] means
# "label == 0" only if labels are integer-coded — confirm what add_labels emits.
neg = df['labels'].value_counts()[0]

In [11]:
# Row count for the entry keyed 1 in the label frequency table.
# NOTE(review): value_counts() is indexed by label value, so [1] means
# "label == 1" only if labels are integer-coded — confirm what add_labels emits.
pos = df['labels'].value_counts()[1]

In [12]:
# Display the count stored in `pos`.
pos

1631

In [13]:
# Share of all rows that fall in the `pos` class.
pos / len(df)

0.09400034580139473

In [14]:
# Instantiate the project's BalanceClasses transformer (imported from
# transforms) with its default arguments.
balance = BalanceClasses()

In [15]:
# Apply the class-balancing transform to the cleaned frame; the result is kept
# under a new name, so `df` itself is not rebound here.
# NOTE(review): the resampling strategy is not visible in this notebook —
# see transforms.BalanceClasses.
data = balance.transform(df)

In [16]:
# Show (rows, columns) of the balanced frame.
data.shape

(6564, 207)

In [17]:
#Split data

In [18]:
# Separate the target column from the features WITHOUT mutating `data`.
# The previous `data.pop('labels')` removed the column in place, so running
# this cell a second time raised KeyError; selecting and dropping is idempotent.
y = data['labels']
X = data.drop(columns='labels')

# Hold out 33% of the rows for testing. A fixed random_state pins the shuffle,
# so the split (and every score computed from it) is reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [19]:
# Print the target shape and then the feature-matrix shape, one per line.
print(y.shape, X.shape, sep='\n')

(6564,)
(6564, 206)


In [22]:
# Two attribute collections supplied by the project's Attributes helper;
# presumably the numeric and categorical column names — confirm in attributes.py.
num_attribs, cat_attribs = Attributes().get_attribs()

# Numeric branch: select the numeric attributes, then standardise them.
num_pipeline = Pipeline(steps=[
    ('selector', DFselector(num_attribs)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: selection only, no further transformation.
cat_pipeline = Pipeline(steps=[
    ('selector', DFselector(cat_attribs)),
])

# Combine both branches with a FeatureUnion.
# NOTE(review): the fit cell below uses the imported `pipe`, not this object.
transform_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [25]:
# Print the size of each attribute list, then display their combined size.
print(len(num_attribs), len(cat_attribs), sep='\n')
total = sum(map(len, (num_attribs, cat_attribs)))
total

61
145


206

In [27]:
# Fit the imported `pipe` estimator on the training split and bind the result
# to `model`, which the following cell passes to cross_val_score.
# NOTE(review): assumes pipe.fit returns the fitted estimator (scikit-learn
# convention) — confirm in pipeline.py. The `transform_pipeline` built in an
# earlier cell is not referenced here.
model = pipe.fit(X_train, y_train)

In [32]:
# Cross-validate on the training split with 3 folds and keep the mean
# weighted-F1 score, rounded to two decimals.
cv_folds = 3
f1_weighted = round(
    cross_val_score(
        model, X_train, y_train, cv=cv_folds, scoring='f1_weighted'
    ).mean(),
    2,
)

In [33]:
# Display the rounded mean weighted-F1 from cross-validation.
f1_weighted

0.68