In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from transforms import add_labels, Preprocessing, BalanceClasses, DFselector
from pipeline import pipe 
from attributes import Attributes



In [2]:
# Load the city CSV, drop the leading character of every `assessor_id`,
# then hand the frame to the project-local `add_labels` helper.
df = add_labels(
    pd.read_csv('../data/city.csv', low_memory=False)
    .assign(assessor_id=lambda frame: frame['assessor_id'].str.slice(start=1))
)

In [9]:
# Instantiate the project-local Preprocessing transformer (imported from `transforms`).
clean = Preprocessing()

In [10]:
# Run the preprocessing step over the labelled frame.
# NOTE: this rebinds `df` to its own transformed version, so re-running the cell
# feeds already-processed data back in; re-run the load cell first to start clean.
df = clean.transform(df)

In [11]:
# (rows, columns) of the preprocessed frame.
df.shape

(17351, 206)

In [12]:
# Total number of missing cells left after preprocessing (was 32,986 before).
df.isna().sum(axis=1).sum()

0

In [13]:
# Balance classes: inspect the class counts first, then rebalance with BalanceClasses.

In [14]:
# value_counts() is keyed by the distinct values of 'labels'; [0] takes the count for label 0.
# NOTE(review): assumes integer 0/1 labels, with 0 as the negative class. On a non-integer
# index, [0] would fall back to positional lookup. Confirm against add_labels.
neg = df['labels'].value_counts()[0]

In [15]:
# Display the count stored in `neg`.
neg

15720

In [16]:
# value_counts() is keyed by the distinct values of 'labels'; [1] takes the count for label 1.
# NOTE(review): assumes integer 0/1 labels, with 1 as the positive class. Confirm against add_labels.
pos = df['labels'].value_counts()[1]

In [17]:
# Display the count stored in `pos`.
pos

1631

In [18]:
# Share of rows carrying label 1, i.e. how imbalanced the classes are before balancing.
pos / len(df)

0.09400034580139473

In [19]:
# Instantiate the project-local BalanceClasses transformer (imported from `transforms`).
balance = BalanceClasses()

In [20]:
# Apply the class-balancing transformer to the preprocessed frame; the result is bound
# to a new name, `data`, which the following cells use for modelling.
data = balance.transform(df)

In [21]:
# (rows, columns) of the balanced frame.
data.shape

(6564, 206)

In [None]:
# Split data into features/target and train/test sets

In [None]:
# Separate the target from the features WITHOUT mutating `data`.
# The previous `data.pop('labels')` removed the column in place, so running this
# cell a second time raised KeyError; selecting and dropping keeps it re-runnable.
y = data['labels']
X = data.drop(columns='labels')

# Hold out 33% of the rows for testing.
# - random_state pins the shuffle so the split (and every downstream score) is
#   reproducible across kernel restarts.
# - stratify=y keeps the class proportions identical in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
# Sanity check: target shape on the first line, feature-matrix shape on the second.
print(y.shape, X.shape, sep='\n')

In [None]:
# Column-name lists for the numeric and categorical features, as provided by
# the project-local Attributes helper.
num_attribs, cat_attribs = Attributes().get_attribs()

# Numeric branch: pick the numeric columns, then standardise them.
num_pipeline = Pipeline(steps=[
    ('selector', DFselector(num_attribs)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: pick the categorical columns and pass them through as-is.
cat_pipeline = Pipeline(steps=[('selector', DFselector(cat_attribs))])

# Run both branches and concatenate their outputs side by side.
# NOTE(review): `transform_pipeline` is not referenced by the cells visible here
# (the model is fitted via the imported `pipe`); confirm whether it is still needed.
transform_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [None]:
# Number of numeric attributes, then number of categorical attributes, one per line.
print(len(num_attribs), len(cat_attribs), sep='\n')

# Combined attribute count, displayed as the cell's output.
total = sum(len(group) for group in (num_attribs, cat_attribs))
total

In [None]:
# Fit and Score model
# Fit the project-local `pipe` (imported from `pipeline`) on the training split only.
# NOTE(review): presumably `pipe` is an sklearn Pipeline whose fit() returns itself,
# which would make `model` an alias of `pipe`; confirm in pipeline.py.
model = pipe.fit(X_train, y_train)

In [None]:
# Cross-validate on the training split and keep the mean weighted F1,
# rounded to two decimals.
cv_folds = 3
f1_score = round(
    np.mean(
        cross_val_score(model, X_train, y_train, scoring='f1_weighted', cv=cv_folds)
    ),
    2,
)

In [None]:
# Display the rounded mean cross-validated weighted F1 computed above.
f1_score