# Preprocessing data

In [None]:
%%capture
import nbimporter
import pandas as pd
from utils import TrainingSet, DataCleaner, CategoricalData
import matplotlib.pyplot as plt
import warnings

# Hide warnings raised from scipy modules whose text starts with
# "internal gelsd"; every other warning is still reported.
warnings.filterwarnings("ignore", message="^internal gelsd", module="scipy")

In [None]:
# Read the housing table, split it with TrainingSet (test_size=0.25,
# 5 strata on the target column), then separate the target from the rest.
# NOTE(review): Ts is presumably the training split and Xs the held-out
# split; confirm against utils.TrainingSet.stratified_test.
target_column = 'median_house_value'
df = pd.read_csv('../data/housing/housing.csv')
dt = TrainingSet(df, test_size=0.25)
Ts, Xs = dt.stratified_test(column=target_column, strata=5)
X, L = TrainingSet.training_labels(Ts, [target_column])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Preprocessing chain for the regression inputs. Steps run in list order:
# DataCleaner and CategoricalData (both pointed at 'ocean_proximity'),
# followed by StandardScaler.
tr_steps = [
    ('cleaning', DataCleaner(text_attributes=['ocean_proximity'])),
    ('categorical', CategoricalData(text_attribute='ocean_proximity')),
    ('scaling', StandardScaler()),
]
tr_pipeline = Pipeline(steps=tr_steps)

In [None]:
# Fit every step of tr_pipeline on X and keep the transformed output as T,
# which the regression models below are built on.
T = tr_pipeline.fit_transform(X)

# Fit regression models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from models import Predictor

In [None]:
# Wrap one estimator of each kind in a Predictor. All three are given the
# same transformed inputs T and the flattened values of L.
A, B, C = (
    Predictor(estimator, T, L.values.ravel())
    for estimator in (
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
    )
)

In [None]:
# Print the mse() result of each predictor on a single line, in A, B, C order.
print(*(predictor.mse() for predictor in (A, B, C)))

### Cross-validation

In [None]:
# Call cross_validate() on each predictor; results keep the A, B, C order.
scoreA, scoreB, scoreC = (
    predictor.cross_validate() for predictor in (A, B, C)
)

In [None]:
# One output line per predictor: mean and standard deviation of its scores.
for scores in (scoreA, scoreB, scoreC):
    print(scores.mean(), scores.std())

# Classification

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Classifiers to compare, stored as (display name, estimator) pairs.
methods = [
    ('Gaussian NB', GaussianNB()),
    ('Support Vector Machine', SVC()),
    ('KNN', KNeighborsClassifier(n_neighbors=20)),
]

# Preprocessing chain for the classification inputs: DataCleaner (with no
# text attributes configured) followed by StandardScaler.
kr_steps = [
    ('cleaning', DataCleaner(text_attributes=None)),
    ('scaling', StandardScaler()),
]
kr_pipeline = Pipeline(steps=kr_steps)

In [None]:
# Split Ts again, this time naming 'ocean_proximity' as the label column.
# NOTE(review): presumably Ks holds the remaining columns and O the labels;
# confirm against utils.TrainingSet.training_labels.
Ks, O = TrainingSet.training_labels(Ts, ['ocean_proximity'])
# Fit kr_pipeline on Ks and keep the transformed output as K.
K = kr_pipeline.fit_transform(Ks)

In [None]:
# Build one Predictor per entry of `methods`, in the same order. Only the
# estimator is needed here; the display name is ignored.
predictors = [
    Predictor(estimator, K, O.values.ravel())
    for _, estimator in methods
]

In [None]:
# Confusion matrix of every classifier, in the same order as `predictors`.
cms = [p.confusion_matrix() for p in predictors]

# Distinct class names in sorted order. A bare set yields strings in an order
# that can change from one interpreter run to the next, which would label
# the matrix axes inconsistently. Sorting makes the order reproducible and
# matches scikit-learn's default label ordering for confusion matrices.
# NOTE(review): assumes Predictor.confusion_matrix keeps that default
# ordering; confirm against models.Predictor.
labels = sorted(set(O.values.ravel()))

In [None]:
# Draw one confusion-matrix panel per classifier, stacked vertically.
# Panel titles and the number of rows are taken from `methods` and `cms`
# rather than repeated by hand, so they cannot drift from the models that
# were actually evaluated.
# squeeze=False always returns a 2-D axes array, so a single classifier
# would work too.
fig, axes = plt.subplots(nrows=len(cms), ncols=1, figsize=(12, 20), squeeze=False)
for panel, (title, _estimator), matrix in zip(axes[:, 0], methods, cms):
    Predictor.cm_plot(panel, labels, matrix, title, fig)
plt.tight_layout()
plt.show()