# The Data Pipeline

In [None]:
import pandas as pd
# The CSV has no header row, so the column labels are supplied explicitly.
iris_filename = 'irisdata.csv'
iris = pd.read_csv(
    iris_filename,
    header=None,
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'],
)
# Display the first rows to check that the columns were parsed as intended.
iris.head()

In [None]:
# Descriptive statistics of the iris frame, displayed as the cell output.
iris.describe()

In [None]:
iris.boxplot(return_type="axes")
# The box plot gives a visual idea of the distribution of each feature
# (one box per column drawn by DataFrame.boxplot).

In [None]:
# 10th and 90th percentiles of the numeric features.
# numeric_only=True is passed explicitly: since pandas 2.0 the default is
# numeric_only=False, so the 'target' label column (non-numeric in the standard
# iris CSV) would otherwise be included and make the call raise a TypeError.
iris.quantile([0.1, 0.9], numeric_only=True)

In [None]:
# Distinct labels present in the 'target' column.
iris['target'].unique()

In [None]:
# Contingency table: for every flower, is petal_length above its mean, and is
# petal_width above its mean? Each cell counts the rows for one True/False
# combination, which shows how often the two features are large together.
# The means are computed from the data instead of being hard-coded, so the
# thresholds stay correct if the CSV contents change.
pd.crosstab(iris['petal_length'] > iris['petal_length'].mean(),
            iris['petal_width'] > iris['petal_width'].mean())

In [None]:
# Scatter plot of petal_length (y axis) against petal_width (x axis).
scatterplot = iris.plot(
    x='petal_width',
    y='petal_length',
    kind='scatter',
    s=64,                 # point size
    c='blue',
    edgecolors='white',
)

In [None]:
# Histogram with 20 bins showing how the petal_width feature is distributed.
distr = iris['petal_width'].plot(bins=20, kind='hist')

## Building new features

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Load the California housing data: X holds the feature matrix, Y the target.
cali = datasets.fetch_california_housing()
X = cali['data']
Y = cali['target']
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every error figure computed from it, is reproducible after a
# kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Baseline: k-nearest-neighbours regression on the raw (unscaled) features.
regressor = KNeighborsRegressor()
regressor.fit(X_train, Y_train)
Y_est = regressor.predict(X_test)
# The value is computed with mean_squared_error, so it is labelled MSE
# (the previous "MAE" label did not match the metric being printed).
print("MSE =", mean_squared_error(Y_test, Y_est))

Here the error on the raw features (computed with `mean_squared_error`) is about 1.08; the exact value depends on the train/test split. Next we use Z-scores to normalize the input features and compare the regression results on this new feature set.
Z-normalization simply maps each feature to a new one with zero mean and unit variance.


In [None]:
from sklearn.preprocessing import StandardScaler
# Z-score the features. The scaler is fitted on the training split only and the
# same fitted transformation is then applied to the test split.
# NOTE(review): the variable is spelled `scalar` (not `scaler`); the name is kept
# unchanged here in case other cells refer to it.
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)
# Refit the same KNN model on the standardized features.
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_estimate = regressor.predict(X_test_scaled)
# The value is computed with mean_squared_error, so it is labelled MSE
# (the previous "MAE" label did not match the metric being printed).
print("MSE =", mean_squared_error(Y_test, Y_estimate))
#Note that we didn't use the original features; 
#we used their linear modification, which is more suitable for learning with a KNN regressor.

`RobustScaler` is less sensitive to outliers than `StandardScaler`: it centers and scales with the median and the
interquartile range (IQR) instead of the mean and standard deviation.

In [None]:
from sklearn.preprocessing import RobustScaler
# Same experiment with RobustScaler: fit on the training split, then apply the
# fitted transformation to the test split.
scaler2 = RobustScaler()
X_train_scaled = scaler2.fit_transform(X_train)
X_test_scaled = scaler2.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
# The value is computed with mean_squared_error, so it is labelled MSE
# (the previous "MAE" label did not match the metric being printed).
print("MSE=", mean_squared_error(Y_test, Y_est))

Nonlinear transformation: add the square root of the `AveOccup` feature as an extra feature

In [None]:
# Append sqrt(AveOccup) as an extra column, standardize the extended feature
# matrix, and evaluate the KNN regressor on it.
non_linear_feat = 5 # AveOccup 
# Indexing with a one-element list keeps the selection two-dimensional
# (n_samples, 1), so it can be stacked directly without assigning to .shape.
X_train_new_feat = np.sqrt(X_train[:, [non_linear_feat]])
X_train_extended = np.hstack([X_train, X_train_new_feat])
X_test_new_feat = np.sqrt(X_test[:, [non_linear_feat]])
X_test_extended = np.hstack([X_test, X_test_new_feat])
# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train_extended_scaled = scaler.fit_transform(X_train_extended)
X_test_extended_scaled = scaler.transform(X_test_extended)
regressor = KNeighborsRegressor()
regressor.fit(X_train_extended_scaled, Y_train)
Y_est = regressor.predict(X_test_extended_scaled)
# The value is computed with mean_squared_error, so it is labelled MSE
# (the previous "MAE" label did not match the metric being printed).
print("MSE=", mean_squared_error(Y_test, Y_est))