### Basic EDA
- splitting datasets into train/test
- using KNeighborsRegressor
- using Mean Squared Error to compare estimated results to test results
- normalize data using StandardScaler() and RobustScaler()
- create new features

In [2]:
# Building New Features
import numpy as np

# https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/datasets
from sklearn import datasets 

# used to split datasets for machine learning
# https://scikit-learn.org/stable/modules/cross_validation.html
# NOTE: train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20
from sklearn.model_selection import train_test_split

# The MSE is a measure of the quality of an estimator
# https://en.wikipedia.org/wiki/Mean_squared_error
from sklearn.metrics import mean_squared_error

In [3]:
# fetch_california_housing is exposed directly on sklearn.datasets; going
# through the california_housing submodule fails on newer scikit-learn
# releases, where that submodule is no longer part of the public API
cali = datasets.fetch_california_housing()
cali

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to C:\Users\Nate\scikit_learn_data


{'DESCR': 'California housing dataset.\n\nThe original database is available from StatLib\n\n    http://lib.stat.cmu.edu/datasets/\n\nThe data contains 20,640 observations on 9 variables.\n\nThis dataset contains the average house value as target variable\nand the following input variables (features): average income,\nhousing average age, average rooms, average bedrooms, population,\naverage occupation, latitude, and longitude in that order.\n\nReferences\n----------\n\nPace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,\nStatistics and Probability Letters, 33 (1997) 291-297.\n\n',
 'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.     

In [5]:
# pull the feature matrix and the target vector out of the dataset bunch
X, Y = cali['data'], cali['target']

In [6]:
# split cali dataset X,Y into train & test datasets
# train_size=0.8 means 80% of the rows are used to train the model
# and the remaining rows are held out to test it
# random_state pins the shuffle so this split -- and every later cell that
# reuses X_train / X_test / Y_train / Y_test -- is reproducible on re-run
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=42)

In [7]:
# https://scikit-learn.org/stable/modules/neighbors.html
from sklearn.neighbors import KNeighborsRegressor

In [8]:
# k-nearest-neighbours regressor created with the library defaults
# (no arguments are passed)
regressor = KNeighborsRegressor()

In [9]:
# build KNN Regressor model here: fit it on the training split only
regressor.fit(X_train, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [10]:
# use the fitted model to predict target values for the test rows
Y_est = regressor.predict(X_test)

In [11]:
# mean_squared_error returns the MSE, so label the value as MSE (not MAE)
print("MSE = ", mean_squared_error(Y_test, Y_est))

MSE =  1.1151279812969894


In [23]:
# use train_test_split to split dataset
# pass in X_train and Y_train to fit the regressor
# use regressor with X_test to estimate/predict Y_est
# use MSE to compare Y_est to Y_test

def KN_Predict(X, Y, random_state=None):
    """Fit a default KNeighborsRegressor on a train/test split and score it.

    Parameters
    ----------
    X : feature matrix, passed straight to train_test_split.
    Y : target values aligned row-for-row with X.
    random_state : optional seed forwarded to train_test_split. The default
        (None) keeps the original behaviour of drawing a new random split on
        every call; pass an int to make a run repeatable.

    Returns
    -------
    The mean squared error (MSE) between the held-out targets and the
    model's predictions for the held-out rows.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=0.8, random_state=random_state
    )
    regressor = KNeighborsRegressor()
    regressor.fit(X_train, Y_train)
    Y_est = regressor.predict(X_test)
    return mean_squared_error(Y_test, Y_est)

In [27]:
n = 10 # iterations

# KN_Predict returns a mean squared error, so these are MSE values (not MAE);
# each call draws a new random split, so run the model n times and average
mse_scores = [KN_Predict(cali['data'], cali['target']) for _ in range(n)]

np.mean(mse_scores)

1.1159417094115245

In [28]:
# normalize X values using z-normalization
# https://scikit-learn.org/stable/modules/preprocessing.html
from sklearn.preprocessing import StandardScaler

In [29]:
# StandardScaler instance (z-normalization of the feature columns)
scaler = StandardScaler()

In [30]:
# use train_test_split to split dataset
# scale X data using StandardScaler()
# pass in X_train and Y_train to fit the regressor
# use regressor with X_test to estimate/predict Y_est
# use MSE to compare Y_est to Y_test

def KN_Predict_Scaled(X, Y, random_state=None):
    """Score a default KNeighborsRegressor on StandardScaler-normalized features.

    Parameters
    ----------
    X : feature matrix, passed straight to train_test_split.
    Y : target values aligned row-for-row with X.
    random_state : optional seed forwarded to train_test_split. The default
        (None) keeps the original behaviour of drawing a new random split on
        every call; pass an int to make a run repeatable.

    Returns
    -------
    The mean squared error (MSE) between the held-out targets and the
    model's predictions for the scaled held-out rows.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=0.8, random_state=random_state
    )
    # Use a scaler owned by this call instead of a module-level one, so the
    # function neither depends on nor re-fits an object that other cells
    # may rebind.
    feature_scaler = StandardScaler()
    # Fit on the training rows only, then apply the same fitted transform to
    # the test rows.
    X_train_scaled = feature_scaler.fit_transform(X_train)
    X_test_scaled = feature_scaler.transform(X_test)
    regressor = KNeighborsRegressor()
    regressor.fit(X_train_scaled, Y_train)
    Y_est = regressor.predict(X_test_scaled)
    return mean_squared_error(Y_test, Y_est)

In [31]:
n = 10 # iterations

# KN_Predict_Scaled returns a mean squared error, so these are MSE values
# (not MAE); each call draws a new random split, so run n times and average
mse_scores = [KN_Predict_Scaled(cali['data'], cali['target']) for _ in range(n)]

np.mean(mse_scores)

0.41477224514685007

In [32]:
# MSE (mean squared error) is the average of the squared differences
# between the estimates and the actuals; the lower it is, the better
# your ML model predicts the data

In [33]:
from sklearn.preprocessing import RobustScaler

In [35]:
# RobustScaler is better suited to datasets that have large outliers
# open question: how do we determine systematically whether the dataset has large outliers?
# in practice, try both scalers and keep the one that gives the better (lower) MSE
scaler2 = RobustScaler()

In [36]:
# use train_test_split to split dataset
# scale X data using RobustScaler()
# pass in X_train and Y_train to fit the regressor
# use regressor with X_test to estimate/predict Y_est
# use MSE to compare Y_est to Y_test

def KN_Predict_RobustScaled(X, Y, random_state=None):
    """Score a default KNeighborsRegressor on RobustScaler-normalized features.

    Parameters
    ----------
    X : feature matrix, passed straight to train_test_split.
    Y : target values aligned row-for-row with X.
    random_state : optional seed forwarded to train_test_split. The default
        (None) keeps the original behaviour of drawing a new random split on
        every call; pass an int to make a run repeatable.

    Returns
    -------
    The mean squared error (MSE) between the held-out targets and the
    model's predictions for the scaled held-out rows.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=0.8, random_state=random_state
    )
    # Use a scaler owned by this call instead of a module-level one, so the
    # function neither depends on nor re-fits shared notebook state.
    feature_scaler = RobustScaler()
    # Fit on the training rows only, then apply the same fitted transform to
    # the test rows.
    X_train_scaled = feature_scaler.fit_transform(X_train)
    X_test_scaled = feature_scaler.transform(X_test)
    regressor = KNeighborsRegressor()
    regressor.fit(X_train_scaled, Y_train)
    Y_est = regressor.predict(X_test_scaled)
    return mean_squared_error(Y_test, Y_est)

In [37]:
n = 10 # iterations

# KN_Predict_RobustScaled returns a mean squared error, so these are MSE
# values (not MAE); each call draws a new random split, so run n times and average
mse_scores = [KN_Predict_RobustScaled(cali['data'], cali['target']) for _ in range(n)]

np.mean(mse_scores)

0.4229417948807613

In [59]:
non_linear_feat = 5 # column index 5 (0-based, i.e. the sixth column) - AveOccup

# new feature is the square root of the AveOccup column (all rows);
# reshape(-1, 1) turns the 1-D result into a single-column 2-D array
# without assigning to the array's .shape attribute in place
X_train_new_feat = np.sqrt(X_train[:, non_linear_feat]).reshape(-1, 1)

# add new feature to the X_train ndarray using horizontal stacking
# this works as long as the arrays have the same number of rows
X_train_extended = np.hstack([X_train, X_train_new_feat])

# do the same steps for X_test
X_test_new_feat = np.sqrt(X_test[:, non_linear_feat]).reshape(-1, 1)
X_test_extended = np.hstack([X_test, X_test_new_feat])

In [60]:
# new, unfitted StandardScaler for the extended feature matrix
# (rebinds the `scaler` name created in an earlier cell)
scaler = StandardScaler()

In [61]:
# fit the scaler on the extended training features only, then apply the
# same fitted transform to the extended test features
X_train_extended_scaled = scaler.fit_transform(X_train_extended)
X_test_extended_scaled = scaler.transform(X_test_extended)

In [62]:
# new, unfitted KNN regressor with library defaults for the extended
# feature set (rebinds the `regressor` name from the earlier cells)
regressor = KNeighborsRegressor()

In [63]:
# fit on the scaled training features that include the sqrt(AveOccup) column
regressor.fit(X_train_extended_scaled, Y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [64]:
Y_est = regressor.predict(X_test_extended_scaled)
# mean_squared_error returns the MSE, so label the value as MSE (not MAE)
print("MSE = ", mean_squared_error(Y_test, Y_est))

MSE =  0.3504576041648188
