# Data preprocessing

In [1]:
import pandas
import numpy as np
import sklearn.linear_model as lm
from sklearn.model_selection import KFold
from sklearn import preprocessing as pre
import random
import matplotlib.pyplot as plt



## Reading data

In [2]:
# Load the housing dataset from the CSV file next to the notebook into a DataFrame.
housing = pandas.read_csv('./housing.csv')

## Transform ocean_proximity column to 5 binary-valued columns

In [3]:
# One-hot encode ocean_proximity: each new column holds 1 where the row's
# category equals the given label and 0 everywhere else.
proximity_labels = {
    '1h_ocean': '<1H OCEAN',
    'island': 'ISLAND',
    'inland': 'INLAND',
    'near_ocean': 'NEAR OCEAN',
    'near_bay': 'NEAR BAY',
}
for indicator_column, label in proximity_labels.items():
    housing[indicator_column] = (housing['ocean_proximity'] == label).astype('int64')

# The categorical source column is removed once its indicator columns exist.
del housing['ocean_proximity']

## Filling missing data

Missing `total_bedrooms` values are imputed with a linear regression fitted on `total_rooms`.

In [4]:
# Impute missing total_bedrooms values with a one-feature linear regression
# (total_rooms -> total_bedrooms) fitted on the rows where the value is known.
notna = housing.total_bedrooms.notna()
isna = housing.total_bedrooms.isna()

model = lm.LinearRegression()
model.fit(
    housing.total_rooms.values[notna].reshape(-1, 1),
    housing.total_bedrooms.values[notna].reshape(-1, 1),
)

# Guard so the cell can be re-run after the gaps are filled: with no missing
# rows there is nothing to predict, and predict() would receive an empty array.
if isna.any():
    missing_bedrooms = model.predict(housing.total_rooms.values[isna].reshape(-1, 1))
    # Assign through a single .loc call on the frame itself. The previous
    # `housing.total_bedrooms.loc[isna] = ...` was chained indexing, which can
    # write to a temporary copy and triggers SettingWithCopyWarning.
    housing.loc[isna, 'total_bedrooms'] = missing_bedrooms.ravel()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Feature scaling (standardisation to zero mean and unit variance)

In [None]:
import pandas as pd
from sklearn import preprocessing
import xlsxwriter

# Standardise every column of housing.xls and write the result to
# housing_normalised.xlsx, one spreadsheet row per data row.
#
# need to make sure there are only numbers and that there are no Nan/blank cells in the data
# before running this code
df = pd.read_excel('housing.xls')

standard = preprocessing.StandardScaler().fit(df)
# transform() returns a plain 2-D NumPy array (column labels are lost).
df = standard.transform(df)

workbook = xlsxwriter.Workbook('housing_normalised.xlsx')
worksheet = workbook.add_worksheet()

# Write each value to its own (row, col) cell. Iterating with enumerate works
# for any number of columns and gives every column a distinct index; the
# previous hand-unrolled version wrote both the 9th and 10th values to
# column 8, so the 9th was overwritten and column 9 stayed empty.
for row, values in enumerate(df):
    for col, value in enumerate(values):
        worksheet.write(row, col, value)

workbook.close()

In [None]:
import pandas as pd
from sklearn import preprocessing

# Standardise every column of `housing`.
# StandardScaler.transform returns a bare NumPy array, so the result is wrapped
# back into a DataFrame with the original labels and stored under a new name.
# Rebinding `housing` itself to the array would break the cells below that
# access columns by name (e.g. housing.median_house_value).
stand = preprocessing.StandardScaler().fit(housing)
housing_scaled = pd.DataFrame(
    stand.transform(housing),
    columns=housing.columns,
    index=housing.index,
)

## Data transformation using the Box-Cox transform

In [5]:
from scipy import stats

# Box-Cox transform the target and six numeric features, one call per column.
# Because alpha is given, each call returns three values: the transformed data,
# the fitted lambda and a confidence interval for that lambda. `maxlog` and
# `interval` are overwritten on every call, so after this cell they hold the
# values from the last call (median_income) only.
median_house_value_bc, maxlog, interval = stats.boxcox(
    housing['median_house_value'], alpha=0.05
)
population_bc, maxlog, interval = stats.boxcox(
    housing['population'], alpha=0.05
)
housing_median_age_bc, maxlog, interval = stats.boxcox(
    housing['housing_median_age'], alpha=0.05
)
total_rooms_bc, maxlog, interval = stats.boxcox(
    housing['total_rooms'], alpha=0.05
)
total_bedrooms_bc, maxlog, interval = stats.boxcox(
    housing['total_bedrooms'], alpha=0.05
)
households_bc, maxlog, interval = stats.boxcox(
    housing['households'], alpha=0.05
)
median_income_bc, maxlog, interval = stats.boxcox(
    housing['median_income'], alpha=0.05
)



In [6]:
# Columns whose raw values are swapped for their Box-Cox-transformed versions,
# listed in the order in which they are appended to the new frame.
boxcox_columns = {
    'housing_median_age': housing_median_age_bc,
    'total_rooms': total_rooms_bc,
    'total_bedrooms': total_bedrooms_bc,
    'population': population_bc,
    'households': households_bc,
    'median_income': median_income_bc,
    'median_house_value': median_house_value_bc,
}

# drop() returns a new frame, so `housing` itself is left unchanged; the
# transformed columns are then appended after the remaining original ones.
housing_boxcox = housing.drop(columns=list(boxcox_columns))
for column_name, transformed_values in boxcox_columns.items():
    housing_boxcox[column_name] = transformed_values


print(housing_boxcox)

       longitude  latitude  1h_ocean  island  inland  near_ocean  near_bay  \
0        -122.23     37.88         0       0       0           0         1   
1        -122.22     37.86         0       0       0           0         1   
2        -122.24     37.85         0       0       0           0         1   
3        -122.25     37.85         0       0       0           0         1   
4        -122.25     37.85         0       0       0           0         1   
5        -122.25     37.85         0       0       0           0         1   
6        -122.25     37.84         0       0       0           0         1   
7        -122.25     37.84         0       0       0           0         1   
8        -122.26     37.84         0       0       0           0         1   
9        -122.25     37.84         0       0       0           0         1   
10       -122.26     37.85         0       0       0           0         1   
11       -122.26     37.85         0       0       0           0

 ## Support Vector Machines
 


In [8]:
# The helpers used in this section come from the project-local
# `support_vector_machines` module.
import support_vector_machines

# Split housing_boxcox into the two sets that run_svr takes as its first and
# second arguments.
cross_set, eval_set = support_vector_machines.create_kfold_sets(housing_boxcox)
# NOTE(review): `c` is presumably the SVR regularisation parameter C — confirm in the module.
support_vector_machines.run_svr(cross_set, eval_set,c=6)

[training r2, test r2]
0 : 0.7508149377936468
1 : 0.7438979667070394
2 : 0.7510536760700761
3 : 0.7408688803643437
4 : 0.7501359404088919
5 : 0.7465051453183426
6 : 0.7489655963056718
7 : 0.7514399953343291
8 : 0.749475695590093
9 : 0.749653671807225
test MAE, test MSE
0 : 0.9586188043791026,1.6752601177482223
1 : 0.9634834000891607,1.6993542342855779
2 : 0.9576384950464599,1.6891384884718317
3 : 0.9337602558871324,1.6134124775499739
4 : 0.9470687588055129,1.6430926819686194
Eval r2, Eval MSA, Eval MSE
0.7581331900068221


In [9]:
# Load a dataset through the module's own loader rather than reusing housing_boxcox.
# NOTE(review): presumably the housing data without the Box-Cox transform — confirm in load_data.
hous = support_vector_machines.load_data()

# Same split-then-run sequence as for housing_boxcox, with the same c=6.
cs,es = support_vector_machines.create_kfold_sets(hous)

support_vector_machines.run_svr(cs,es,c=6)

[training r2, test r2]
0 : -0.055364972907117505
1 : -0.04800186540654128
2 : -0.053124526563827386
3 : -0.06371852891877605
4 : -0.0543719291364424
5 : -0.04396568885592211
6 : -0.05348698337892244
7 : -0.04721521881018176
8 : -0.05322854925319942
9 : -0.06875332045065874
test MAE, test MSE
0 : 87377.4445138612,13805305922.035011
1 : 89312.22159218843,14282486044.076712
2 : 85858.63919479414,13306650175.970669
3 : 87788.54895871457,13873326687.873493
4 : 91340.77572361121,14962880141.416927
Eval r2, Eval MSA, Eval MSE
-0.05434773719798991


In [10]:
# Re-run SVR on the same cs/es split with c changed from 6 to 7.
support_vector_machines.run_svr(cs,es,c=7)

AttributeError: module 'support_vector_machines' has no attribute 'run_scr'

## After data transform, please use housing_boxcox as input

In [None]:
# First, extract the data into arrays
y = housing_boxcox.median_house_value.values.reshape(-1,1)
X = housing_boxcox.drop(columns=['median_house_value'], inplace=False).values

print(X.shape)
print(y.shape)
# Pull out 1000 values into a holdout set
holdout = random.sample(range(0,10640),1000)
X_holdout = X[holdout]
y_holdout = y[holdout]
Xt = np.delete(X, holdout, 0)
yt = np.delete(y, holdout, 0)
print(Xt.shape)
print(yt.shape)

train_R2_average = 0
test_R2_average = 0
mae_average = 0
accuracy_average = 0

fold_number = 10

# Have to shuffle the data because it is grouped.
kf = KFold(n_splits=fold_number, shuffle=True)
for train_index, test_index in kf.split(Xt):
  X_train, X_test = Xt[train_index], Xt[test_index]
  y_train, y_test = yt[train_index], yt[test_index]
  Model.fit(X_train, y_train)
  predictions = Model.predict(X_test)
  errors = abs(predictions - y_test)
  mae = round(numpy.mean(errors), 2)
  mae_average += mae/fold_number
  print('Mean Absolute Error:', mae)
  mape = 100 * (errors / y_test)
  accuracy = 100 - numpy.mean(mape)
  accuracy_average += accuracy/fold_number
  print('Accuracy:', round(accuracy, 2), '%.')
  print('Training R^2: ' , Model.score(X_train, y_train))
  print('Testing R^2: ' , Model.score(X_test, y_test))
  print()
  train_R2_average += Model.score(X_train, y_train)/fold_number
  test_R2_average += Model.score(X_test, y_test)/fold_number
print('Average train accuracy: ', accuracy_average)
print('Average train mae: ', mae_average)
print('Average train R^2: ', train_R2_average)
print('Average test R^2: ', test_R2_average)