In [3]:
from sklearn.datasets import fetch_california_housing
import pandas as pd


# Fetch the California housing data set and split it into a labelled
# feature table (X) and the regression target (y).
data = fetch_california_housing()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = data.target

# Preview the first feature rows and the target array.
display(X.head(), y)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)
(X_train.shape, X_test.shape)

((16512, 8), (4128, 8))

In [5]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor built with the library defaults (no arguments
# are passed); later cells refit this same object on differently scaled features.
knn = KNeighborsRegressor()

In [6]:

from sklearn.metrics import r2_score

# Baseline: fit kNN on the raw, unscaled features and predict on both splits.
knn.fit(X_train, y_train)
pred_train, pred_test = knn.predict(X_train), knn.predict(X_test)

# Report R2 on the data the model was fitted on and on the held-out split.
r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.45
Test R2  score 0.18


In [7]:

from sklearn.linear_model import LinearRegression

# Linear regression with default settings, fitted on the raw features.
# The fit call stays last so the cell still displays the fitted estimator.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [8]:

# Score the linear model (fitted on the raw features) on both splits.
pred_train, pred_test = lr.predict(X_train), lr.predict(X_test)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.61
Test R2  score 0.60


In [19]:

from sklearn.ensemble import  RandomForestRegressor

# Random forest with tree depth capped at 6; the seed keeps the run repeatable.
# NOTE: `tree` actually holds a whole forest; the name is kept because later
# cells refit this same object.
tree = RandomForestRegressor(max_depth=6, random_state=1)
tree.fit(X_train, y_train)
pred_train, pred_test = tree.predict(X_train), tree.predict(X_test)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')


Train  R2 score 0.72
Test R2  score 0.71


In [21]:

import numpy as np
# Print NumPy floats in fixed-point form rather than scientific notation.
np.set_printoptions(suppress=True)

### Normalization


$$x = \frac{x - min}{max - min}$$


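After this transformation, every feature of the data the scaler was fitted on has $min = 0, max = 1$; other data transformed with the same scaler (e.g. the test set) may fall outside $[0, 1]$.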

In [27]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min and max from the training split only and rescale
# that split in one fit_transform call. The fitted scaler is reused later to
# transform the test split, so it must not be refitted there.
# (The name `scaller` is kept as-is because a later cell refers to it.)
scaller = MinMaxScaler()
X_train_norm = pd.DataFrame(scaller.fit_transform(X_train),
                            columns=X_train.columns)

# Sanity check: every column should now span exactly [0, 1].
X_train_norm.describe()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,0.231956,0.541389,0.032506,0.022676,0.039882,0.003897,0.329404,0.475445
std,0.130436,0.247553,0.018406,0.014861,0.031845,0.010724,0.227318,0.199838
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.142019,0.333333,0.025438,0.019949,0.021918,0.0029,0.147715,0.253984
50%,0.209138,0.54902,0.031022,0.02121,0.032582,0.003548,0.182784,0.582669
75%,0.292396,0.705882,0.036928,0.022704,0.048404,0.004322,0.550478,0.631474
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:

# Apply the scaler fitted on the training split to the test split (transform
# only, no refit), so test values are not guaranteed to stay within [0, 1].
X_test_norm = pd.DataFrame(
    data=scaller.transform(X_test),
    columns=X_train.columns,
)
X_test_norm.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0
mean,0.234496,0.544198,0.032414,0.02244,0.039816,0.004262,0.325244,0.478847
std,0.13333,0.243659,0.013532,0.010172,0.031321,0.032298,0.225655,0.198421
min,0.0,0.0,0.002015,0.003294,5.6e-05,0.000465,0.002125,0.00498
25%,0.143938,0.333333,0.025715,0.01991,0.022142,0.002904,0.148778,0.258715
50%,0.21039,0.54902,0.03124,0.021195,0.03268,0.003552,0.180659,0.584661
75%,0.294279,0.705882,0.036861,0.022743,0.047619,0.004325,0.548353,0.631474
max,1.0,1.0,0.436515,0.408432,0.451778,2.07445,0.98831,0.982072


In [29]:
# Refit the same kNN estimator, this time on the min-max scaled features.
knn.fit(X_train_norm, y_train)
pred_train, pred_test = knn.predict(X_train_norm), knn.predict(X_test_norm)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.81
Test R2  score 0.70


In [31]:
# Refit the linear model on the min-max scaled features and score both splits.
lr.fit(X_train_norm, y_train)
pred_train, pred_test = lr.predict(X_train_norm), lr.predict(X_test_norm)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.61
Test R2  score 0.60


In [32]:

from sklearn.ensemble import  RandomForestRegressor

# Refit the random forest (stored in `tree`) on the min-max scaled features.
tree.fit(X_train_norm, y_train)
pred_train, pred_test = tree.predict(X_train_norm), tree.predict(X_test_norm)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.72
Test R2  score 0.71


### Standardization

$$x = \frac{x - mean}{std}$$
After this transformation, every feature of the data the scaler was fitted on has $mean = 0, std = 1$.

In [33]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature statistics from the training split only; the same fitted
# scaler is applied to both splits in later cells.
# The fit call stays last so the cell still displays the fitted scaler.
scaler_1 = StandardScaler()
scaler_1.fit(X=X_train)

In [38]:

# Render every float in DataFrame output with four decimal places; this is a
# global pandas option, so it affects all displays that follow.
pd.set_option('display.float_format', lambda x: '%0.4f' % x)

In [39]:
# Standardize the training split with the already-fitted scaler and wrap the
# resulting array back into a DataFrame with the original column names.
X_train_std = pd.DataFrame(
    data=scaler_1.transform(X_train),
    columns=X_train.columns,
)
X_train_std.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.7784,-2.187,-1.7662,-1.526,-1.2524,-0.3634,-1.4491,-2.3792
25%,-0.6895,-0.8405,-0.384,-0.1835,-0.5641,-0.0931,-0.7993,-1.1082
50%,-0.1749,0.0308,-0.0807,-0.0986,-0.2292,-0.0326,-0.645,0.5366
75%,0.4634,0.6645,0.2402,0.0019,0.2676,0.0396,0.9726,0.7808
max,5.8885,1.8526,52.5662,65.7683,30.1505,92.8858,2.9501,2.625


In [42]:
# Standardize the test split with the statistics learned from the training
# split (transform only), so its mean/std are only approximately 0 and 1.
X_test_std = pd.DataFrame(
    data=scaler_1.transform(X_test),
    columns=X_train.columns,
)
X_test_std.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0
mean,0.0195,0.0113,-0.005,-0.0159,-0.0021,0.034,-0.0183,0.017
std,1.0222,0.9843,0.7352,0.6845,0.9836,3.0118,0.9927,0.9929
min,-1.7784,-2.187,-1.6567,-1.3043,-1.2506,-0.3201,-1.4398,-2.3543
25%,-0.6748,-0.8405,-0.369,-0.1861,-0.5571,-0.0926,-0.7946,-1.0846
50%,-0.1653,0.0308,-0.0688,-0.0997,-0.2262,-0.0322,-0.6544,0.5465
75%,0.4778,0.6645,0.2366,0.0045,0.243,0.0398,0.9632,0.7808
max,5.8885,1.8526,21.9507,25.9592,12.9348,193.0775,2.8987,2.5353


In [43]:
# Refit the same kNN estimator on the standardized features.
knn.fit(X_train_std, y_train)
pred_train, pred_test = knn.predict(X_train_std), knn.predict(X_test_std)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.81
Test R2  score 0.70


In [44]:
# Refit the linear model on the standardized features and score both splits.
lr.fit(X_train_std, y_train)
pred_train, pred_test = lr.predict(X_train_std), lr.predict(X_test_std)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.61
Test R2  score 0.60


In [45]:
from sklearn.ensemble import  RandomForestRegressor

# Refit the random forest (stored in `tree`) on the standardized features.
tree.fit(X_train_std, y_train)
pred_train, pred_test = tree.predict(X_train_std), tree.predict(X_test_std)

r2_train = r2_score(y_train, pred_train)
print(f'Train  R2 score {r2_train:.2f}')
r2_test = r2_score(y_test, pred_test)
print(f'Test R2  score {r2_test:.2f}')

Train  R2 score 0.72
Test R2  score 0.71
