In [1]:
from sklearn.datasets import fetch_california_housing

## Flow
1. Divide the dataset into dependent and independent feature
2. Divide dataset into train and test
3. Feature scaling - standardization(z-score)
4. Model Train
5. Model fit
6. Coefficients and Intercept
7. Prediction
8. MSE, MAE, RMSE
9. R2 and Adjusted R2

In [2]:
dataset = fetch_california_housing()

In [3]:
import pandas as pd

In [4]:
df= pd.DataFrame(dataset.data, columns = dataset.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
df['price'] = dataset.target

In [6]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:
#descriptive analysis
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [8]:
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
price         0
dtype: int64

In [9]:
import seaborn as sns

In [10]:
#sns.pairplot(df)

In [11]:
# Divide the dataset into independent and dependent
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [12]:
# Train Test Split
from sklearn.model_selection import train_test_split

X_train , X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state = 42)

In [13]:
X.shape

(20640, 8)

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [15]:
# scaler.fit - it is calulation standard deviation & Mean
# scaler.transform - it is applying z score

X_train = scaler.fit_transform(X_train)


In [16]:
X_train

array([[-1.52169954, -0.75868455, -0.91813082, ..., -0.04014979,
        -0.78315293,  0.65090402],
       [ 0.18205382,  0.3523991 ,  0.30748451, ..., -0.05943619,
         0.8475975 , -1.31911678],
       [-0.17232772,  1.06666717, -0.25107784, ...,  0.09515433,
        -0.87218816,  0.66087881],
       ...,
       [-0.49518065,  0.59048846, -0.59192622, ...,  0.01608523,
        -0.75972261,  0.60103008],
       [ 0.96816708, -1.07613702,  0.39144665, ...,  0.0039722 ,
         0.90383028, -1.18445713],
       [-0.68377427,  1.86029835, -0.83041358, ..., -0.08065717,
         0.9928655 , -1.41387727]])

In [17]:
X_test = scaler.transform(X_test)

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
regression = LinearRegression()

In [20]:
regression.fit(X_train,y_train)

In [21]:
regression.coef_

array([ 8.46603472e-01,  1.20333548e-01, -2.98800785e-01,  3.47243173e-01,
       -8.81413334e-05, -4.17242067e-02, -8.94420371e-01, -8.70401317e-01])

In [22]:
regression.intercept_

2.0666362048018536

In [23]:
## Predict
y_pred = regression.predict(X_test)


In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [25]:
import numpy as np
mse = mean_squared_error(y_test,y_pred)
print(mse)
mae = mean_absolute_error(y_test, y_pred)
print(mae)
print(np.sqrt(mse))

0.5369686543372458
0.5295710106684453
0.7327814505957734


In [26]:
## Accuracy r squared and adjusted r square

from sklearn.metrics import r2_score

score = r2_score(y_test,y_pred)
score

0.5970494128783954

In [27]:
#Adjusted r squared error

1 - (1-score)*(len(y)-1)/(len(y)-X.shape[1]-1)

0.5968931623477876

In [41]:
# Ridge Regression

from sklearn.linear_model import Ridge

In [42]:
ridge = Ridge(alpha = 20.0)
ridge.fit(X_train,y_train)


In [43]:
y_pred = ridge.predict(X_test)

In [44]:
import numpy as np
mse = mean_squared_error(y_test,y_pred)
print(mse)
mae = mean_absolute_error(y_test, y_pred)
print(mae)
print(np.sqrt(mse))

0.5365662547609757
0.5295269970529024
0.732506829156545


In [45]:
# Lasso 
from sklearn.linear_model import Lasso

In [48]:
lasso = Lasso(alpha = 20.0)
lasso.fit(X_train,y_train)
y_pred = lasso.predict(X_test)

In [49]:
import numpy as np
mse = mean_squared_error(y_test,y_pred)
print(mse)
mae = mean_absolute_error(y_test, y_pred)
print(mae)
print(np.sqrt(mse))

1.3326257277946882
0.9126511897647483
1.15439409553007


In [50]:
# Elastic

from sklearn.linear_model import ElasticNet

In [52]:
elastic = ElasticNet(alpha = 20.0)
elastic.fit(X_train, y_train)
y_pred = elastic.predict(X_test)

In [53]:
import numpy as np
mse = mean_squared_error(y_test,y_pred)
print(mse)
mae = mean_absolute_error(y_test, y_pred)
print(mae)
print(np.sqrt(mse))

1.3326257277946882
0.9126511897647483
1.15439409553007
