# Linear Regression Practical Implementaion

In [81]:
from sklearn.datasets import fetch_california_housing

In [82]:
dataset = fetch_california_housing()

In [83]:
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [84]:
dataset.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [85]:
print(dataset.target)

[4.526 3.585 3.521 ... 0.923 0.847 0.894]


In [86]:
print(dataset.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [87]:
import pandas as pd

In [88]:
df = pd.DataFrame(dataset.data,columns=dataset.feature_names)

In [89]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [90]:
df['Price'] = dataset.target

In [91]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [92]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [93]:
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Price         0
dtype: int64

In [94]:
import seaborn as sns

In [95]:
#sns.pairplot(df)


In [96]:
df_copy = df.sample(frac=0.25)

In [97]:
df_copy.shape

(5160, 9)

In [98]:
sns.pairplot(df_copy)

<seaborn.axisgrid.PairGrid at 0x7fccb6ed2970>

In [134]:
# Divide the Dataset into independent and dependent feature

# 1. Divide the Dataset into train and test

# 2. Feature Scaling-- Standradazation

# 3. Model Training

# 4. Model Fit

# 5. Coefficient and intercept

# 6. Prediction

# 7. MSE, MAE, RMSE







In [116]:
# Divide the Dataset into independent and dependent feature

X = df.iloc[:,:-1]
Y = df.iloc[:,-1]

In [117]:
## Divide the Dataset into train and test


from sklearn.model_selection import  train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)

In [118]:
X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
5088,0.9809,19.0,3.187726,1.129964,726.0,2.620939,33.98,-118.28
17096,4.2232,33.0,6.189696,1.086651,1015.0,2.377049,37.46,-122.23
5617,3.5488,42.0,4.821577,1.095436,1044.0,4.33195,33.79,-118.26
20060,1.6469,24.0,4.274194,1.048387,1686.0,4.532258,35.87,-119.26
895,3.9909,14.0,4.608303,1.08935,2738.0,2.471119,37.54,-121.96


In [119]:
X.shape

(20640, 8)

In [120]:
X_train.shape,X_test.shape

((13828, 8), (6812, 8))

In [121]:
from sklearn.preprocessing import StandardScaler

In [122]:
scaler = StandardScaler()

In [123]:
scaler.fit(X_train)

StandardScaler()

In [124]:
# scaler.transform(X_train) or
X_train = scaler.fit_transform(X_train)

In [125]:
X_train

array([[-1.52169954, -0.75868455, -0.91813082, ..., -0.04014979,
        -0.78315293,  0.65090402],
       [ 0.18205382,  0.3523991 ,  0.30748451, ..., -0.05943619,
         0.8475975 , -1.31911678],
       [-0.17232772,  1.06666717, -0.25107784, ...,  0.09515433,
        -0.87218816,  0.66087881],
       ...,
       [-0.49518065,  0.59048846, -0.59192622, ...,  0.01608523,
        -0.75972261,  0.60103008],
       [ 0.96816708, -1.07613702,  0.39144665, ...,  0.0039722 ,
         0.90383028, -1.18445713],
       [-0.68377427,  1.86029835, -0.83041358, ..., -0.08065717,
         0.9928655 , -1.41387727]])

In [126]:
X_test = scaler.transform(X_test)

In [128]:
from sklearn.linear_model import LinearRegression

In [129]:
regression = LinearRegression()

In [130]:
regression.fit(X_train,Y_train)

LinearRegression()

In [131]:
regression.coef_

array([ 8.46603472e-01,  1.20333548e-01, -2.98800785e-01,  3.47243173e-01,
       -8.81413334e-05, -4.17242067e-02, -8.94420371e-01, -8.70401317e-01])

In [132]:
regression.intercept_

2.0666362048018536

In [135]:
## prediction
Y_pred= regression.predict(X_test)

In [136]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [139]:
import numpy as np
mse = mean_squared_error(Y_test,Y_pred)
print(mse)
mae = mean_absolute_error(Y_test,Y_pred)
print(mae)
print(np.sqrt(mse))

0.536968654337246
0.5295710106684454
0.7327814505957735


In [142]:
## Accuracy r2 and adjusted r square
from sklearn.metrics import r2_score

In [148]:
score = r2_score(Y_test,Y_pred)

In [149]:
score

0.5970494128783952

In [147]:
#display adjusted R-squared
1 - (1-score)*(len(Y)-1)/(len(Y)-X.shape[1]-1)

0.5968931623477872

In [155]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=10)

ridge.fit(X_train,Y_train)

Ridge(alpha=10)

In [156]:
Y_pred = ridge.predict(X_test)


In [157]:
mse = mean_squared_error(Y_test,Y_pred)
print(mse)
mae = mean_absolute_error(Y_test,Y_pred)
print(mae)
print(np.sqrt(mse))

0.536752799648305
0.5295394602988547
0.7326341512981122


In [158]:
from sklearn.linear_model import Lasso

In [159]:
lasso = Lasso(alpha=10)

lasso.fit(X_train,Y_train)

Lasso(alpha=10)

In [160]:
Y_pred = lasso.predict(X_test)


In [161]:
mse = mean_squared_error(Y_test,Y_pred)
print(mse)
mae = mean_absolute_error(Y_test,Y_pred)
print(mae)
print(np.sqrt(mse))

1.3326257277946882
0.9126511897647483
1.15439409553007


In [162]:
from sklearn.linear_model import ElasticNet

In [164]:
elasticNet = ElasticNet(alpha=10)

elasticNet.fit(X_train,Y_train)

ElasticNet(alpha=10)

In [165]:
Y_pred = elasticNet.predict(X_test)

In [166]:
mse = mean_squared_error(Y_test,Y_pred)
print(mse)
mae = mean_absolute_error(Y_test,Y_pred)
print(mae)
print(np.sqrt(mse))

1.3326257277946882
0.9126511897647483
1.15439409553007
