In [17]:
import pandas as pd
import numpy as np
from numpy import nan
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
# Load the California Housing dataset from 'housing.csv' (path is relative to
# the notebook's working directory) into a DataFrame
data = pd.read_csv('housing.csv')

# Display the (rows, columns) dimensions of the loaded data
data.shape

(20640, 10)

In [3]:
# Preview the first 10 rows of the California Housing dataset
# (head() returns rows in file order; no ranking is applied)
data.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [4]:
# Target: the median house value column; predictors: every remaining column
y = data['median_house_value']
X = data.drop(columns='median_house_value')

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Preprocessing pipeline for numerical features.
# Numeric columns are selected by dtype rather than by position, so the
# pipeline no longer silently depends on 'ocean_proximity' being the last
# column of X.
numeric_features = X.select_dtypes(include='number').columns
numeric_transformer = Pipeline(steps=[
    # Replace missing values with the column median learned during fit
    ('imputer', SimpleImputer(strategy='median')),
    # Standardize each column using the mean/variance learned during fit
    ('scaler', StandardScaler())])

# Preprocessing pipeline for categorical features
categorical_features = ['ocean_proximity']
# handle_unknown='ignore': categories not seen during fit are encoded as all
# zeros at transform time instead of raising an error
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combining the two preprocessing pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Steps in the main full-length pipeline (preprocessing and prediction)
steps=[
        ('preprocessor', preprocessor),
        ('lin_reg', LinearRegression())
    ]

# Defining the pipeline
lin_reg_full_pipeline = Pipeline(steps)

# Training on the training split only; the preprocessing statistics (medians,
# scaling parameters, category list) are therefore learned from X_train alone
lin_reg_full_pipeline.fit(X_train, y_train)

# Coefficient of determination (R^2) on the held-out test split.
# For a regressor, score() returns R^2 -- it is not a classification accuracy.
print("Model Score is: %.3f" % lin_reg_full_pipeline.score(X_test, y_test))

# use named_steps[] to access any step in your pipeline; here the fitted
# regressor, whose intercept and coefficients are printed
lin_reg = lin_reg_full_pipeline.named_steps['lin_reg']
print(lin_reg.intercept_, lin_reg.coef_)

Model Score is: 0.638
244072.0378847737 [-53826.84416173 -54475.14057202  13518.37781429  -9287.00275798
  28022.40204288 -44247.49335374  30178.8307068   73870.04268374
 -24863.73331799 -63516.12050232 136847.70499224 -29182.1019669
 -19285.74920503]


In [16]:
# Predict on the held-out test split and compute the mean squared error of
# those predictions; the RMSE is obtained later as np.sqrt(mse)
y_pred = lin_reg_full_pipeline.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

#### The RMSE value for this regression model is {{round(np.sqrt(mse), 3)}}. It can be read as the standard deviation of the residuals, which measure how far the actual data points are from the values predicted by the regression model.

In [31]:
# Load the sample input from 'sample.csv' (first line is the header row)
sample_X = pd.read_csv('sample.csv', header=0)

# Hard-coded reference value for the sample
# NOTE(review): presumably the actual median_house_value for this row -- confirm
sample_y = 485600

# Run the fitted pipeline on the sample and take the absolute difference
# between the reference value and the first (only used) prediction
sample_pred = lin_reg_full_pipeline.predict(sample_X)
error = np.abs(sample_y - sample_pred[0])

#### The absolute error in the above prediction is approximately {{round(error.astype(int), 3)}}.