In [None]:
# load_boston was removed in scikit-learn 1.2; the California housing dataset is used instead.
#from sklearn.datasets import load_boston

: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

: 

In [None]:
# Fetch the California housing dataset via scikit-learn's dataset loader.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

: 

In [None]:
# Show which fields the loaded dataset object exposes.
housing.keys()

: 

In [None]:
# Feature names; these are used below as the DataFrame column labels.
print(housing.feature_names)

: 

In [None]:
# Print the descriptive text that ships with the dataset.
print(housing.DESCR)

: 

In [None]:
# Peek at the raw feature values.
print(housing.data)

: 

In [None]:
# Wrap the raw feature values in a DataFrame with named columns.
df = pd.DataFrame(housing.data, columns=housing.feature_names)

: 

In [None]:
# Append the regression target as a 'price' column (it becomes the last column,
# which the positional X/y split further down relies on).
df['price'] = housing.target

: 

In [None]:
# Preview the first rows to sanity-check the frame.
df.head()

: 

In [None]:
# Column dtypes and non-null counts.
df.info()

: 

In [None]:
# Summary statistics for each numeric column.
df.describe()

: 

: 

## Exploratory Data Analysis

In [None]:
# Pairwise correlations between all columns, including 'price'.
df.corr()

: 

In [None]:
# Scatter-plot matrix over every pair of columns; may be slow on the full frame.
sns.pairplot(df)

: 

In [None]:
# Median income against price.
fig, ax = plt.subplots()
ax.scatter(df['MedInc'], df['price'])
ax.set_xlabel('Median Income')
ax.set_ylabel('Price')
plt.show()

: 

In [None]:
# Same relationship with a fitted regression line overlaid.
sns.regplot(data=df, x='MedInc', y='price')
plt.show()

: 

In [None]:
# Average number of rooms against price.
fig, ax = plt.subplots()
ax.scatter(df['AveRooms'], df['price'])
ax.set_xlabel('Average number of rooms')
ax.set_ylabel('Price')
plt.show()

: 

In [None]:
# Average number of bedrooms against price.
fig, ax = plt.subplots()
ax.scatter(df['AveBedrms'], df['price'])
ax.set_xlabel('Average number of Bedrooms')
ax.set_ylabel('Price')
plt.show()

: 

: 

### Prepare dataset for modelling

In [None]:
# Separate the target from the features.
# 'price' was appended as the last column, so dropping it leaves exactly the
# feature columns.
y = df['price']
X = df.drop(columns='price')

: 

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

: 

In [None]:
# Scaler used below to standardize the feature columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

: 

In [None]:
# Fit the scaler on the training split only, then apply those same statistics
# to the test split so no test-set information leaks into the scaling.
# NOTE: both names are overwritten in place, so re-running this cell would
# re-fit the scaler on already-scaled data; re-run from the split cell instead.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

: 

### Why standard scaling in Linear Regression?

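Gradient-descent-based optimizers reach the global minimum faster when all independent features are on the same scale, which is why the features are standardized before fitting. Note that scikit-learn's `LinearRegression` solves ordinary least squares directly rather than by gradient descent, so scaling does not change its predictions here; it mainly makes the coefficients comparable across features and keeps the workflow ready for gradient-based or regularized models.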

### Model Training

In [None]:
from sklearn.linear_model import LinearRegression

: 

In [None]:
# Linear regression estimator with default settings.
model = LinearRegression()

: 

In [None]:
# Fit on the standardized training features.
model.fit(X_train, y_train)

: 

In [None]:
# Learned coefficients, one per standardized feature.
model.coef_

: 

In [None]:
# Learned intercept term.
model.intercept_

: 

In [None]:
# Parameters the estimator was configured with.
model.get_params()

: 

In [None]:
# Predict prices for the held-out (standardized) test features.
model_pred = model.predict(X_test)

: 

In [None]:
# Predicted vs. actual prices on the test split; points close to the diagonal
# indicate accurate predictions. Axes are labelled so the figure stands alone,
# matching the other scatter plots in this notebook.
plt.scatter(y_test, model_pred)
plt.xlabel('Actual price')
plt.ylabel('Predicted price')
plt.show()

: 

In [None]:
# Residual = actual value minus predicted value, for each test row.
residuals = y_test.sub(model_pred)

: 

In [None]:
# Inspect the per-row residuals.
residuals

: 

In [None]:
# Kernel-density estimate of the residual distribution.
sns.displot(residuals, kind='kde')

: 

In [None]:
# Residuals against predictions; a cloud with no visible structure is what a
# well-specified linear model should produce. Axes are labelled so the figure
# stands alone, matching the other scatter plots in this notebook.
plt.scatter(model_pred, residuals)
plt.xlabel('Predicted price')
plt.ylabel('Residual')
plt.show()

: 

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error metrics on the test split, printed in the order MAE, MSE, RMSE.
mae = mean_absolute_error(y_test, model_pred)
mse = mean_squared_error(y_test, model_pred)
rmse = np.sqrt(mse)
for metric in (mae, mse, rmse):
    print(metric)

: 

R square and adjusted R Square

Formula:

$$R^2 = 1 - \frac{SSR}{SST}$$

where $R^2$ = coefficient of determination, $SSR$ = sum of squares of residuals, $SST$ = total sum of squares.

$$R^2_{adj} = 1 - \frac{(1 - R^2)(n - 1)}{n - k - 1}$$

where $R^2$ = R-squared of the model, $n$ = the number of observations, $k$ = the number of predictor variables.


Note: the adjusted R² score is always less than or equal to the R² score (strictly less whenever there is at least one predictor and R² < 1).

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) on the test split.
score = r2_score(y_true=y_test, y_pred=model_pred)
score

: 

In [None]:
# Adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - k - 1), with n test rows and
# k predictor columns.
n_obs = len(y_test)
n_predictors = X_test.shape[1]
adjusted_score = 1 - (1 - score) * (n_obs - 1) / (n_obs - n_predictors - 1)
adjusted_score

: 

### Predict using new data

In [None]:
# First raw sample, kept two-dimensional (a single row) as the scaler and model expect.
housing.data[:1]

: 

In [None]:
# Scale the first raw sample with the scaler fitted on the training split.
# The scaler was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# carrying the same column names; passing a bare array makes scikit-learn warn
# that X does not have valid feature names.
# NOTE: the name `input` shadows the Python builtin; it is kept unchanged
# because the prediction cells refer to it.
input = scaler.transform(
    pd.DataFrame(housing.data[0].reshape(1, -1), columns=housing.feature_names)
)

: 

In [None]:
# Predict the price for the scaled sample.
model.predict(input)

: 

In [None]:
# Actual target value for the same row, for comparison with the prediction.
housing.target[0]

: 

: 

### Pickling the model file for deployment

In [None]:
import pickle

: 

In [None]:
# Serialize the fitted model. The context manager guarantees the file is
# flushed and closed even if pickling raises.
# NOTE: the model was trained on standardized features, so the fitted `scaler`
# must also be persisted for any deployment that receives raw inputs.
with open('regression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

: 

In [None]:
# Load the model back from disk, closing the file handle afterwards.
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
with open('regression_model.pkl', 'rb') as model_file:
    pickle_model = pickle.load(model_file)

: 

In [None]:
# Predict with the reloaded model on the same scaled sample, to compare against
# the in-memory model's prediction.
pickle_model.predict(input)

: 

: 