In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Lets load the Boston House Pricing Dataset

In [3]:
from sklearn.datasets import load_boston

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [None]:
boston=load_boston()

In [None]:
boston.keys()

In [None]:
## Lets check the description of the dataset
print(boston.DESCR)

In [None]:
print(boston.data)

In [None]:
print(boston.target)

In [None]:
print(boston.feature_names)

## Preparing The Dataset

In [None]:
dataset=pd.DataFrame(boston.data,columns=boston.feature_names)

In [None]:
dataset.head()

In [None]:
dataset['Price']=boston.target

In [None]:
dataset.head()

In [None]:
dataset.info()

In [None]:
## Summarizing The Stats of the data
dataset.describe()

In [None]:
## Check the missing Values
dataset.isnull().sum()

In [None]:
### EXploratory Data Analysis
## Correlation
dataset.corr()

In [None]:
import seaborn as sns
sns.pairplot(dataset)

## Analyzing The Correlated Features

In [None]:
dataset.corr()

In [None]:
plt.scatter(dataset['CRIM'],dataset['Price'])
plt.xlabel("Crime Rate")
plt.ylabel("Price")

In [None]:
plt.scatter(dataset['RM'],dataset['Price'])
plt.xlabel("RM")
plt.ylabel("Price")

In [None]:
import seaborn as sns
sns.regplot(x="RM",y="Price",data=dataset)

In [None]:
sns.regplot(x="LSTAT",y="Price",data=dataset)

In [None]:
sns.regplot(x="CHAS",y="Price",data=dataset)

In [None]:
sns.regplot(x="PTRATIO",y="Price",data=dataset)

In [None]:
## Independent and Dependent features

X=dataset.iloc[:,:-1]
y=dataset.iloc[:,-1]

In [None]:
X.head()

In [None]:
y

In [None]:
##Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
X_train

In [None]:
X_test

In [None]:
## Standardize the dataset
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
X_train=scaler.fit_transform(X_train)

In [None]:
X_test=scaler.transform(X_test)

In [None]:
import pickle
pickle.dump(scaler,open('scaling.pkl','wb'))

In [None]:
X_train

In [None]:
X_test

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
regression=LinearRegression()

In [None]:
regression.fit(X_train,y_train)

In [None]:
## print the coefficients and the intercept
print(regression.coef_)

In [None]:
print(regression.intercept_)

In [None]:
## on which parameters the model has been trained
regression.get_params()

In [None]:
### Prediction With Test Data
reg_pred=regression.predict(X_test)

In [None]:
reg_pred

## Assumptions

In [None]:
## plot a scatter plot for the prediction
plt.scatter(y_test,reg_pred)

In [None]:
## Residuals
residuals=y_test-reg_pred

In [None]:
residuals

In [None]:
## Plot this residuals 

sns.displot(residuals,kind="kde")

In [None]:
## Scatter plot with respect to prediction and residuals
## uniform distribution
plt.scatter(reg_pred,residuals)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y_test,reg_pred))
print(mean_squared_error(y_test,reg_pred))
print(np.sqrt(mean_squared_error(y_test,reg_pred)))

## R square and adjusted R square


Formula

**R^2 = 1 - SSR/SST**


R^2	=	coefficient of determination
SSR	=	sum of squares of residuals
SST	=	total sum of squares


In [None]:
from sklearn.metrics import r2_score
score=r2_score(y_test,reg_pred)
print(score)

**Adjusted R2 = 1 – [(1-R2)*(n-1)/(n-k-1)]**

where:

R2: The R2 of the model
n: The number of observations
k: The number of predictor variables

In [None]:
#display adjusted R-squared
1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)

## New Data Prediction

In [None]:
boston.data[0].reshape(1,-1)

In [None]:
##transformation of new data
scaler.transform(boston.data[0].reshape(1,-1))

In [None]:
regression.predict(scaler.transform(boston.data[0].reshape(1,-1)))

## Pickling The Model file For Deployment

In [None]:
import pickle

In [None]:
pickle.dump(regression,open('regmodel.pkl','wb'))

In [None]:
pickled_model=pickle.load(open('regmodel.pkl','rb'))

In [None]:
## Prediction
pickled_model.predict(scaler.transform(boston.data[0].reshape(1,-1)))