In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import r2_score

In [3]:
df = pd.read_csv('Boston.csv')

In [4]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B-1000,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [5]:
df.shape

(506, 14)

In [6]:
#separate the other attributes from the predicting attribute
x = df.drop('MEDV',axis=1)
#separte the predicting attribute into Y for model training 
y = df['MEDV']

In [7]:
objList = x.select_dtypes(include = "object").columns
print (objList)

Index([], dtype='object')


In [8]:
#Label Encoding for object to numeric conversion
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for feat in objList:
    x[feat] = le.fit_transform(x[feat].astype(str))

print (x.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B-1000   506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 51.5 KB
None


In [9]:
# importing train_test_split from sklearn
from sklearn.model_selection import train_test_split
# splitting the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [10]:
x_test.shape

(102, 13)

In [11]:
# importing module
from sklearn.linear_model import LinearRegression
# creating an object of LinearRegression class
LR = LinearRegression()
# fitting the training data
LR.fit(x_train,y_train)

LinearRegression()

In [12]:
pred = LR.predict(x_test) #pred is my predicted value # y_test is my actual value

#calculating mse

mse = np.mean((pred - y_test)**2)

#r2=lreg.score(X_test,y_test) #inbuilt score function to calculate the R2 value

r2=LR.score(x_train,y_train)

#lets print it

print(mse)
print(r2)
r2R=LR.score(x_test,y_test)
print(r2R)
print(y_test)
print(y_train)

24.291119474973755
0.7508856358979673
0.6687594935356288
173    23.6
274    32.4
491    13.6
72     22.8
452    16.1
       ... 
412    17.9
436     9.6
411    17.2
86     22.5
75     21.4
Name: MEDV, Length: 102, dtype: float64
477    12.0
15     19.9
332    19.4
423    13.4
19     18.2
       ... 
106    19.5
270    21.1
348    24.5
435    13.4
102    18.6
Name: MEDV, Length: 404, dtype: float64


In [13]:
LR.coef_

array([-1.13055924e-01,  3.01104641e-02,  4.03807204e-02,  2.78443820e+00,
       -1.72026334e+01,  4.43883520e+00, -6.29636221e-03, -1.44786537e+00,
        2.62429736e-01, -1.06467863e-02, -9.15456240e-01,  1.23513347e-02,
       -5.08571424e-01])

In [14]:
LR.intercept_

30.246750993923555

In [15]:
from sklearn.linear_model import Ridge

## training the model

ridgeReg = Ridge(alpha= 2)# alpha is used for regularisation..!!0 to infinity..

ridgeReg.fit(x_train,y_train)

pred = ridgeReg.predict(x_test)

mse = np.mean((pred - y_test)**2)

print("mse for Ridge", mse)

train_r2R=ridgeReg.score(x_train,y_train)

print("Training R2 for Ridge", train_r2R)

test_r2R=ridgeReg.score(x_test,y_test)

print("Testing R2 for Ridge", test_r2R)

mse for Ridge 24.610934218227264
Training R2 for Ridge 0.7468342571619662
Testing R2 for Ridge 0.6643984101512583


In [16]:
ridgeReg.coef_

array([-1.07929630e-01,  3.32009818e-02, -5.14687975e-03,  2.40440104e+00,
       -6.59317640e+00,  4.45017330e+00, -1.42269994e-02, -1.29718750e+00,
        2.44806300e-01, -1.18331792e-02, -7.97017204e-01,  1.27473461e-02,
       -5.31622574e-01])

In [17]:
from sklearn.linear_model import Lasso

lassoReg = Lasso(alpha=0.1, normalize=True, max_iter=100)

lassoReg.fit(x_train,y_train)

pred = lassoReg.predict(x_test)

# calculating mse

mse = np.mean((pred - y_test)**2)

print(mse)

lasso= lassoReg.score(x_test,y_test)

print(lasso)

30.617500233710192
0.5824911941775328


In [18]:
Polynomial regression

SyntaxError: invalid syntax (Temp/ipykernel_19632/329432963.py, line 1)

In [19]:
print(x_train.shape)

(404, 13)


In [20]:
from sklearn.preprocessing import PolynomialFeatures
#applying polynomial regression degree 2
poly = PolynomialFeatures(degree=2, include_bias=True)


x_train_trans = poly.fit_transform(x_train)
x_test_trans = poly.fit_transform(x_test)

print(x_train_trans.shape)
print(x_test_trans.shape)

(404, 105)
(102, 105)


In [21]:
#include bias parameter
lr = LinearRegression()
lr.fit(x_train_trans, y_train)
y_pred = lr.predict(x_test_trans)
print(y_pred.shape)
print(y_test.shape)
print(x_test_trans.shape)
#x_test_trans.shape
print(r2_score(y_test, y_pred))


#r2=LR.score(x_test_trans,y_test)

#print(r2)
#print(lr.score(y_test, y_pred))

(102,)
(102,)
(102, 105)
0.8065890301302907


# Pipeline

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def polynomial_regression(degree):
    #X_new=np.linspace(-3, 3, 200).reshape(200, 1)
    X_new_poly = poly.transform(x)
    polybig_features = PolynomialFeatures(degree=degree, include_bias=True)
    std_scaler = StandardScaler()
    lin_reg = LinearRegression()
    polynomial_regression = Pipeline([
            ("poly_features", polybig_features),
            ("std_scaler", std_scaler),
            ("lin_reg", lin_reg),
        ])
    polynomial_regression.fit(x, y)
    
    y_newbig = polynomial_regression.predict(x)
       
    print(y_newbig.shape)
    
    print(r2_score(y, y_newbig))
    

In [28]:
polynomial_regression(2)

(506,)
0.9242352744054649


In [29]:
from sklearn.model_selection import GridSearchCV
#pipe = Pipeline([('Regressor' , LinearRegression())])

pipe = Pipeline([("PolynomialFeatures",PolynomialFeatures()),("std_scaler",StandardScaler()),("Regressor", LinearRegression())])

# Create param grid.

param_grid = [
    {'PolynomialFeatures' : [PolynomialFeatures()],
    'PolynomialFeatures__degree': list(range(1,10,1))},
    {'Regressor' : [LinearRegression()]},
    {'Regressor' : [Ridge()],
    'Regressor__alpha' : list(range(1,10,1))},
    {'Regressor' : [Lasso()],
    'Regressor__alpha' : list(range(0,5,1))}]

In [30]:
# Create grid search object

reg = GridSearchCV(pipe, param_grid = param_grid, cv = 3,verbose=True)

# Fit on data

best_reg = reg.fit(x, y)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [31]:
best_reg.best_estimator_.get_params()['Regressor']

Lasso(alpha=1)

In [32]:
print('Model accuracy is',best_reg.score(x, y))

Model accuracy is 0.7207000417838495


In [33]:
table = pd.DataFrame(best_reg.cv_results_)

In [None]:
table.to_excel("abc.xlsx")