## Boston Dataset with polynomial

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score


In [7]:
df = pd.read_csv(r"..\DataSets\Boston.csv")

In [8]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [9]:
X = df[['lstat']]
y = df['medv']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [11]:
lr = LinearRegression()

In [12]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_score(y_test, y_pred)

0.5279301917445975

### Polynomial Features

In [13]:
ploy = PolynomialFeatures(degree=2, include_bias=False).set_output(transform='pandas')
X_ploy_trn = ploy.fit_transform(X_train)
X_ploy_tst = ploy.transform(X_test)

lr.fit(X_ploy_trn, y_train)
y_pred = lr.predict(X_ploy_tst)

In [17]:
len(y_test), len(y_pred)

(152, 152)

In [16]:
r2_score(y_test, y_pred)

0.6431149312301052

In [18]:
#Degree 3
ploy = PolynomialFeatures(degree=3, include_bias=False)
ploy.set_output(transform='pandas')
X_ploy_trn = ploy.fit_transform(X_train)
X_ploy_tst = ploy.transform(X_test)

lr.fit(X_ploy_trn, y_train)
y_pred = lr.predict(X_ploy_tst)

r2_score(y_test, y_pred)

0.6464637609380741

In [19]:
ploy = PolynomialFeatures(degree=4, include_bias=False)
ploy.set_output(transform='pandas')
X_ploy_trn = ploy.fit_transform(X_train)
X_ploy_tst = ploy.transform(X_test)

lr.fit(X_ploy_trn, y_train) 
y_pred = lr.predict(X_ploy_tst)

r2_score(y_test, y_pred)

0.6397924587030076

In [20]:
df.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'black', 'lstat', 'medv'],
      dtype='object')

Taking Two Variable `rad` and `lstat` 

In [21]:
X = df[['lstat', 'rad']]
y = df['medv']

# Split the data into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

# Create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# r2 score
r2_score(y_test, y_pred)

0.5260520682885452

In [22]:
poly = PolynomialFeatures(degree=2, include_bias=False).set_output(transform='pandas')
X_ploy_trn = poly.fit_transform(X_train)
X_ploy_tst = poly.transform(X_test)

# Print  X_ploy_trn columns 
print(X_ploy_trn.columns)

# Fit the model
model = LinearRegression()
model.fit(X_ploy_trn, y_train)
 
# Make predictions
y_pred = model.predict(X_ploy_tst)

# r2 score
r2_score(y_test, y_pred)

Index(['lstat', 'rad', 'lstat^2', 'lstat rad', 'rad^2'], dtype='object')


0.6833321681968938

In [None]:
#y_pred = model.predict(X_ploy_tst)
#0.6586721124390901

For all the columns


In [53]:
ploy = PolynomialFeatures(degree=2, include_bias=False).set_output(transform='pandas')

In [54]:
X = df.drop('medv', axis=1)
y = df['medv']

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [56]:
x_train_poly = ploy.fit_transform(X_train)

In [57]:
x_test_poly = ploy.transform(X_test)

In [62]:
lr = LinearRegression()
lr.fit(x_train_poly, y_train)

In [63]:
y_pred = lr.predict(x_test_poly)

In [64]:
#Print R2 score
print('R2 score: ', r2_score(y_test, y_pred))

R2 score:  0.6950286613899879


# Pipelines

In [66]:
X = df.drop(['lstat', 'rad'], axis=1)
y = df['medv']

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

ploy = PolynomialFeatures(degree=3, include_bias=False).set_output(transform='pandas')
lr = LinearRegression()

In [68]:
pipe = Pipeline([('POLY', poly), ('LR', lr)])
pipe.fit(X_train, y_train)

In [70]:
pipe.named_steps

{'PLOY': PolynomialFeatures(degree=3, include_bias=False),
 'LR': LinearRegression()}

In [79]:
pipe.__str__()

"Pipeline(steps=[('PLOY', PolynomialFeatures(degree=3, include_bias=False)),\n                ('LR', LinearRegression())])"

In [80]:
pipe.__dict__

{'steps': [('PLOY', PolynomialFeatures(degree=3, include_bias=False)),
  ('LR', LinearRegression())],
 'memory': None,
 'verbose': False}

In [72]:
pipe.n_features_in_

12

In [73]:
pipe.feature_names_in_

array(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'tax',
       'ptratio', 'black', 'medv'], dtype=object)

In [69]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.9999135381026583