# Polynomial Regression 

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [20]:
from sklearn.datasets import load_boston

In [46]:
boston = load_boston()

## splitting data

In [47]:
#Normal csv file splitting data
#x = df.iloc[starting row index:ending row index,starting column index:ending column index].values

#Splitting data when using inbuilt dataset from sklearn
features = boston.data
target = boston.target

In [48]:
y = target #output data
y.shape

(506,)

In [49]:
x = features #input data
x.shape

(506, 13)

## Spliting data into training and testing data

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
#Spliting data into training and testing data
#Spliting happens randomly so the accuracy changes everytime you run this line
train_input,test_input,train_output,test_output = train_test_split(x,y,test_size=.20)

In [52]:
train_input.shape

(404, 13)

In [53]:
test_input.shape

(102, 13)

In [54]:
train_output.shape

(404,)

In [55]:
test_output.shape

(102,)

# Choose ML algorithm

In [56]:
# Use Linear regression when all the values are continues not as a label
# for example predicting salary, sales,student marks, stock market prediciton etc.

### Here we will use Polynomial Regression

### Transforming data (Using Polynomial before applying regression)

In [57]:
# SimpleRegression => Simple Linear regression and multi Linear regression
# If r2_score is not close to 1 then Linear regression is not good for the the dataset
# Here now we have to use different Regression Model (Polynomial Regression)

In [58]:
# Polynomial Regression = Polynomial Features + Linear Regression 
#It's function is to transform data

In [59]:
from sklearn.preprocessing import PolynomialFeatures

In [60]:
poly = PolynomialFeatures(degree=2) 
poly_x_train = poly.fit_transform(train_input) # Tranforming x (input data) OR here in this case train_input

In [61]:
poly_x.shape

NameError: name 'poly_x' is not defined

### Now after transforming train_input data we can use Linear regression (Training Model)

In [62]:
from sklearn.linear_model import LinearRegression

In [63]:
lr = LinearRegression() # Initializing Linear regression

In [64]:
lr.fit(poly_x_train,train_output) # training the Linear regression model after polynomial transformation of data

LinearRegression()

In [65]:
from sklearn.metrics import r2_score,mean_squared_error #testing model accuracy
pred_train = lr.predict(poly_x_train) # Running Predictions on train dataset
pred_train

array([21.07478905, 17.74334621, 22.39944649, 15.38422012, 19.41930723,
       30.7045331 , 18.09601545, 31.80078745, 21.41993856, 31.70519352,
       46.43050075, 46.44360542, 25.0648849 , 27.91509724, 20.94106054,
       24.5342803 , 49.71924782, 19.88693023, 20.91808176, 26.78137755,
       36.39626598, 12.64433169, 34.29053569, 22.53566742, 13.32833886,
       18.70858884, 18.66699553, 37.43954825, 18.19873381, 18.52424145,
       13.6552453 , 28.61743569, 30.40082049, 25.30676532,  8.94958591,
       12.35798335, 17.32198215, 25.22123694, 30.66875434, 16.09969258,
       14.81765413, 27.48261094, 31.14892006, 17.06038547, 18.38656974,
       25.25455999, 24.55961823, 11.48654127, 32.03584599, 14.60023284,
       24.78943682, 21.07302856, 26.50235677, 18.81902146, 29.11787558,
       14.05969477, 34.81720781, 13.6966784 , 49.03512263,  9.68800211,
       21.66463947, 15.62257242, 13.43766189, 20.67768979, 12.90292358,
       31.71448874, 22.33394766, 14.55894852, 52.59150457, 34.62

In [66]:
score_train = r2_score(train_output,pred_train) # scoring our Polynomial regression model
score_train

0.9074659993092982

## (Testing model)

In [67]:
# Just like what we did with our training data we need to transform the test input 
poly_x_test = poly.transform(test_input)

In [68]:
pred_test = lr.predict(poly_x_test) #Running prediction on test dataset
score_test = r2_score(test_output,pred_test)
score_test

0.7113441675773511

In [69]:
# Doubts
# 1. Why is the score_train is higher than score_test
# 2. How can I improve my score_test (I am using the default degree = 2 while using polynomial)
# 3. By increasing the degree the score_train goes negative

## Analysis

In [None]:
# If r2 score goes low in test data and train data then the model is underfitted
# If r2 score goes negative in test data then the model is overfitted also actual values and predicted vales have a lot of difference

In [None]:
# If features are more than 5 then use Polynomial regression
# If features are very high then use tensorflow
# If features are less than 5 then use Linear regression

In [70]:
# so here we can find the mean squared error of the model on train data
mean_squared_error(train_output,pred_train)

7.859458774175747

In [71]:
# so here we can find the mean squared error of the model on test data
mean_squared_error(test_output,pred_test)

23.63885651588584

## Using Feature Scaling to improve the performance of our Polynomial Regression model

## Trying StandardScalar

In [75]:
### Option 
# implement Feature scaling to improve score_test
# minmaxScalar is used in image scaling
# StandarScaling is used for other types of data

In [84]:
from sklearn.preprocessing import StandardScaler #improving performance using feature scaling

In [101]:
# This Feature Scaling is only used to transform data and not for actualing model training
# Feature scaling is a method used to normalize the range of independent variables or features of data.
std = StandardScaler() #initializing

### Transforming data using Feature Scaling (StandardScaler)

In [105]:
#This can improve our Polynomial Regression model
std_train = std.fit_transform(train_input)
std_test = std.transform(test_input)

### Transforming data using Polynomial Function class

In [106]:
poly2 = PolynomialFeatures()
poly2_train = poly.fit_transform(std_train)
poly2_test = poly.transform(std_test)

### Now we can apply Linear Regression

In [107]:
lr2 = LinearRegression()

In [108]:
lr2.fit(poly2_train,train_output)

LinearRegression()

In [109]:
pred2_train = lr2.predict(poly2_train) # Running Predictions on train dataset
pred2_train

array([21.97906682, 18.22321645, 23.42584986, 13.41701063, 18.76177797,
       31.82370238, 18.9961868 , 32.22289749, 20.44488679, 32.96345895,
       45.24892974, 44.49382069, 24.22829617, 28.25630157, 21.31008354,
       23.6494378 , 52.04113537, 18.68009715, 22.05888542, 26.30002634,
       36.39612761, 17.36463301, 32.97284141, 24.0578862 , 12.05717358,
       15.79499151, 19.73282873, 34.40831341, 17.34724996, 18.05985855,
       12.25094486, 28.30214527, 29.32364077, 24.03581905,  9.06712678,
        8.19796503, 17.25515613, 26.02814497, 31.3066702 , 17.93872867,
       14.7864666 , 26.65711467, 27.03644458, 19.76244562, 17.97459125,
       22.50615831, 23.07000689, 10.09743417, 32.89158547,  9.23608375,
       23.11341509, 20.07168328, 27.34770637, 18.41913814, 29.73176034,
       15.57966815, 35.03244998,  9.62398503, 49.14847163, 11.5681914 ,
       21.44574498, 19.27333546, 13.67283824, 19.95455844, 11.23528921,
       32.08428768, 21.10699747, 17.16038873, 54.32577227, 34.72

In [115]:
predic2_test = lr2.predict(poly2_test)
predic2_test

array([12.05003104, 18.58262108, 24.53098294, 15.21959002, 19.74951108,
       23.06676967, 20.32821   , 17.52325867, 21.73265815,  5.48388502,
        9.70244668, 38.07458684, 52.24905115, 35.86533707, 22.83879839,
       16.39265666, 38.4707319 , 23.73707742, 16.62368671, 32.28887903,
       20.12944675, 35.80642178, 34.80992314, 25.27908475, 21.80854262,
       24.8398611 , 17.83564289, 19.37487131, 15.12838568, 18.20239964,
        8.19558984, 18.26054517, 26.13544587, 25.75884664, 25.62721241,
       30.83415384, 19.78977743, 16.25095409,  8.7669474 ,  8.7176217 ,
       11.88615757, 42.75664418, 32.19216283, 11.01617849, 28.02735502,
        9.09166085, 37.92017805, 18.38648582, 13.05523938, 15.92799948,
       26.96216122, 18.54873669, 40.75993291, 23.24311045, 23.12590533,
       24.20943405, 32.30203621, 18.10889953, 21.59632372, 31.12001042,
       14.60959687, 47.34533183, 24.01554754, 24.90200794, 19.81886678,
       16.11888534, 44.03239212, 35.96632846, 21.18569741, 21.41

In [116]:
# 0.7113441675773511 was our previous score test
score2_test = r2_score(test_output,predic2_test) #here we can see an improvement in our score_test
score2_test

0.7926112315869682

In [118]:
score_train = r2_score(train_output,pred2_train)
score_train

0.9407361986620222

## Trying MinMaxScalar

In [122]:
from sklearn.preprocessing import MinMaxScaler #improving performance using feature scaling

In [123]:
minmax_scaler = MinMaxScaler() # initialization of minmaxScalar

### Transforming data using Feature Scaling (MinMaxScaler)

In [125]:
minmax_scaler_train = std.fit_transform(train_input)
minmax_scaler_test = std.transform(test_input)

### Transforming data using Polynomial Function class

In [126]:
poly3 = PolynomialFeatures()
poly3_train = poly.fit_transform(minmax_scaler_train)
poly3_test = poly.transform(minmax_scaler_test)

### Now we can apply Linear Regression¶

In [127]:
lr3 = LinearRegression()

In [128]:
lr3.fit(poly3_train,train_output)

LinearRegression()

In [129]:
pred3_train = lr3.predict(poly3_train) # Running Predictions on train dataset
pred3_train

array([21.97906682, 18.22321645, 23.42584986, 13.41701063, 18.76177797,
       31.82370238, 18.9961868 , 32.22289749, 20.44488679, 32.96345895,
       45.24892974, 44.49382069, 24.22829617, 28.25630157, 21.31008354,
       23.6494378 , 52.04113537, 18.68009715, 22.05888542, 26.30002634,
       36.39612761, 17.36463301, 32.97284141, 24.0578862 , 12.05717358,
       15.79499151, 19.73282873, 34.40831341, 17.34724996, 18.05985855,
       12.25094486, 28.30214527, 29.32364077, 24.03581905,  9.06712678,
        8.19796503, 17.25515613, 26.02814497, 31.3066702 , 17.93872867,
       14.7864666 , 26.65711467, 27.03644458, 19.76244562, 17.97459125,
       22.50615831, 23.07000689, 10.09743417, 32.89158547,  9.23608375,
       23.11341509, 20.07168328, 27.34770637, 18.41913814, 29.73176034,
       15.57966815, 35.03244998,  9.62398503, 49.14847163, 11.5681914 ,
       21.44574498, 19.27333546, 13.67283824, 19.95455844, 11.23528921,
       32.08428768, 21.10699747, 17.16038873, 54.32577227, 34.72

In [130]:
predic3_test = lr3.predict(poly3_test)
predic3_test

array([12.05003104, 18.58262108, 24.53098294, 15.21959002, 19.74951108,
       23.06676967, 20.32821   , 17.52325867, 21.73265815,  5.48388502,
        9.70244668, 38.07458684, 52.24905115, 35.86533707, 22.83879839,
       16.39265666, 38.4707319 , 23.73707742, 16.62368671, 32.28887903,
       20.12944675, 35.80642178, 34.80992314, 25.27908475, 21.80854262,
       24.8398611 , 17.83564289, 19.37487131, 15.12838568, 18.20239964,
        8.19558984, 18.26054517, 26.13544587, 25.75884664, 25.62721241,
       30.83415384, 19.78977743, 16.25095409,  8.7669474 ,  8.7176217 ,
       11.88615757, 42.75664418, 32.19216283, 11.01617849, 28.02735502,
        9.09166085, 37.92017805, 18.38648582, 13.05523938, 15.92799948,
       26.96216122, 18.54873669, 40.75993291, 23.24311045, 23.12590533,
       24.20943405, 32.30203621, 18.10889953, 21.59632372, 31.12001042,
       14.60959687, 47.34533183, 24.01554754, 24.90200794, 19.81886678,
       16.11888534, 44.03239212, 35.96632846, 21.18569741, 21.41

In [131]:
# 0.7113441675773511 was our previous score test
score3_test = r2_score(test_output,predic3_test) #here we can see an improvement in our score_test
score3_test

0.7926112315869682

In [132]:
score_train = r2_score(train_output,pred3_train)
score_train

0.9407361986620222

## Conclusion

### The improvement in this case after applying StandardScaler and Minmax Scaler is the same