# Polynomial Regression 

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
from sklearn.datasets import load_boston

In [7]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): the warning printed below says this dataset has an ethical problem and that
# the scikit-learn maintainers strongly discourage its use; it names the California housing
# and Ames housing datasets as alternatives. Newer scikit-learn releases may no longer
# provide load_boston at all -- confirm against the installed version.
boston = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

## Splitting data into features and target

In [8]:
# With a regular CSV file, the inputs would be sliced out of the DataFrame, e.g.
# x = df.iloc[row_start:row_end, col_start:col_end].values

# With a dataset bundled in sklearn, the inputs and the target are attributes of the loaded object
features, target = boston.data, boston.target

In [9]:
# y is the regression target; .shape shows one value per sample
y = target #output data
y.shape

(506,)

In [10]:
# x is the feature matrix; .shape shows (samples, features)
x = features #input data
x.shape

(506, 13)

## Splitting data into training and testing data

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split the data into training (80%) and testing (20%) sets.
# The split is random, so random_state is fixed: without it every score below changes
# on each run and the notebook cannot be reproduced after a kernel restart.
# (The outputs saved in this notebook came from an unseeded run, so re-running will
# produce slightly different numbers once, and the same numbers from then on.)
train_input, test_input, train_output, test_output = train_test_split(
    x, y, test_size=0.20, random_state=42
)

In [13]:
# Training inputs: (rows, features) after the 80/20 split
train_input.shape

(404, 13)

In [14]:
# Test inputs: (rows, features) after the 80/20 split
test_input.shape

(102, 13)

In [15]:
# Training targets: one value per training row
train_output.shape

(404,)

In [16]:
# Test targets: one value per test row
test_output.shape

(102,)

# Choose ML algorithm

In [17]:
# Use linear regression when the target values are continuous rather than class labels,
# for example predicting salary, sales, student marks, stock market prices, etc.

### Here we will use Polynomial Regression

### Transforming data (Using Polynomial before applying regression)

In [18]:
# Simple regression => simple linear regression and multiple linear regression
# If the r2_score is not close to 1, then linear regression is not a good fit for the dataset
# In that case we have to use a different regression model (here: polynomial regression)

In [19]:
# Polynomial Regression = Polynomial Features + Linear Regression
# The role of PolynomialFeatures is to transform the input data

In [20]:
from sklearn.preprocessing import PolynomialFeatures

In [21]:
# Degree-2 polynomial expansion of the inputs
poly = PolynomialFeatures(degree=2)
poly_x_train = poly.fit_transform(train_input) # Transforming x (input data), here train_input; the transformer is fitted on the training data

In [22]:
poly_x_train.shape

(404, 105)

### Now after transforming train_input data we can use Linear regression (Training Model)

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
lr = LinearRegression() # Initializing the linear regression model (trained on the polynomial features below)

In [25]:
lr.fit(poly_x_train,train_output) # Training the linear regression model on the polynomial-transformed training data

LinearRegression()

In [26]:
from sklearn.metrics import r2_score,mean_squared_error #testing model accuracy
pred_train = lr.predict(poly_x_train) # Predictions on the training set (the same data the model was fitted on)
pred_train

array([20.6930415 , 23.6581257 , 21.06322443, 22.32209045, 48.54161787,
       12.21099192, 36.45241225, 21.1039474 , 30.74155462, 16.08465916,
       20.60205317, 21.52912486, 22.32050937, 15.73946315, 12.73871499,
       44.64164591, 18.44041938, 12.23791862, 25.30872148, 16.23223454,
       13.2331475 , 12.87222344, 26.22274822, 35.86635786, 15.1322574 ,
       22.32731271, 20.62273693, 17.3646481 , 11.76976115, 23.69609922,
       24.60297728, 49.02461439, 27.72601062, 31.65680027, 16.87957692,
       40.03245473, 20.97564119, 34.50768816, 28.72660124, 14.67006934,
       33.45040643, 15.00686419, 18.25875926, 21.63218021, 31.41995686,
       15.15639961, 26.29266602, 23.24514222, 19.08529377, 34.99443167,
       16.42653817, 35.36771852, 29.31458229, 25.61130196, 31.66979361,
       52.20419037, 17.52417904, 17.57542837, 44.24207479, 26.96084428,
       19.80561978, 16.25611031, 42.26486677, 34.46499455, 43.76984358,
       20.41403371, 21.76890701, 24.2191385 , 26.31307018, 18.94

In [27]:
score_train = r2_score(train_output,pred_train) # R^2 of the polynomial regression model on the training data
score_train

0.9442339776049897

## (Testing model)

In [28]:
# Just like the training data, the test input must be transformed.
# transform() (not fit_transform()) is used so the transformer fitted on the training data is reused.
poly_x_test = poly.transform(test_input)

In [29]:
pred_test = lr.predict(poly_x_test) # Predictions on the held-out test set
score_test = r2_score(test_output,pred_test) # R^2 on the test set
score_test

0.7541666347924117

In [30]:
# Doubts
# 1. Why is score_train higher than score_test?
# 2. How can I improve my score_test? (I am using the default degree = 2 for the polynomial features)
# 3. When I increase the degree, the score_train goes negative

## Analysis

In [31]:
# If the r2 score is low on both the test data and the train data, then the model is underfitted
# If the r2 score goes negative on the test data, then the model is overfitted; it also means the actual values and predicted values differ a lot

In [32]:
# If features are more than 5 then use Polynomial regression
# If features are very high then use tensorflow
# If features are less than 5 then use Linear regression

In [33]:
# Mean squared error of the model on the training data
mean_squared_error(train_output,pred_train)

4.9089827293506705

In [34]:
# Mean squared error of the model on the test data
mean_squared_error(test_output,pred_test)

17.200976867402517

## Using Feature Scaling to improve the performance of our Polynomial Regression model

## Trying StandardScaler

In [35]:
### Option
# Implement feature scaling to try to improve score_test
# MinMaxScaler is commonly used for image data
# StandardScaler is used for other types of data

In [36]:
from sklearn.preprocessing import StandardScaler #improving performance using feature scaling

In [37]:
# Feature scaling is only used to transform the data; it is not the model being trained
# Feature scaling is a method used to normalize the range of independent variables or features of data.
std = StandardScaler() # Initializing the scaler

### Transforming data using Feature Scaling (StandardScaler)

In [38]:
# Scaling may improve our Polynomial Regression model
# The scaler is fitted on the training data; the test data is transformed with that same fitted scaler
std_train = std.fit_transform(train_input)
std_test = std.transform(test_input)

### Transforming data using the PolynomialFeatures class

In [39]:
# Expand the standardized features into degree-2 polynomial terms.
# The dedicated poly2 transformer is fitted and used here; previously poly2 was created
# but never used and the earlier `poly` instance was refitted instead, which silently
# changed the state of the transformer belonging to the unscaled model.
poly2 = PolynomialFeatures(degree=2)
poly2_train = poly2.fit_transform(std_train)
poly2_test = poly2.transform(std_test)

### Now we can apply Linear Regression

In [40]:
# Separate linear regression instance for the StandardScaler variant
lr2 = LinearRegression()

In [41]:
# Train on the polynomial features built from the standardized training inputs
lr2.fit(poly2_train,train_output)

LinearRegression()

In [42]:
pred2_train = lr2.predict(poly2_train) # Predictions on the training set (StandardScaler variant)
pred2_train

array([20.91992188, 23.72729492, 21.13208008, 22.34448242, 49.03491211,
       12.22631836, 36.67944336, 20.96948242, 30.9621582 , 16.28222656,
       20.75317383, 21.88867188, 22.66040039, 15.55444336, 12.67382812,
       44.84399414, 19.60595703, 12.36791992, 25.06347656, 16.48657227,
       13.47216797, 12.90307617, 26.41552734, 35.83203125, 15.23388672,
       21.70263672, 20.21826172, 17.40185547, 11.92456055, 23.89550781,
       24.80126953, 49.86376953, 27.69482422, 31.3828125 , 17.00097656,
       39.890625  , 20.59423828, 33.76586914, 28.78515625, 15.64526367,
       33.88793945, 14.73657227, 18.31396484, 21.70556641, 31.85083008,
       15.13330078, 26.19580078, 23.27001953, 19.06005859, 35.59521484,
       16.92285156, 35.37670898, 29.71850586, 25.35668945, 31.61035156,
       49.97314453, 17.68286133, 17.38208008, 44.0690918 , 26.88842773,
       20.04833984, 16.02099609, 42.72021484, 34.1706543 , 43.08642578,
       20.65087891, 21.89013672, 23.76904297, 26.31518555, 20.01

In [43]:
# Predictions on the test set (StandardScaler variant)
predic2_test = lr2.predict(poly2_test)
predic2_test

array([26.18359375, 19.70996094, 21.80639648, 26.95166016, 23.79394531,
       31.8972168 , 24.85229492, 19.93334961, 24.71582031, 15.81958008,
       18.03710938, 11.59521484, 28.71582031, 23.73486328, 18.23706055,
        3.0546875 , 18.54663086, 31.16503906, 17.86328125,  8.42480469,
       19.74121094, 20.27636719, 26.73120117, 23.58300781,  8.62524414,
       15.25927734, 18.51171875, 15.38623047, 11.85913086, 17.07006836,
       24.90234375, 12.22265625, 29.24536133, 30.87451172,  8.00439453,
       19.07299805, 22.28613281, 27.60205078, 30.84619141, 16.58618164,
       13.86376953, 17.28295898, 14.26025391, 17.78564453, 25.86401367,
       27.71533203, 29.11865234, 25.38330078, 25.57763672, 32.30517578,
       21.6003418 , 17.38745117, 28.23120117, 30.25048828, 32.41162109,
       19.88427734, 22.91064453, 24.79516602, 18.24291992, 17.50830078,
       28.53442383, 21.27319336, 17.39501953, 19.49438477, 20.11450195,
       20.18725586, 26.04003906, 27.33544922, 16.46362305, 22.87

In [44]:
# R^2 on the test set for the StandardScaler + polynomial model.
# Baseline to compare against: score_test from the unscaled model
# (0.7541666347924117 in the saved outputs of this notebook).
score2_test = r2_score(test_output,predic2_test) # saved output (0.7538...) is essentially unchanged from the baseline
score2_test

0.753818509604233

In [45]:
# R^2 on the training set for the StandardScaler + polynomial model.
# Stored under its own name (matching score2_test) so that the baseline score_train
# from the unscaled model is not overwritten and stays available for comparison.
score2_train = r2_score(train_output,pred2_train)
score2_train

0.9457836683730921

## Trying MinMaxScaler

In [46]:
from sklearn.preprocessing import MinMaxScaler #improving performance using feature scaling

In [47]:
minmax_scaler = MinMaxScaler() # Initializing the MinMaxScaler

### Transforming data using Feature Scaling (MinMaxScaler)

In [48]:
# Fit the scaler on the training data, then apply that same fitted scaler to the test data
minmax_scaler_train = minmax_scaler.fit_transform(train_input)
minmax_scaler_test = minmax_scaler.transform(test_input)

### Transforming data using the PolynomialFeatures class

In [49]:
# Expand the min-max scaled features into degree-2 polynomial terms.
# The dedicated poly3 transformer is fitted and used here; previously poly3 was created
# but never used and the earlier `poly` instance was refitted instead, which silently
# changed the state of a transformer belonging to another model.
poly3 = PolynomialFeatures(degree=2)
poly3_train = poly3.fit_transform(minmax_scaler_train)
poly3_test = poly3.transform(minmax_scaler_test)

### Now we can apply Linear Regression

In [50]:
# Separate linear regression instance for the MinMaxScaler variant
lr3 = LinearRegression()

In [51]:
# Train on the polynomial features built from the min-max scaled training inputs
lr3.fit(poly3_train,train_output)

LinearRegression()

In [52]:
pred3_train = lr3.predict(poly3_train) # Predictions on the training set (MinMaxScaler variant)
pred3_train

array([20.91785468, 23.64719438, 21.06627923, 22.27901071, 49.06895994,
       12.2488822 , 36.75690794, 21.05093911, 31.04506502, 16.30763677,
       20.71877858, 21.84936325, 22.55728653, 15.53891461, 12.65591945,
       44.81125682, 19.53541357, 12.38396929, 25.05907976, 16.52587189,
       13.39174075, 12.92099605, 26.39956128, 35.7587966 , 15.25299193,
       21.65732273, 20.17125944, 17.34635733, 11.94390355, 23.8400201 ,
       24.71508952, 49.81649068, 27.65275502, 31.41162046, 16.98211277,
       39.81047307, 20.40300425, 33.79256567, 28.7969134 , 15.5783048 ,
       33.91426435, 14.74641548, 18.29607786, 21.71470958, 31.82367395,
       15.13103543, 26.19647815, 23.20292748, 19.07964701, 35.60503659,
       16.88150725, 35.44131428, 29.66014144, 25.260668  , 31.52910394,
       49.88240036, 17.72723761, 17.35750626, 44.06675163, 26.88280598,
       19.98382167, 16.03183715, 42.76145416, 34.17838023, 43.11662851,
       20.62450309, 21.9100008 , 23.82381873, 26.32040325, 19.99

In [53]:
# Predictions on the test set (MinMaxScaler variant)
predic3_test = lr3.predict(poly3_test)
predic3_test

array([26.17374769, 19.72762456, 21.78295473, 26.92225724, 23.74754038,
       31.85512054, 24.85311963, 19.91731026, 24.68621965, 15.83646514,
       18.06964884, 11.58365066, 28.70357444, 23.65599127, 18.27345352,
        2.97824489, 18.48343818, 31.19801269, 17.93474187,  8.3686581 ,
       19.78817878, 20.18912846, 26.75627855, 23.51987838,  8.64784006,
       15.25874744, 18.49984381, 15.49247876, 11.83372446, 17.03954964,
       24.93887091, 12.23111479, 29.09810223, 30.92508145,  7.95939395,
       19.00400103, 22.20815693, 27.6804525 , 30.85195964, 16.60698724,
       13.84976479, 17.27395516, 14.27911434, 17.80091579, 25.86626956,
       27.48707315, 29.16069353, 25.24980752, 25.58799603, 32.26266744,
       21.54735987, 17.4519266 , 28.17089299, 30.26715989, 32.38999448,
       19.87420466, 22.90864902, 24.75638119, 18.25230595, 17.47668929,
       28.50037988, 21.30310747, 17.42562531, 19.4870397 , 20.08008127,
       20.22637485, 26.04554496, 27.37373823, 16.47093065, 22.81

In [54]:
# R^2 on the test set for the MinMaxScaler + polynomial model.
# Baseline to compare against: score_test from the unscaled model
# (0.7541666347924117 in the saved outputs of this notebook).
score3_test = r2_score(test_output,predic3_test) # saved output (0.7552...) differs from the baseline only in the third decimal place
score3_test

0.7551938387815483

In [55]:
# R^2 on the training set for the MinMaxScaler + polynomial model.
# Stored under its own name (matching score3_test) so that the baseline score_train
# from the unscaled model is not overwritten and stays available for comparison.
score3_train = r2_score(train_output,pred3_train)
score3_train

0.9458112923253474

## Conclusion

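### In this case the scores after applying StandardScaler and MinMaxScaler are essentially the same as each other and as the unscaled model (test R² ≈ 0.75), so feature scaling gives no meaningful improvement here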

In [57]:
# What is the difference between StandardScaler and MinMaxScaler?

In [58]:
# MinMaxScaler rescales each feature into the range 0 to 1 (often used for image data)
# StandardScaler rescales each feature to mean = 0 and variance = 1 (used for types of data other than images)