### IMPORTING THE LIBRARIES WE USE IN THIS NOTEBOOK

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor 
from sklearn.dummy import DummyRegressor

np.random.seed(86089106)

### LOADING THE DATA INTO A DATAFRAME

In [2]:
df = pd.read_csv('model4_dataset.csv') # you may need to edit this path depending on where you saved the data
df.head(3)

Unnamed: 0,input,target
0,16.243454,8803.949866
1,-6.117564,-657.602516
2,-5.281718,-210.705031


### CONDUCTING AN INITIAL EXPLORATION OF DATA

In [3]:
df.shape 

(1000, 2)

In [4]:
df.isnull().sum()

input     0
target    0
dtype: int64

In [5]:
df.dtypes

input     float64
target    float64
dtype: object

In [6]:
df.describe()

Unnamed: 0,input,target
count,1000.0,1000.0
mean,0.388125,193.899404
std,9.81495,8197.207914
min,-30.537644,-56008.552989
25%,-6.001604,-395.254172
50%,0.412926,24.164536
75%,7.039989,736.823944
max,39.586027,125615.034876


### Incorporate data splitting (cross-validation) to train the model (80/20 split)

In [7]:
from sklearn.model_selection import train_test_split

X = df[['input']]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) # 80% training and 20% test

y_train

382    -7251.253424
994     4414.314652
982      129.776296
47     19066.376344
521     -359.458755
           ...     
767      -60.344258
72       165.028511
908    11944.896437
235    -3583.404216
37       -17.496463
Name: target, Length: 800, dtype: float64

### SAVING THE TEST AND TRAINING DATA TO CSV FILES

In [8]:
train = X_train.copy() # if you don't use copy(), then the original X_train will be modified when you add the y_train column
train['target'] = y_train
train.head(3)

Unnamed: 0,input,target
382,-15.387932,-7251.253424
994,12.911889,4414.314652
982,1.751677,129.776296


In [9]:
train.to_csv('ex_01_train.csv', index=False)

In [10]:
test = X_test.copy() # if you don't use copy(), then the original X_test will be modified when you add the y_test column
test['target'] = y_test
test.head(3)

Unnamed: 0,input,target
507,0.896864,-141.859686
818,-3.260613,87.151159
452,6.743961,670.791378


In [11]:
test.to_csv('ex_01_test.csv', index=False)

## ANALYSING THE FIT WITH GRADIENT DESCENT MODEL

### LOADING THE DATA

In [12]:
train = pd.read_csv("ex_01_train.csv")
test = pd.read_csv("ex_01_test.csv")

In [13]:
target = 'target'
predictors = list(train.columns)
predictors.remove(target) 

In [14]:
print(predictors)

['input']


In [15]:
# arrrange this data into X_train, X_test, y_train, and y_test...
X_train = train[predictors]
y_train = train[target]
X_test = test[predictors]
y_test = test[target]

In [16]:
y_train

0      -7251.253424
1       4414.314652
2        129.776296
3      19066.376344
4       -359.458755
           ...     
795      -60.344258
796      165.028511
797    11944.896437
798    -3583.404216
799      -17.496463
Name: target, Length: 800, dtype: float64

In [17]:
results = pd.DataFrame(y_train, columns=["actual"])

rmses = pd.DataFrame({"model": [], "rmse": []})

In [18]:
#Stochastic Gradient Descent:
# https://scikit-learn.org/stable/modules/sgd.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor


# eta0 = learning rate
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg = SGDRegressor(max_iter=100, penalty=None, eta0=0.01)
sgd_reg.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg.n_iter_}")

results["SGD_preds"] = sgd_reg.predict(X_train)

Number of iterations = 11


In [19]:
#SGD Test RMSE
SGD_test_pred = sgd_reg.predict(X_test)
SGD_test_rmse = np.sqrt(mean_squared_error(y_test, SGD_test_pred))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD", 'rmse': SGD_test_rmse}, index=[0])])

print(f"SGD Test RMSE: {SGD_test_rmse:.3f}")

SGD Test RMSE: 4333.890


## ANALYSING THE FIT WITH LINEAR REGRESSION USING THIRD ORDER POLYNOMIAL 

In [20]:
new_df = pd.DataFrame()
new_df['x1'] = X_train
new_df['x2'] = X_train**2 
new_df['x3'] = X_train**3                               
new_df['y'] = y_train

new_df.head(4)

Unnamed: 0,x1,x2,x3,y
0,-15.387932,236.788465,-3643.684911,-7251.253424
1,12.911889,166.716878,2152.629832,4414.314652
2,1.751677,3.068373,5.3748,129.776296
3,21.002551,441.107164,9264.375866,19066.376344


In [21]:
X1 = new_df[['x1', 'x2','x3']]
y1 = new_df['y']

print(X1.shape)
print(y1.shape)


(800, 3)
(800,)


In [22]:
X1

Unnamed: 0,x1,x2,x3
0,-15.387932,236.788465,-3643.684911
1,12.911889,166.716878,2152.629832
2,1.751677,3.068373,5.374800
3,21.002551,441.107164,9264.375866
4,-5.396816,29.125619,-157.185595
...,...,...,...
795,2.373327,5.632681,13.368194
796,1.600371,2.561186,4.098848
797,18.035890,325.293321,5866.954503
798,-12.331207,152.058675,-1875.067049


In [23]:
new_df.shape

(800, 4)

In [24]:
lin_reg = LinearRegression()

lin_reg = LinearRegression().fit(new_df[['x1', 'x2', 'x3']],new_df[['y']])

#lin_reg = LinearRegression().fit(X1,y1)

# note: the double square brackets are important! 

# see pandas_dataframe_demo.ipynb for more details on how to work with pandas dataframes

In [25]:
b0 = lin_reg.intercept_[0]
b1 = lin_reg.coef_[0][0]
b2 = lin_reg.coef_[0][1]
b3 = lin_reg.coef_[0][2]

r2 = lin_reg.score(X1, y1)

In [26]:
print(f"y = {b0:.2f} + {b1:.2f}x + + {b2:.2f}x^2")
print(f"R^2: {lin_reg.score(X1, y1):.3f}") 
# for more on fstrings see here...
# https://www.freecodecamp.org/news/python-f-strings-tutorial-how-to-use-f-strings-for-string-formatting/

y = -0.99 + -1.15x + + 1.03x^2
R^2: 1.000


In [27]:
new_df2 = pd.DataFrame()
new_df2['x1'] = X_test
new_df2['x2'] = X_test**2 
new_df2['x3'] = X_test**3                               # here we have engineered a new feature
new_df2['y'] = y_test

new_df.head(3)

Unnamed: 0,x1,x2,x3,y
0,-15.387932,236.788465,-3643.684911,-7251.253424
1,12.911889,166.716878,2152.629832,4414.314652
2,1.751677,3.068373,5.3748,129.776296


In [28]:
X2 = new_df2[['x1', 'x2','x3']]
y2 = new_df2['y']

print(X2.shape)
print(y2.shape)


(200, 3)
(200,)


In [29]:
lin_reg_test_pred = lin_reg.predict(X2)
lin_test_rmse = np.sqrt(mean_squared_error(y2, lin_reg_test_pred))

rmses = pd.concat([rmses, pd.DataFrame({'model':"lin_reg_3rd_order", 'rmse': lin_test_rmse}, index=[0])])

print(f"lin_test_rmse: {lin_test_rmse:.3f}")

lin_test_rmse: 102.176


## ANALYSING THE FIT WITH SGD POLYNOMIAL TRANSFORMATION

In [30]:
from sklearn.preprocessing import PolynomialFeatures

# Create second degree terms and interaction terms
poly_features = PolynomialFeatures(degree=2).fit(X_train)
X_train_poly = poly_features.transform(X_train)
X_test_poly = poly_features.transform(X_test)

#This will create the polynomial terms of the categorical variables too (since they are encoded as numbers)

#if degree=3, then it creates all combinations: a, a^2, a^3, b, b^2, b^3, a.b, a^2.b, a.b^2, a^2.b^2 

In [31]:
X_train_poly

array([[  1.        , -15.38793246, 236.78846531],
       [  1.        ,  12.91188903, 166.71687828],
       [  1.        ,   1.75167729,   3.06837334],
       ...,
       [  1.        ,  18.03588981, 325.29332141],
       [  1.        , -12.33120735, 152.05867482],
       [  1.        ,   2.34415698,   5.49507194]])

In [32]:

poly_lin_reg = SGDRegressor(max_iter=1000, penalty=None, eta0=0.01) 
poly_lin_reg.fit(X_train_poly, np.ravel(y_train))

print(f"Number of iterations = {poly_lin_reg.n_iter_}")

results["SGD_preds_ using polynomial"] = poly_lin_reg.predict(X_train_poly)

Number of iterations = 22


In [33]:
# Train RMSE
# SGD with polynomial input
poly_test_pred = poly_lin_reg.predict(X_test_poly)
poly_test_rmse = np.sqrt(mean_squared_error(y_test, poly_test_pred))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD Poly", 'rmse': poly_test_rmse}, index=[0])])

print(f"SGD wt Polynomial input Test RMSE: {poly_test_rmse:.3f}")

SGD wt Polynomial input Test RMSE: 12355084802674.627


### RESULTS

In [34]:
rmses.sort_values(by=['rmse'])

Unnamed: 0,model,rmse
0,lin_reg_3rd_order,102.1762
0,SGD,4333.89
0,SGD Poly,12355080000000.0


### CONCLUSION

Based on the resulted root mean square error (RMSE) values for the different models, we can draw the following conclusions regarding their performance:

1. lin_reg_3rd_order: The linear regression model with a third-degree polynomial transformation has an RMSE of 102.1762. This indicates that the model's predictions, on average, deviate by approximately 102.1762 units from the actual values. It suggests a moderate level of accuracy.

2. SGD: The Stochastic Gradient Descent (SGD) model has a significantly higher RMSE of 4333.89. This suggests that the model's predictions have a much larger average deviation from the actual values compared to the lin_reg_3rd_order model. The SGD model performs relatively worse in terms of accuracy.

3. SGD Poly: The SGD model with polynomial transformation has an extremely high RMSE of 1.235508e+13, which is significantly larger than the other models. This indicates that the model's predictions have a massive average deviation from the actual values. The SGD Poly model performs significantly worse compared to both lin_reg_3rd_order and SGD models.

Based on the RMSE values, the lin_reg_3rd_order model performs the best among the three models, as it has the lowest RMSE and exhibits a relatively better accuracy in predicting the target variable.