### IMPORTING THE LIBRARIES WE USE IN THIS NOTEBOOK

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor 
from sklearn.dummy import DummyRegressor

np.random.seed(86089106)

### LOADING THE DATA INTO A DATAFRAME

In [2]:
df = pd.read_csv('model4_dataset.csv') # you may need to edit this path depending on where you saved the data
df.head(3)

Unnamed: 0,input,target
0,16.243454,8803.949866
1,-6.117564,-657.602516
2,-5.281718,-210.705031


### CONDUCTING AN INITIAL EXPLORATION OF DATA

In [3]:
df.shape 

(1000, 2)

In [4]:
df.isnull().sum()

input     0
target    0
dtype: int64

In [5]:
df.dtypes

input     float64
target    float64
dtype: object

In [6]:
df.describe()

Unnamed: 0,input,target
count,1000.0,1000.0
mean,0.388125,193.899404
std,9.81495,8197.207914
min,-30.537644,-56008.552989
25%,-6.001604,-395.254172
50%,0.412926,24.164536
75%,7.039989,736.823944
max,39.586027,125615.034876


### Incorporate data splitting (cross-validation) to train the model (80/20 split)

In [7]:
from sklearn.model_selection import train_test_split

X = df[['input']]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) # 80% training and 20% test

y_train

382    -7251.253424
994     4414.314652
982      129.776296
47     19066.376344
521     -359.458755
           ...     
767      -60.344258
72       165.028511
908    11944.896437
235    -3583.404216
37       -17.496463
Name: target, Length: 800, dtype: float64

### SAVING THE TEST AND TRAINING DATA TO CSV FILES

In [8]:
train = X_train.copy() # if you don't use copy(), then the original X_train will be modified when you add the y_train column
train['target'] = y_train
train.head(3)

Unnamed: 0,input,target
382,-15.387932,-7251.253424
994,12.911889,4414.314652
982,1.751677,129.776296


In [9]:
train.to_csv('ex_01_train.csv', index=False)

In [10]:
test = X_test.copy() # if you don't use copy(), then the original X_test will be modified when you add the y_test column
test['target'] = y_test
test.head(3)

Unnamed: 0,input,target
507,0.896864,-141.859686
818,-3.260613,87.151159
452,6.743961,670.791378


In [11]:
test.to_csv('ex_01_test.csv', index=False)

## ANALYSING THE FIT WITH GRADIENT DESCENT MODEL

### LOADING THE DATA

In [12]:
train = pd.read_csv("ex_01_train.csv")
test = pd.read_csv("ex_01_test.csv")

In [13]:
target = 'target'
predictors = list(train.columns)
predictors.remove(target) 

In [14]:
print(predictors)

['input']


In [15]:
# arrrange this data into X_train, X_test, y_train, and y_test...
X_train = train[predictors]
y_train = train[target]
X_test = test[predictors]
y_test = test[target]

In [16]:
y_train

0      -7251.253424
1       4414.314652
2        129.776296
3      19066.376344
4       -359.458755
           ...     
795      -60.344258
796      165.028511
797    11944.896437
798    -3583.404216
799      -17.496463
Name: target, Length: 800, dtype: float64

In [17]:
results = pd.DataFrame(y_train, columns=["actual"])

rmses = pd.DataFrame({"model": [], "rmse": []})

In [18]:
#Stochastic Gradient Descent:
# https://scikit-learn.org/stable/modules/sgd.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor


# eta0 = learning rate
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg = SGDRegressor(max_iter=100, penalty=None, eta0=0.01)
sgd_reg.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg.n_iter_}")

results["SGD_preds"] = sgd_reg.predict(X_train)

Number of iterations = 11


In [19]:
#SGD Test RMSE
SGD_test_pred = sgd_reg.predict(X_test)
SGD_test_rmse = np.sqrt(mean_squared_error(y_test, SGD_test_pred))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD", 'rmse': SGD_test_rmse}, index=[0])])

print(f"SGD Test RMSE: {SGD_test_rmse:.3f}")

SGD Test RMSE: 4333.890


## Assessing the impact of L2 regularization on the performance of the SGDRegressor by trying different values for the regularization parameter, alpha (0.0001, 0.001, 0.01)

### 1. For alpha = 0.0001

In [20]:
#Stochastic Gradient with L2 regularization:

# eta0 = learning rate
# penalty = regularization term
# alphe = regularization strength (lambda)
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg_l2_01 = SGDRegressor(max_iter=100, penalty='l2', alpha = 0.0001, eta0=0.01)
_ = sgd_reg_l2_01.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg_l2_01.n_iter_}")

results["SGD_preds_l2_01"] = sgd_reg_l2_01.predict(X_train)


Number of iterations = 9


In [21]:
#Train RMSE
#SGD Test RMSE
SGD_test_pred_l2_01 = sgd_reg_l2_01.predict(X_test)
SGD_test_rmse_l2_01 = np.sqrt(mean_squared_error(y_test, SGD_test_pred_l2_01))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD L2 with alpha = 0.0001", 'rmse': SGD_test_rmse_l2_01}, index=[0])])

print(f"SGD Test with l2 RMSE and alpha = 0.0001: {SGD_test_rmse_l2_01:.3f}")

SGD Test with l2 RMSE and alpha = 0.0001: 4350.112


### 2. For alpha = 0.001

In [22]:
#Stochastic Gradient with L2 regularization:

# eta0 = learning rate
# penalty = regularization term
# alphe = regularization strength (lambda)
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg_l2_02 = SGDRegressor(max_iter=100, penalty='l2', alpha = 0.01, eta0=0.01)
_ = sgd_reg_l2_02.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg_l2_02.n_iter_}")

results["SGD_preds_l2_02"] = sgd_reg_l2_02.predict(X_train)


Number of iterations = 9


In [23]:
#Train RMSE
#SGD Test RMSE
SGD_test_pred_l2_02 = sgd_reg_l2_02.predict(X_test)
SGD_test_rmse_l2_02 = np.sqrt(mean_squared_error(y_test, SGD_test_pred_l2_02))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD L2 with alpha = 0.001", 'rmse': SGD_test_rmse_l2_02}, index=[0])])

print(f"SGD Test with l2 RMSE and alpha = 0.001: {SGD_test_rmse_l2_02:.3f}")

SGD Test with l2 RMSE and alpha = 0.001: 4673.030


### 3. For alpha = 0.01

In [24]:
#Stochastic Gradient with L2 regularization:

# eta0 = learning rate
# penalty = regularization term
# alphe = regularization strength (lambda)
# max_iter = number of passes over training data (i.e., epochs)

sgd_reg_l2_03 = SGDRegressor(max_iter=100, penalty='l2', alpha = 0.01, eta0=0.01)
_ = sgd_reg_l2_03.fit(X_train, np.ravel(y_train))

print(f"Number of iterations = {sgd_reg_l2_03.n_iter_}")

results["SGD_preds_l2_03"] = sgd_reg_l2_03.predict(X_train)


Number of iterations = 24


In [25]:
#Train RMSE
#SGD Test RMSE
SGD_test_pred_l2_03 = sgd_reg_l2_03.predict(X_test)
SGD_test_rmse_l2_03 = np.sqrt(mean_squared_error(y_test, SGD_test_pred_l2_03))

rmses = pd.concat([rmses, pd.DataFrame({'model':"SGD L2 with alpha = 0.01", 'rmse': SGD_test_rmse_l2_03}, index=[0])])

print(f"SGD Test with l2 RMSE and alpha = 0.01: {SGD_test_rmse_l2_03:.3f}")

SGD Test with l2 RMSE and alpha = 0.01: 4346.933


### RESULTS

In [26]:
rmses.sort_values(by=['rmse'])

Unnamed: 0,model,rmse
0,SGD,4333.889941
0,SGD L2 with alpha = 0.01,4346.932623
0,SGD L2 with alpha = 0.0001,4350.112099
0,SGD L2 with alpha = 0.001,4673.030094


### CONCLUSION

The RMSE values for different variations of Stochastic Gradient Descent (SGD) using L2 regularization allowed us to perform a comprehensive analysis of how well the SGDRegressor performed after incorporating regularization. Let's discuss the observed performance differences compared to the previous implementation without regularization and evaluate and compare the results obtained using each of the three alpha values (0.0001, 0.001, 0.01). We will also analyze the impact of L2 regularization on the model's ability to generalize and control overfitting or underfitting.

1. SGD without regularization (SGD):
   - RMSE: 4333.889941
   
   This model represents the baseline performance of SGD without any regularization. It provides a reference point for evaluating the impact of L2 regularization on the model's performance.

2. SGD with L2 regularization (alpha = 0.0001):
   - RMSE: 4350.112099
   
   With a small alpha value (0.0001), the model shows a slightly higher RMSE compared to the baseline SGD model. This suggests that the regularization has a minor impact on the model's performance. However, it also indicates that the regularization parameter might be too small to have a significant effect on preventing overfitting.

3. SGD with L2 regularization (alpha = 0.001):
   - RMSE: 4673.030094
   
   Increasing the alpha value to 0.001 further impacts the model's performance, resulting in a higher RMSE compared to both the baseline SGD and the previous alpha value (0.0001). This indicates that the regularization is starting to have a noticeable effect, but it might be introducing some underfitting.

4. SGD with L2 regularization (alpha = 0.01):
   - RMSE: 4346.932623
   
   Setting a higher alpha value of 0.01 leads to a similar RMSE as the baseline SGD model. This suggests that the regularization parameter might be too large, causing the model to be overly regularized and lose some predictive power.

Finally, the inclusion of L2 regularization introduced a trade-off between bias and variance in the model. A small alpha value (e.g., 0.0001) has a minimal impact on reducing overfitting, but it also has limited effectiveness in improving performance. On the other hand, a larger alpha value (e.g., 0.001) can lead to underfitting by excessively penalizing model complexity. The optimal alpha value depends on the the specific problem at hand, and it requires careful tuning to achieve the best balance between bias and variance.

L2 regularization helps to control overfitting by adding a penalty term to the model's loss function based on the squared magnitudes of the model's coefficients. This penalty encourages the model to have smaller coefficients, reducing complexity and making it less prone to overfitting. By setting the regularization parameter too high, it can lead to underfitting, as the model is excessively regularized and loses its ability to capture important patterns in the data.

Among the results, it can be observed that the SGD model without L2 regularization (alpha = 0) achieved the lowest RMSE value of 4333.889941. Therefore, the SGD model without regularization performed the best among the models.