## <font color = darkblue> This is an example of SGDRegressor and SGDRegressor
    - Both use Nutrition data
    - This demonstration shows how to tune hyperameters 

In [1]:
# ------------------------------------
# Importing the necessary libraries
# ------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
import seaborn as sns
import warnings
import sklearn.metrics

from datetime import datetime
warnings.filterwarnings('ignore') 
sns.set(rc={'figure.figsize':(11,8)})
pd.options.display.float_format = '{:.2f}'.format

## Importing nutrition data

In [2]:
%cd "/Volumes/LACIE SHARE/Courses/Roux /Machine Learning/Data"
nut = pd.read_csv("Final_Nutrition.csv")


# ---------------------------------------------------------------------------------
# Keeping only first 15 features for demonstration purposes, and no other reason
# Also dropping  NAs
# ---------------------------------------------------------------------------------

nut = nut.iloc[:,range(15)].dropna()

print(nut.shape)

nut.head()

[Errno 2] No such file or directory: '/Volumes/LACIE SHARE/Courses/Roux /Machine Learning/Data'
/Users/ZongyuWu/PycharmProjects/CS6140
(6080, 15)


Unnamed: 0,NDB_No,Shrt_Desc,Long_Desc,FdGrp_Desc,Water_g,Energ_Kcal,Protein_g,Lipid_Tot_g,Carbohydrt_g,Fiber_TD_g,Sugar_Tot_g,Calcium_mg,Iron_mg,Magnesium_mg,Phosphorus_mg
0,1001,"BUTTER,WITH SALT","Butter, salted",Dairy and Egg Products,15.87,717,0.85,81.11,0.06,0.0,0.06,24.0,0.02,2.0,24.0
1,1002,"BUTTER,WHIPPED,W/ SALT","Butter, whipped, with salt",Dairy and Egg Products,16.72,718,0.49,78.3,2.87,0.0,0.06,23.0,0.05,1.0,24.0
2,1003,"BUTTER OIL,ANHYDROUS","Butter oil, anhydrous",Dairy and Egg Products,0.24,876,0.28,99.48,0.0,0.0,0.0,4.0,0.0,0.0,3.0
3,1004,"CHEESE,BLUE","Cheese, blue",Dairy and Egg Products,42.41,353,21.4,28.74,2.34,0.0,0.5,528.0,0.31,23.0,387.0
4,1005,"CHEESE,BRICK","Cheese, brick",Dairy and Egg Products,41.11,371,23.24,29.68,2.79,0.0,0.51,674.0,0.43,24.0,451.0


In [3]:
# --------------------------------------------------------------------------------
# # Separating the features and target
# Notice no attention is paid to Test-Train separation. Consider that step as an 
# intergal part of ML pipeline, and should not be skipped
# For simplicity, only a 1000 rows are used
# --------------------------------------------------------------------------------

X = nut.iloc[0:1000,[6,7,8]]
y = nut['Energ_Kcal'][0:1000]


# # ----------------------------------------------------------------------
# # # # Feature Standardization
# # This is a required step, as recommended by sklearn
# # ----------------------------------------------------------------------
scaler = StandardScaler()
X = scaler.fit_transform(X)



# --------------------------
# # # Training the model
# --------------------------
model = SGDRegressor(max_iter=100, 
                     tol = 0.0001,
                     early_stopping=False, warm_start=False,
                     n_iter_no_change = 5)
model.fit(X,y)

    
# -------
# Predict    
# -------
y_pred = model.predict(X)

# ----------------------
# Evaluating using MSE
# Using sklearn's built in function
# ----------------------

from sklearn.metrics import mean_squared_error
print(f"MSE is {mean_squared_error(y, y_pred)}")

# --------------------------------------------
# Alternatively writing the formula
# --------------------------------------------

mse = np.mean((y - y_pred)**2)




print(f"Number of adjustments in weights = {model.t_}, Coefficients are {model.coef_}, Number of iterations=  {model.n_iter_} and the R2 score is {model.score(X,y)}")
print("\nNote that the coefficients are for the standardized data and not on the original scale\n")
    
    
# --------------------------------------------------------------------------------
# Repeating the above step 20 times
# The purpose is to illustrate how the results are similar but not exactly the same 
# This is due to the stochastic nature of the process
# --------------------------------------------------------------------------------
    
for i in range(1,20):
    # # # Training the model
    model1 = SGDRegressor(max_iter=100, #tol=None, 
                         tol = 0.0001,
                         early_stopping=False, warm_start=False,
                         n_iter_no_change = 5)
    model1.fit(X,y)

    
    
 
    # # # Making predictions
    y_pred = model1.predict(X)

    # # # Evaluating the model
    mse = np.mean((y - y_pred)**2)
    print("-----------")
    print(model1.t_,model1.coef_, "{:.3f}".format(model1.score(X,y)), model1.n_iter_)


MSE is 213.25082828976738
Number of adjustments in weights = 12001.0, Coefficients are [ 48.16033623 253.67507277  67.79826629], Number of iterations=  12 and the R2 score is 0.9963564017028033

Note that the coefficients are for the standardized data and not on the original scale

-----------
30001.0 [ 48.22798122 253.59801307  67.74949403] 0.996 30
-----------
17001.0 [ 48.07889747 253.81126047  67.69073979] 0.996 17
-----------
15001.0 [ 48.15319313 253.65344147  67.83442312] 0.996 15
-----------
17001.0 [ 48.12792181 253.62094828  67.73166279] 0.996 17
-----------
22001.0 [ 48.20841048 253.59254339  67.63291751] 0.996 22
-----------
17001.0 [ 48.20667505 253.60423612  67.73384575] 0.996 17
-----------
20001.0 [ 48.14771603 253.62272575  67.71421772] 0.996 20
-----------
26001.0 [ 48.15700255 253.71738584  67.6580808 ] 0.996 26
-----------
24001.0 [ 48.17271412 253.77801213  67.65887794] 0.996 24
-----------
14001.0 [ 48.1329731  253.70880486  67.65045327] 0.996 14
-----------
19001

In [4]:
# ---------------------
# Comparison with MLR
# ---------------------

# from sklearn.linear_model import LinearRegression

# --------------------------------------------------------------------------------
# Creating X and y again, and this time not standardizing them
# --------------------------------------------------------------------------------

X = nut.iloc[0:1000,[6,7,8]]
y = nut['Energ_Kcal'][0:1000]
model_comp = LinearRegression().fit(X, y)

y_pred_comp = model_comp.predict(X)
mse = np.mean((y - y_pred_comp)**2)
print(model_comp.coef_, "{:.3f}".format(model_comp.score(X,y)))


NameError: name 'LinearRegression' is not defined

## <font  color = darkblue> Example of Grid Search
    - Grid Search is used to find the most optimal setting of hyperparameters
    - This is done by providing various values of hyperparameters to the model
    - And then observing which setting provides the best results in terms of R-sq or MSE

In [None]:

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor


X = nut.iloc[0:1000,6:]
y = nut['Energ_Kcal'][0:1000]


# # ----------------------------------------------------------------------
# # # # Feature Standardization
# # This is a required step, as recommended by sklearn
# # ----------------------------------------------------------------------
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Create the hyperparameter grid
param_grid = {
              'tol':[0.0001, 0.001, 0.1],
             'max_iter':[50,100,1000],
             'n_iter_no_change': [10,50,100]}

# Create the SGDRegressor
sgd_reg = SGDRegressor(alpha = 0.01,
                       early_stopping=True,
                       warm_start=True,
                       l1_ratio=0.15,
                       eta0 = 0.001)

# Create the GridSearchCV object
sgd_cv = GridSearchCV(sgd_reg, param_grid, cv=5)

# Fit it to the data
sgd_cv.fit(X, y)

# Print the tuned parameters and score
print("Tuned Parameters: {}".format(sgd_cv.best_params_)) 
print("Best score is {}".format(sgd_cv.best_score_))


# -------------
# Single run
# -------------

model4 = SGDRegressor(max_iter=1000, #tol=None, 
                      alpha = 0.01,
                     tol = 0.1,
                     l1_ratio=0.15,
                     early_stopping=True, warm_start=True,
                     eta0 = 0.001,
                     n_iter_no_change = 100)
model4.fit(X,y)
model4.score(X,y)



## SGDClassifier

In [None]:
# ----------------------------------------
#importing necessary libraries
# ----------------------------------------

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import sklearn.metrics

X = nut.iloc[0:1000,8:]
y = nut['Energ_Kcal'][0:1000]

# High when calories > 160, else Low
y_cat = np.where(y<160, 0,1)

# # # ----------------------------------------------------------------------
# # # # # Feature Standardization
# # # This is a required step, as recommended by sklearn
# # # ----------------------------------------------------------------------
scaler = StandardScaler()
X = scaler.fit_transform(X)


#splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size = 0.2, random_state=42)

#creating and fitting the SGDClassifier
clf = SGDClassifier(max_iter=1000, tol=1e-3, loss = 'log')
clf.fit(X_train, y_train)

#predicting the test set results
y_pred = clf.predict(X_test)

#calculating the accuracy of the model
accuracy = clf.score(X_test, y_test)

print("Accuracy: {:.2f}".format(accuracy))

In [5]:
# help(sklearn.metrics)
# ------------------------------
# Various classification metrics
# ------------------------------

print(f"Overall accuracy is {sklearn.metrics.accuracy_score(y_pred, y_test)}")
print(f"Precision or TP/(TP + FP) is {sklearn.metrics.precision_score(y_pred, y_test)}")
print(f"Recall or TP / (TP + FN) is {sklearn.metrics.recall_score(y_pred, y_test)}")
print(f"F1-score or 2*Precision*Recall / (Precision + Recall)  is {sklearn.metrics.f1_score(y_pred, y_test)}")
print(f"Confusion Matrix \n{sklearn.metrics.confusion_matrix(y_pred, y_test)}")


NameError: name 'y_test' is not defined

In [None]:
# -----------------------------------
# Changing the decision threhold
# -----------------------------------

#predicting the test set results
y_pred = clf.predict(X_test)

# # predict the decision scores for the data
decision_scores = clf.decision_function(X_test)

# # change the decision threshold to 0.6
y_pred_new = (decision_scores > 0.6).astype(int)

# # print the accuracy of the new predictions
print("Accuracy with threshold of 0.6:", sum(y_pred_new == y_test) / len(y_test))
