In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def do_Kfold(model,X,y,k,scaler = None, random_state = 146):
    """Run shuffled k-fold cross-validation and collect the per-fold scores.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``score(X, y)``
        Refit in place on the training rows of every fold.
    X : array-like with one row per sample
        NumPy arrays are used as-is. pandas objects and lists/tuples are
        converted to arrays first so rows are always selected by position.
    y : array-like with one entry per sample
        Handled the same way as ``X``.
    k : int
        Number of folds.
    scaler : object with ``fit_transform(X)`` and ``transform(X)``, optional
        When given, it is fit on each fold's training rows only and then
        applied to that fold's held-out rows.
    random_state : int
        Seed handed to ``KFold`` (shuffling is always enabled).

    Returns
    -------
    (train_scores, test_scores) : tuple of lists
        ``model.score`` on the training rows and on the held-out rows,
        one entry per fold, in the order the folds were produced.
    """
    from sklearn.model_selection import KFold

    def _to_indexable(data):
        # KFold yields integer positions. A DataFrame rejects X[idx, :] and a
        # Series would treat the positions as index labels, so move pandas
        # inputs (and plain sequences) onto arrays. Anything else is left alone.
        if hasattr(data, "to_numpy"):
            return data.to_numpy()
        if isinstance(data, (list, tuple)):
            return np.asarray(data)
        return data

    X = _to_indexable(X)
    y = _to_indexable(y)

    kf = KFold(n_splits=k, random_state = random_state, shuffle=True)

    train_scores = []
    test_scores = []

    for idxTrain, idxTest in kf.split(X):
        Xtrain = X[idxTrain, :]
        Xtest = X[idxTest, :]
        ytrain = y[idxTrain]
        ytest = y[idxTest]

        # Identity check: `!= None` would invoke a custom __ne__ on the scaler.
        if scaler is not None:
            # Fit on the training rows only so nothing about the held-out
            # rows leaks into the scaling parameters.
            Xtrain = scaler.fit_transform(Xtrain)
            Xtest = scaler.transform(Xtest)

        model.fit(Xtrain,ytrain)

        train_scores.append(model.score(Xtrain,ytrain))
        test_scores.append(model.score(Xtest,ytest))

    return train_scores, test_scores

In [None]:
data = pd.read_csv('Lake_Data.txt')

In [None]:
data.head() # you should always look at your data!!!

In [None]:
# Predictor table: every column of `data` except the six dropped here.
# 'Chl.a' is removed because it is the regression target (it becomes `y`).
# NOTE(review): 'Number', 'Latitude', 'Longitude', 'MicrocystisDicot' and 'Status'
# look like identifier / location / label fields -- confirm against the data dictionary.
X_df = data.drop(columns = ['Number', 'Latitude', 'Longitude', 'MicrocystisDicot', 'Status','Chl.a'])
# NumPy view of the predictors for the modelling code, plus the matching column labels.
X = X_df.values
X_names = X_df.columns

We will use the other variables to try to predict Chlorophyll A. Why? Maybe it's hard to measure directly, and in the future we'd like to predict it from the other measurements and still get a good estimate.

In [None]:
y = data['Chl.a'].values

In [None]:
y.shape

# Univariate exploration

In [None]:
X_df.isnull().sum()

In [None]:
X_df.describe()

Let's plot our data on a box plot and a strip chart...what's the difference?

Let's make a histogram of the Total Nitrogen to Total Phosphorus ratio. How would you describe this? What summary statistics would it be appropriate to report? Where do you think the mean is with respect to the median?

We will look at the variance of all of our predictors. Does that tell us which predictors really vary the most around their means?

# Bivariate exploration

In [None]:
# Work on a copy so that adding the target column leaves X_df unchanged.
total_df = X_df.copy()
# Predictors and target side by side in one frame.
total_df['target'] = y

Let's look at correlations between the predictors and also between the predictors and the target:

# Regression Modeling with k-fold validation

In [None]:
from sklearn.linear_model import LinearRegression as LR, Ridge, Lasso
from sklearn.preprocessing import StandardScaler as SS

First, we will do 5-fold cross-validation with OLS, and then compare that to 5-fold cross-validation with Ridge and Lasso.

In [None]:
#k-fold OLS


In [None]:
# Per-fold scores side by side: one column per split, one row per fold.
results = pd.DataFrame({'Train': tr, 'Test': te})

# One horizontal strip per column, so the fold-to-fold spread is visible.
ax = sns.stripplot(
    data=results,
    orient='h',
    s=10,
    alpha=0.8,
    ec='k',
    palette='winter',
)
plt.grid(axis='x', color='lightgrey', linestyle='--')
plt.tick_params(labelsize=14)
plt.xlim(0, 1)

# Hide every border of the axes.
for spine in ax.spines.values():
    spine.set_visible(False)

plt.show()

In [None]:
print(np.mean(tr), np.mean(te))

In [None]:
np.var(tr),np.var(te)

How would you describe these results?

## Ridge regression

What is the purpose of the code below? Do we need to change anything?

In [None]:
# Ridge hyperparameter sweep: for every candidate alpha, run k-fold cross-validation
# through do_Kfold and keep the mean train / test score across the folds.
a_range = np.linspace(20,50,100) #start 10 to 20 and change - optimal value is ~37
k = 5  # number of folds handed to do_Kfold
# NOTE(review): `ss` is not used in this cell -- the loop passes a new SS() to do_Kfold each time.
ss = SS()

# Mean score across folds for each alpha, in the same order as a_range.
avg_tr_score=[]
avg_te_score=[]

for a in a_range:
    rid_reg = Ridge(alpha=a)
    # The scaler goes in as an argument so do_Kfold fits it on each fold's training rows only.
    train_scores, test_scores = do_Kfold(rid_reg, X, y, k, SS())
    
    avg_tr_score.append(np.mean(train_scores))
    avg_te_score.append(np.mean(test_scores))

In [None]:
# Plot the mean test-fold score against alpha, then report the best alpha on the grid.
plt.figure(figsize=(8,8))
# Uncomment to overlay the mean training score.
#plt.plot(a_range, avg_tr_score, color='k', label='Training')
plt.plot(a_range, avg_te_score, color='r', label='Testing')
plt.xlabel('$\\alpha$', fontsize=14)
plt.ylabel('Avg. $R^2$', fontsize=14)
plt.legend()
plt.show()
# Position of the (first) highest mean test score; it indexes a_range and both score lists below.
idx_max = np.argmax(avg_te_score)
#Can you read the y axis properly?

# "Optimal" only among the alphas in a_range -- a maximum at either end means the range should move.
print('Optimal alpha in the range tested: ', a_range[idx_max])
print('Avg. training score at this value: ', avg_tr_score[idx_max])
print('Avg. testing score at this value: ', avg_te_score[idx_max])

In [None]:
#do k fold with ridge


In [None]:
# Per-fold scores side by side: one column per split, one row per fold.
results = pd.DataFrame({'Train': tr, 'Test': te})

# One horizontal strip per column, so the fold-to-fold spread is visible.
ax = sns.stripplot(
    data=results,
    orient='h',
    s=10,
    alpha=0.8,
    ec='k',
    palette='winter',
)
plt.grid(axis='x', color='lightgrey', linestyle='--')
plt.tick_params(labelsize=14)

# Hide every border of the axes.
for spine in ax.spines.values():
    spine.set_visible(False)

plt.xlim(0, 1)
plt.show()

In [None]:
print(np.mean(tr), np.mean(te))

In [None]:
np.var(tr),np.var(te)

How do these results compare to OLS? (We can look on a slide)

## Lasso Regression

What is the purpose of the code below? Do we need to change anything?

In [None]:
# Lasso hyperparameter sweep: for every candidate alpha, run k-fold cross-validation
# through do_Kfold and keep the mean train / test score across the folds.
a_range = np.linspace(1,10,100)
k = 5  # number of folds handed to do_Kfold
# NOTE(review): `ss` is not used in this cell -- the loop passes a new SS() to do_Kfold each time.
ss = SS()

# Mean score across folds for each alpha, in the same order as a_range.
avg_tr_score=[]
avg_te_score=[]

for a in a_range:
    # NOTE(review): max_iter is set to 10000, presumably so the solver converges -- confirm.
    las_reg = Lasso(alpha=a, max_iter = 10000)
    # The scaler goes in as an argument so do_Kfold fits it on each fold's training rows only.
    train_scores, test_scores = do_Kfold(las_reg, X, y, k, SS())
    
    avg_tr_score.append(np.mean(train_scores))
    avg_te_score.append(np.mean(test_scores))

In [None]:
# Plot the mean test-fold score against alpha, then report the best alpha on the grid.
plt.figure(figsize=(8,8))
# Uncomment to overlay the mean training score.
#plt.plot(a_range, avg_tr_score, color='k', label='Training')
plt.plot(a_range, avg_te_score, color='r', label='Testing')
plt.xlabel('$\\alpha$', fontsize=14)
plt.ylabel('Avg. $R^2$', fontsize=14)
plt.legend()
plt.show()
# Position of the (first) highest mean test score; it indexes a_range and both score lists below.
idx_max = np.argmax(avg_te_score)

# "Optimal" only among the alphas in a_range -- a maximum at either end means the range should move.
print('Optimal alpha in the range tested: ', a_range[idx_max])
print('Avg. training score at this value: ', avg_tr_score[idx_max])
print('Avg. testing score at this value: ', avg_te_score[idx_max])

In [None]:
#k fold with lasso


In [None]:
# Per-fold scores side by side: one column per split, one row per fold.
results = pd.DataFrame({'Train': tr, 'Test': te})

# One horizontal strip per column, so the fold-to-fold spread is visible.
ax = sns.stripplot(
    data=results,
    orient='h',
    s=10,
    alpha=0.8,
    ec='k',
    palette='winter',
)
plt.grid(axis='x', color='lightgrey', linestyle='--')
plt.tick_params(labelsize=14)

# Hide every border of the axes.
for spine in ax.spines.values():
    spine.set_visible(False)

plt.xlim(0, 1)
plt.show()

In [None]:
print(np.mean(tr), np.mean(te))

In [None]:
np.var(tr),np.var(te)

How do these results compare? Let's look on a slide.

# One train/test split (tts)

In [None]:
from sklearn.model_selection import train_test_split as tts

In [None]:
#train_size of 0.6, random_state 99

Should we standardize the data? If yes, how do we do it?

In [None]:
#OLS
print(f'Train: {lin_reg.score(Xtrain_s,ytrain):.4f}, Test:{lin_reg.score(Xtest_s, ytest):.4f}')

In [None]:
#Ridge

print(f'Train: {rid_reg.score(Xtrain_s,ytrain):.4f}, Test:{rid_reg.score(Xtest_s, ytest):.4f}')

In [None]:
#Lasso

print(f'Train: {las_reg.score(Xtrain_s,ytrain):.4f}, Test:{las_reg.score(Xtest_s, ytest):.4f}')

In [None]:
coeffs = pd.DataFrame(columns = ['OLS', 'Ridge','Lasso'], index = X_df.columns)

In [None]:
coeffs['OLS'] = lin_reg.coef_
coeffs['Ridge'] = rid_reg.coef_
coeffs['Lasso'] = las_reg.coef_

In [None]:
coeffs

So what's actually important for predicting Chlorophyll A? Does it make sense?

### Ahem

In [None]:
X_new = X_df[[#which predictors]].values

In [None]:
Xtrain_new,Xtest_new,ytrain_new,ytest_new = #tts

In [None]:
#fit OLS....do we have to scale?