# Model Validation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR

We will use the function below for the remainder of the semester to perform K-fold cross-validation:

In [None]:
def do_Kfold(model, X, y, k, scaler=None, random_state=146):
    """Run shuffled K-fold cross-validation and collect per-fold scores.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is re-fit
        in place on every fold.
    X : array-like, 2-D
        Feature matrix with one row per observation. NumPy arrays and
        pandas DataFrames are both accepted.
    y : array-like
        Targets aligned row-for-row with ``X``. NumPy arrays and pandas
        Series are both accepted.
    k : int
        Number of folds passed to ``KFold(n_splits=k)``.
    scaler : object, optional
        Transformer exposing ``fit_transform`` and ``transform``. When
        given, it is fit on each fold's training rows only and then applied
        to that fold's test rows.
    random_state : int, default 146
        Seed for the shuffled fold assignment.

    Returns
    -------
    train_scores, test_scores : list, list
        ``model.score`` on the training rows and on the held-out rows,
        one entry per fold, in fold order.
    """
    import numpy as np
    from sklearn.model_selection import KFold

    # KFold yields positional row indices. Indexing a DataFrame with
    # ``X[idx, :]`` raises, and ``y[idx]`` on a Series selects by label, so
    # convert both to arrays to make the row selection purely positional.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=k, random_state=random_state, shuffle=True)

    train_scores = []
    test_scores = []

    for idxTrain, idxTest in kf.split(X):
        Xtrain = X[idxTrain, :]
        Xtest = X[idxTest, :]
        ytrain = y[idxTrain]
        ytest = y[idxTest]

        if scaler is not None:
            # Fit the scaler on the training rows only so that no
            # information from the held-out rows leaks into the fit.
            Xtrain = scaler.fit_transform(Xtrain)
            Xtest = scaler.transform(Xtest)

        model.fit(Xtrain, ytrain)

        train_scores.append(model.score(Xtrain, ytrain))
        test_scores.append(model.score(Xtest, ytest))

    return train_scores, test_scores

## Concrete data

In [None]:
# Read the concrete dataset from the Excel workbook and preview the first rows.
concrete = pd.read_excel(io='./data/Concrete_Data.xls')
concrete.head(n=5)

In [None]:
# Shorten each header: keep the text before the first '(' (when present),
# drop trailing whitespace, and replace spaces with underscores.
concrete.columns = [
    header.partition('(')[0].rstrip().replace(' ', '_')
    for header in concrete.columns
]

In [None]:
# Preview the first rows again to check the renamed columns.
concrete.head(n=5)

In [None]:
# Separate the response (compressive strength) from the predictor columns.
y = concrete['Concrete_compressive_strength']
X = concrete.drop('Concrete_compressive_strength', axis=1)

In [None]:
from sklearn.model_selection import train_test_split as tts

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
Xtrain, Xtest, ytrain, ytest = tts(
    X, y, test_size=0.4, shuffle=True, random_state=146
)

# Fit a fresh linear regression on the training portion only.
lin_reg = LR().fit(Xtrain, ytrain)

# Score on the rows the model was fit on, then on the held-out rows.
internal_r2 = lin_reg.score(Xtrain, ytrain)
external_r2 = lin_reg.score(Xtest, ytest)
print(f'Internal validity (R^2) : {internal_r2:.2f}')
print(f'External validity (R^2) : {external_r2:.2f}')

In [None]:
# Repeat the 60/40 split with a different seed to see how much the scores
# depend on which rows land in the test set.
Xtrain, Xtest, ytrain, ytest = tts(
    X, y, test_size=0.4, shuffle=True, random_state=12
)

# Re-fit the existing estimator on the new training rows.
lin_reg.fit(Xtrain, ytrain)

internal_r2 = lin_reg.score(Xtrain, ytrain)
external_r2 = lin_reg.score(Xtest, ytest)
print(f'Internal validity (R^2) : {internal_r2:.2f}')
print(f'External validity (R^2) : {external_r2:.2f}')

In [None]:
# Use do_Kfold to run K-fold cross-validation on the full dataset.
lin_reg = LR()

# Number of folds. NOTE(review): 10 is an assumed value; change as needed.
k = 10

# The cells below read train_scores and test_scores, so they must be
# produced here. NumPy arrays are passed because do_Kfold selects rows by
# position.
train_scores, test_scores = do_Kfold(lin_reg, np.asarray(X), np.asarray(y), k)


In [None]:
# Use one set of bin edges for both histograms, running from the lowest
# observed score up to 1.
min_r2 = min(min(fold_scores) for fold_scores in (test_scores, train_scores))
max_r2 = 1

n_bins = 15
my_bins = np.linspace(min_r2, max_r2, num=n_bins + 1)

# Overlay the training and testing score distributions, semi-transparent so
# overlapping bars stay visible.
hist_style = dict(bins=my_bins, alpha=0.5, rwidth=0.95)
plt.hist(train_scores, label='Training Scores', color='blue', **hist_style)
plt.hist(test_scores, label='Testing Scores', color='red', **hist_style)
plt.legend()
plt.show()

# Report the mean score across folds for each set.
print(f'Average for train: {np.mean(train_scores):.2f}')
print(f'Average for test: {np.mean(test_scores):.2f}')

In [None]:
# Scatter each fold's training score against its testing score.
plt.figure(figsize=(8, 8))
plt.scatter(train_scores, test_scores, s=60, ec='k', alpha=0.5)
plt.xlabel('Training Score', fontsize=14)
plt.ylabel('Testing Score', fontsize=14)
plt.xlim(0, 1)
# Optional: also fix the y-range from the lowest test score up to 1.
# plt.ylim([min(test_scores), 1])
plt.show()

In [None]:
import seaborn as sns

In [None]:
# Horizontal strip plot of the per-fold scores, one row per set.
plt.figure(figsize=(8, 4))

# Collect the scores in a two-column frame for seaborn.
scores = pd.DataFrame({'Train': train_scores, 'Test': test_scores})

ax = sns.stripplot(
    data=scores,
    orient='h',
    s=10,
    alpha=0.7,
    ec='k',
    palette=['slategrey', 'cornflowerblue'],
)

# Hide every axis spine.
for spine in ax.spines.values():
    spine.set_visible(False)

plt.grid(axis='x', linestyle='--', color='lightgrey')
plt.tick_params(labelsize=14)
plt.xlabel('$R^2$', fontsize=16, labelpad=20)
plt.show()