# Metrics, fairness, hyperparameter tuning

In [1]:
import pandas as pd

# Columns: quiz1..quiz10 grades, passed_final (0/1 final pass grade),
# frpl (0/1 free/reduced-price lunch eligibility)
csv_path = 'course_passing_data.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,quiz1,quiz2,quiz3,quiz4,quiz5,quiz6,quiz7,quiz8,quiz9,quiz10,passed_final,frpl
0,65.006812,84.410831,45.160440,70.889405,54.310410,63.434932,60.601137,45.117798,95.797734,93.570121,1,0
1,30.730824,64.917864,14.581339,57.132101,53.962630,21.741871,84.847961,64.160859,86.419329,74.628074,0,1
2,35.885607,75.564349,62.434004,56.603925,63.137982,56.910331,89.608642,71.231343,88.324471,92.695640,0,1
3,59.414426,91.062155,46.457298,54.037249,56.861984,41.587869,77.980740,52.233073,84.232572,90.006458,1,0
4,54.767233,86.944637,31.068615,60.965026,6.832854,46.313645,76.056575,40.544819,89.014045,88.661454,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
395,38.858685,85.199900,90.734316,37.451876,24.785567,55.901637,80.822082,70.993635,78.794029,77.143443,1,1
396,31.218415,79.372650,49.181140,52.887378,47.580652,58.234047,86.362499,69.481944,84.092804,88.170802,0,0
397,41.546260,91.159578,53.578852,57.828196,11.955848,53.583460,0.000000,66.325711,75.683433,80.104594,1,1
398,100.000000,100.000000,52.719078,13.704376,51.123489,41.120479,98.853494,68.902541,82.814987,100.000000,1,0


## Logistic Regression
In linear regression, the outcome is a continuous value, directly modeled as a linear combination of features (e.g., `y = b0 + b1*x1 + b2*x2 + ...`). To handle classification problems, logistic regression modifies linear regression using a logistic (sigmoid) function to constrain the output to the 0-1 range, representing a probability (e.g., `p = 1 / (1 + exp(-(b0 + b1*x1 + b2*x2 + ...)))`). 

In [None]:
# Fit an ordinary linear regression directly to the 0/1 pass outcome
from sklearn import linear_model

# Every column whose name starts with 'quiz' is used as a predictor
features = [name for name in df.columns if name.startswith('quiz')]

model = linear_model.LinearRegression().fit(df[features], df.passed_final)
pred = model.predict(df[features])
# The predictions are continuous values rather than 0/1 labels, so they are
# not good and do not make sense for classification -- see the summary below
pd.Series(pred).describe()

In [None]:
# Logistic regression squashes the linear output into the 0-1 range;
# predict() then returns a 0/1 class label for each row
model = linear_model.LogisticRegression().fit(df[features], df.passed_final)
pred = model.predict(df[features])
# Fitting might emit a warning that suggests scaling, since the features have not been standardized!
# scikit-learn Pipelines help streamline ML workflows by bundling scaling and other
# transformations together with a model. More on this later.
pd.Series(pred).value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

### Manual scaling

In [None]:
from sklearn import preprocessing

# Positional split: the first 300 rows train the model, the remaining rows are held out
train_df, test_df = df.iloc[:300], df.iloc[300:]

# Learn the scaling from the training rows only, then apply the same scaling to the test rows
scaler = preprocessing.StandardScaler()
train_X = scaler.fit_transform(train_df[features])
test_X = scaler.transform(test_df[features])

model = linear_model.LogisticRegression().fit(train_X, train_df.passed_final)
pred = model.predict(test_X)
pd.Series(pred).value_counts()

## Pipeline models

In [None]:
from sklearn import pipeline, metrics

# A Pipeline chains the scaler and the classifier so they are fitted and applied together
steps = [
    ('scaling', preprocessing.StandardScaler()),
    ('model', linear_model.LogisticRegression()),
]
pipe = pipeline.Pipeline(steps)
pipe.fit(train_df[features], train_df.passed_final)
preds = pipe.predict(test_df[features])

# Cohen's kappa between the held-out labels and the pipeline's predictions
test_kappa = metrics.cohen_kappa_score(test_df.passed_final, preds)
print('Test kappa:', test_kappa)

In [None]:
# Can also use pipelines in cross-validation, same as a model
from sklearn import model_selection

pipe = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', linear_model.LogisticRegression())
])
# shuffle=True draws a new random fold assignment on every run unless random_state
# is fixed; pin it (the value itself is arbitrary) so the scores below are reproducible
xval = model_selection.KFold(10, shuffle=True, random_state=42)
# Wrap Cohen's kappa so cross_validate can use it as the scoring function
scorer = metrics.make_scorer(metrics.cohen_kappa_score)
# The pipeline is cloned and refitted on each training fold, so the scaler
# only ever sees that fold's training rows
result = model_selection.cross_validate(
    pipe, df[features], df.passed_final,
    return_train_score=True, cv=xval,
    scoring=scorer)

print('Kappa:', result['test_score'].mean())
print('Kappa SD:', result['test_score'].std())
print('Train kappa mean:', result['train_score'].mean())

## Fairness
Let's take a look at the predictions for different frpl groups (free/reduced-price lunch eligibility)

In [None]:
# Refit the pipeline on the training rows and predict the held-out rows
pipe.fit(train_df[features], train_df.passed_final)
preds = pipe.predict(test_df[features])

# New frame: the held-out rows plus a 'prediction' column (test_df itself is left untouched)
output_df = test_df.assign(prediction=preds)
output_df

In [None]:
# Split the held-out predictions by free/reduced-price lunch eligibility
is_frpl = output_df['frpl'] == 1
frpl_rows = output_df[is_frpl]
non_frpl_rows = output_df[output_df['frpl'] == 0]

# Cohen's kappa computed separately within each group
for label, rows in (('FRPL kappa:', frpl_rows), ('Non-FRPL kappa:', non_frpl_rows)):
    print(label, metrics.cohen_kappa_score(rows.passed_final, rows.prediction))

Seems that the model is more accurate for non-frpl students?
Let's take a closer look...

In [None]:
# NOTE: scikit-learn puts true labels on the rows and predicted labels on the
# columns, both in sorted label order, so for binary classification the matrix is:
#   TN FP
#   FN TP
# whereas the conventional order puts the positive class first:
#   TP FN
#   FP TN
metrics.confusion_matrix(output_df['passed_final'], output_df['prediction'])

In [None]:
# One confusion matrix per group; the second header starts with a blank line as a separator
for header, rows in (('FRPL', frpl_rows), ('\nNon-FRPL', non_frpl_rows)):
    print(header)
    print(metrics.confusion_matrix(rows.passed_final, rows.prediction))

## Accuracy metrics


https://scikit-learn.org/stable/api/sklearn.metrics.html#classification-metrics

### Kappa, precision, recall

In [None]:
# NOTE: scikit-learn sorts the labels, so a binary confusion matrix is laid out as:
#   TN FP
#   FN TP
# whereas the conventional order is:
#   TP FN
#   FP TN
y_true = output_df.passed_final
y_pred = output_df.prediction

# Proportion correct from the matrix counts: (57 + 11) / (57 + 3 + 29 + 11)
print(metrics.confusion_matrix(y_true, y_pred))

# Recall = TP / (TP + FN) = 11/(11+29)
print(metrics.recall_score(y_true, y_pred))

# Precision = TP / (TP + FP) = 11/(11+3)
print(metrics.precision_score(y_true, y_pred))

In [None]:
# ROC AUC is a ranking metric and is meant to be given continuous scores.
# With hard 0/1 predictions the ROC curve collapses to a single operating
# point and the value is just balanced accuracy: (recall + specificity) / 2.
print('AUC from 0/1 predictions:',
      metrics.roc_auc_score(output_df.passed_final, output_df.prediction))

# Predicted probability of the positive class: predict_proba columns follow the
# sorted class labels, so column 1 is label 1 (passed). `pipe` is the pipeline
# fitted on train_df in the Fairness section.
pass_proba = pipe.predict_proba(test_df[features])[:, 1]
print('AUC from predicted probabilities:',
      metrics.roc_auc_score(test_df.passed_final, pass_proba))

print('F1:', metrics.f1_score(output_df.passed_final, output_df.prediction))

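The AUC score measures the ability of the model to distinguish between the classes. 
An AUC of 0.5 suggests no discrimination (i.e., random chance), and an AUC of 1.0 indicates perfect discrimination. 
The model's AUC score of approximately 0.6125 means it is better than random guessing ... but far from perfect.
Note that this value is computed from the hard 0/1 predictions rather than from predicted probabilities, so it equals the balanced accuracy: the mean of recall (0.275) and specificity (57/60 = 0.95).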

F1 is the harmonic mean of precision and recall, and is particularly useful when there is an uneven class distribution / class imbalance.
An F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. 
The F1 score of approximately 0.4074 indicates that the model is not very effective in terms of balancing precision and recall, typically suggesting either a low precision or a low recall or both. We have 0.275 recall and 0.786 precision so we know that our model has low recall.

### Adding metrics to a pipeline

In [None]:
pipe = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', linear_model.LogisticRegression())
])
# Fix random_state (arbitrary value) so the shuffled folds, and therefore every
# score below, are the same on each run. This splitter is also the `xval` used
# by the grid-search cell that follows.
xval = model_selection.KFold(10, shuffle=True, random_state=42)
# A dict of scorers makes cross_validate report each metric under
# 'test_<name>' / 'train_<name>' keys
scorer = {
    'kappa': metrics.make_scorer(metrics.cohen_kappa_score),
    'accuracy': metrics.make_scorer(metrics.accuracy_score),
    'precision': metrics.make_scorer(metrics.precision_score),
    'recall': metrics.make_scorer(metrics.recall_score)
}
result = model_selection.cross_validate(pipe, df[features], df.passed_final, return_train_score=True,
    cv=xval, scoring=scorer)

# Mean of each metric across the 10 test folds
print('Test kappa:', result['test_kappa'].mean())
print('Test accuracy:', result['test_accuracy'].mean())
print('Test precision:', result['test_precision'].mean())
print('Test recall:', result['test_recall'].mean())

## Hyperparameter tuning

Introducing GridSearchCV 

In [None]:
from sklearn import tree

# With max_features < 1 each split only considers a random subset of the
# features; random_state (arbitrary value) pins that randomness so reruns
# give the same tree for the same data
model = tree.DecisionTreeClassifier(random_state=42)
scorer = metrics.make_scorer(metrics.cohen_kappa_score)
# Hyperparameter values to try; a float max_features is a fraction of the features
grid = {
    'min_samples_split': [2, 4, 8, 16, 32, 64],
    'max_features': [.1, .5, .75],
}
# Nested cross-validation: on each outer training fold (from `xval`, defined in
# an earlier cell) GridSearchCV runs its own default inner cross-validation to
# pick the best combination, then the outer test fold scores that choice
gridsearch = model_selection.GridSearchCV(model, grid, scoring=scorer)
result = model_selection.cross_validate(
    gridsearch, df[features], df.passed_final,
    return_train_score=True, cv=xval,
    scoring=scorer)

# Kappa is sensitive to the randomness here and sometimes gets unlucky w/low
# kappa; try a different random_state to see how much it moves
print('Kappa:', result['test_score'].mean())
print('Kappa SD:', result['test_score'].std())
print('Train kappa:', result['train_score'].mean())

## Using pipelines and hyperparameter tuning together

In [None]:
from sklearn import neighbors

scorer = metrics.make_scorer(metrics.cohen_kappa_score)
model = neighbors.KNeighborsClassifier()
pipe = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', model),
])
# Parameters of a pipeline step are addressed as '<step name>__<parameter>'
grid = {
    'model__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
}
# Tune the whole pipeline, not just the classifier: GridSearchCV then refits the
# scaler inside each of its inner CV folds. Putting GridSearchCV as the last
# pipeline step instead would scale the data once up front, so the inner
# validation rows would leak into the scaler's mean/std while the
# hyperparameter is being chosen.
gridsearch = model_selection.GridSearchCV(pipe, grid, scoring=scorer)
# Fixed random_state (arbitrary value) keeps the shuffled outer folds reproducible
xval = model_selection.KFold(10, shuffle=True, random_state=42)
result = model_selection.cross_validate(gridsearch, df[features], df.passed_final,
    return_train_score=True, cv=xval, scoring=scorer)
print('Kappa mean:', result['test_score'].mean())
print('Kappa SD:', result['test_score'].std())
print('Train kappa mean:', result['train_score'].mean())

### Regression w/random forest

In [None]:
from sklearn import ensemble

# Regression task: predict the quiz10 grade from the first four quiz grades
four_features = ['quiz1', 'quiz2', 'quiz3', 'quiz4']
scorer = metrics.make_scorer(metrics.r2_score)
# Random forests are stochastic; random_state (arbitrary value) makes the
# fitted forests, and so the scores below, repeatable for the same folds
model = ensemble.RandomForestRegressor(random_state=42)
grid = {
    'min_samples_leaf': [1, 4, 10],
    'bootstrap': [True, False]
}
gridsearch = model_selection.GridSearchCV(model, grid, scoring=scorer)
# `xval` is the KFold splitter defined in an earlier cell
result = model_selection.cross_validate(gridsearch, df[four_features], df.quiz10,
    return_train_score=True, cv=xval, scoring=scorer)  # scoring='r2'
print('R^2 mean:', result['test_score'].mean())
print('R^2 SD:', result['test_score'].std())
print('Train R^2 mean:', result['train_score'].mean())

### Also, check out ROC curves and micro/macro-averaging
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py 