In [2]:
import pandas as pd
from sklearn import linear_model, model_selection, pipeline, preprocessing, metrics

# Load the e-mail dataset from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset-email.csv')

# Set up 10-fold cross-validation with shuffled folds. The shuffle is seeded
# so that the fold assignment -- and therefore every metric computed from
# it -- is reproducible from one run to the next.
xval = model_selection.KFold(n_splits=10, shuffle=True, random_state=0)

# One scorer per evaluation metric, keyed by the metric's short name
scorer = {
    label: metrics.make_scorer(score_func)
    for label, score_func in (
        ('kappa', metrics.cohen_kappa_score),
        ('accuracy', metrics.accuracy_score),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score),
    )
}

# Model pipeline: a standard-scaling step followed by logistic regression
pipe = pipeline.Pipeline(steps=[
    ('scaling', preprocessing.StandardScaler()),
    ('model', linear_model.LogisticRegression()),
])

# Every column except the 'spam' label is used as a predictor
features = df.drop(columns=['spam']).columns

# Train and test the model with cross-validation; keep the training-fold
# scores and the fitted estimator from each fold alongside the test scores
result = model_selection.cross_validate(
    pipe,
    df[features],
    df['spam'],
    cv=xval,
    scoring=scorer,
    return_train_score=True,
    return_estimator=True,
)

# Summarise the test-fold scores: mean and standard deviation across the
# cross-validation folds for each metric
test_kappa, test_kappa_sd = (
    result['test_kappa'].mean(),
    result['test_kappa'].std(),
)
test_precision, test_precision_sd = (
    result['test_precision'].mean(),
    result['test_precision'].std(),
)
test_recall, test_recall_sd = (
    result['test_recall'].mean(),
    result['test_recall'].std(),
)