In [None]:
import pandas as pd
import nltk

# Testing classifiers

The question here is: Given that the performance of classifiers depends on how we randomly divide up our corpus, how can we decide which classifier is doing a better job?

### About overfitting and underfitting

We could often train a classifier that does perfectly on the training set. In that case, however, the classifier would be attending overly to the idiosyncrasies of the training set and would do worse on data it has not seen. This is called overfitting.

One way this could happen is if we include too many features in our featuresets.

So we want to train in such a way that we pay just the right amount of attention to features in the training set - not too much and not too little.

### Cross-validation

In cross-validation, we shuffle the corpus, then divide it up into equal-sized chunks. Then we use each one of the chunks in turn as the test set, training on all of the remaining chunks, and we average the results.

<img src="https://scikit-learn.org/stable/_images/grid_search_cross_validation.png" width="442px">

## Create all of the functions we'll need

Mostly, I'm bottling up everything we did above. 

But I'm also adding a wrapper that does the cross-validation.

In [None]:
import csv
import random
import sklearn
from sklearn.metrics import cohen_kappa_score

# This reads in the raw data, shuffles it, and creates the feature sets.
def prepare_feature_sets(pass_feature_func):
    """Read the Titanic training CSV and build shuffled labeled feature sets.

    Args:
        pass_feature_func: callable that takes one CSV row (a dict keyed by
            column name, as produced by ``csv.DictReader``) and returns the
            feature dict for that row.

    Returns:
        A list of ``(features, label)`` tuples in random order, where
        ``label`` is the row's ``"Survived"`` value.

    Raises:
        OSError: if ``corpora/titanic/train.csv`` cannot be opened.
        KeyError: if a row has no ``"Survived"`` column.
    """
    # The with-block guarantees the file handle is closed once the rows are
    # loaded; newline="" is what the csv module asks for so that it can do its
    # own line-ending handling.
    with open("corpora/titanic/train.csv", newline="") as csvfile:
        dlist = list(csv.DictReader(csvfile))
    random.shuffle(dlist)
    return [(pass_feature_func(r), r["Survived"]) for r in dlist]

# This trains and tests a classifier given the training set and the test set.
def test_classifier(train_set, test_set):
    classif = nltk.NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(classif, test_set)
    gold_list = [t[1] for t in test_set]
    test_list = [classif.classify(t[0]) for t in test_set]
    kappa = cohen_kappa_score(gold_list, test_list)
    return {"accuracy": accuracy, "kappa": kappa}

# This does the whole cross-validation thing.
def cross_validate(nchunks, labeled_featuresets):
    """Run ``nchunks``-fold cross-validation and report the averaged scores.

    The data is shuffled and split into ``nchunks`` chunks whose sizes differ
    by at most one item. Each chunk is used once as the test set while the
    classifier is trained on all the *other* chunks, and the per-fold results
    from ``test_classifier`` are averaged. A one-line summary is printed.

    Args:
        nchunks: number of folds; must be at least 2 and no larger than the
            number of items.
        labeled_featuresets: list of ``(features, label)`` pairs. The list
            itself is not modified.

    Returns:
        A tuple ``(results, average_accuracy, average_kappa)`` where
        ``results`` is the list of per-fold dicts returned by
        ``test_classifier``.

    Raises:
        ValueError: if ``nchunks`` is smaller than 2 or larger than the number
            of items (some fold would have nothing to train or test on).
    """
    if nchunks < 2:
        raise ValueError("nchunks must be at least 2, got {}".format(nchunks))

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(labeled_featuresets)
    if nchunks > len(shuffled):
        raise ValueError(
            "nchunks ({}) is larger than the number of items ({})".format(nchunks, len(shuffled))
        )
    random.shuffle(shuffled)

    # Dealing the shuffled items out round-robin uses every item exactly once
    # and never produces an empty chunk, unlike fixed-size slicing with a
    # rounded chunk length.
    data_chunks = [shuffled[start::nchunks] for start in range(nchunks)]

    results = []
    for n, test_set in enumerate(data_chunks):
        # The training set is rebuilt from scratch for every fold; carrying it
        # over between folds would leak the current test chunk into training.
        train_set = [
            item
            for m, chunk in enumerate(data_chunks)
            if m != n
            for item in chunk
        ]
        results.append(test_classifier(train_set, test_set))

    average_accuracy = sum(res["accuracy"] for res in results) / len(results)
    average_kappa = sum(res["kappa"] for res in results) / len(results)
    print("average accuracy: {}, average kappa: {}".format(round(average_accuracy, 3), round(average_kappa, 3)))

    return results, average_accuracy, average_kappa

Now we can systematically compare the different feature sets.

In [None]:
def passenger_features(r):
    """Build a three-feature dict from one data row.

    Args:
        r: mapping for a single row; must contain the keys ``"Sex"``,
            ``"Pclass"`` and ``"Embarked"``.

    Returns:
        A dict with the keys ``"sex"``, ``"pclass"`` and ``"embarked"`` holding
        the corresponding row values unchanged.
    """
    # (feature name, source column) pairs, in output order.
    columns = (("sex", "Sex"), ("pclass", "Pclass"), ("embarked", "Embarked"))
    return {feature: r[column] for feature, column in columns}

# 10-fold cross-validation using the sex / pclass / embarked feature set.
results, average_accuracy, average_kappa = cross_validate(
    nchunks=10,
    labeled_featuresets=prepare_feature_sets(pass_feature_func=passenger_features),
)

In [None]:
def sex_features(r):
    """Build a single-feature dict from one data row.

    Args:
        r: mapping for a single row; must contain the key ``"Sex"``.

    Returns:
        A dict with the one key ``"sex"`` holding the row's ``"Sex"`` value.
    """
    features = {}
    features["sex"] = r["Sex"]
    return features
# 10-fold cross-validation using only the sex feature.
results, average_accuracy, average_kappa = cross_validate(
    nchunks=10,
    labeled_featuresets=prepare_feature_sets(pass_feature_func=sex_features),
)

In [None]:
def passenger_features(r):
    """Build a two-feature dict from one data row.

    Note: this reuses the name of the three-feature ``passenger_features``
    defined in an earlier cell, so running this cell replaces that definition.

    Args:
        r: mapping for a single row; must contain the keys ``"Sex"`` and
            ``"Pclass"``.

    Returns:
        A dict with the keys ``"sex"`` and ``"pclass"`` holding the
        corresponding row values unchanged.
    """
    features = dict(sex=r["Sex"])
    features["pclass"] = r["Pclass"]
    return features

# 10-fold cross-validation using the sex / pclass feature set.
results, average_accuracy, average_kappa = cross_validate(
    nchunks=10,
    labeled_featuresets=prepare_feature_sets(pass_feature_func=passenger_features),
)