In [None]:
import pandas as pd
import nltk

## These are the same procedures we defined in the last notebook

In [None]:
import csv
import random
import sklearn
from sklearn.metrics import cohen_kappa_score

def prepare_feature_sets(pass_feature_func, csv_path="corpora/titanic/train.csv"):
    """Load the passenger CSV and turn each row into a labeled feature set.

    Args:
        pass_feature_func: Callable that receives one CSV row (a dict keyed by
            column name, as produced by ``csv.DictReader``) and returns the
            features for that row.
        csv_path: Location of the CSV file to read. Defaults to the Titanic
            training data used throughout this notebook. The file must have a
            ``Survived`` column, which supplies the label.

    Returns:
        A list of ``(features, label)`` pairs in random order, where ``label``
        is the row's ``Survived`` value exactly as read from the file (a string).

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a row has no ``Survived`` column.
    """
    # The context manager closes the file even if parsing fails; newline=""
    # is how the csv module expects its input files to be opened.
    with open(csv_path, newline="") as csvfile:
        rows = list(csv.DictReader(csvfile))
    # Shuffle so any ordering present in the file does not carry over.
    random.shuffle(rows)
    return [(pass_feature_func(row), row["Survived"]) for row in rows]

def test_classifier(train_set, test_set):
    classif = nltk.NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(classif, test_set)
    gold_list = [t[1] for t in test_set]
    test_list = [classif.classify(t[0]) for t in test_set]
    kappa = cohen_kappa_score(gold_list, test_list)
    return {"accuracy": accuracy, "kappa": kappa}

def cross_validate(nchunks, labeled_featuresets):
    """Run ``nchunks``-fold cross-validation using ``test_classifier``.

    The data is shuffled and split into ``nchunks`` folds. Each fold is held
    out once as the test set while a classifier is trained on all the other
    folds. The averaged scores are printed and returned.

    Args:
        nchunks: Number of folds. Must be at least 2 and no larger than the
            number of examples, so every fold has both training and test data.
        labeled_featuresets: Sequence of ``(features, label)`` pairs. It is
            copied before shuffling, so the caller's sequence is not modified.

    Returns:
        A tuple ``(results, average_accuracy, average_kappa)`` where
        ``results`` holds the dict returned by ``test_classifier`` for each
        fold, in fold order.

    Raises:
        ValueError: If ``nchunks`` is smaller than 2 or larger than the number
            of examples.
    """
    n_items = len(labeled_featuresets)
    if nchunks < 2 or nchunks > n_items:
        raise ValueError(
            "nchunks must be between 2 and the number of examples "
            "({}), got {}".format(n_items, nchunks)
        )

    # Work on a shuffled copy so the caller's list keeps its order.
    shuffled = list(labeled_featuresets)
    random.shuffle(shuffled)

    # Striding puts every example in exactly one fold and keeps fold sizes
    # within one of each other, so nothing is dropped and no fold is empty.
    folds = [shuffled[offset::nchunks] for offset in range(nchunks)]

    results = []
    for held_out, test_set in enumerate(folds):
        # Rebuild the training data from scratch for every fold; carrying it
        # over between folds would leak earlier test chunks into training.
        train_set = [
            item
            for index, fold in enumerate(folds)
            if index != held_out
            for item in fold
        ]
        results.append(test_classifier(train_set, test_set))

    average_accuracy = sum(res["accuracy"] for res in results) / len(results)
    average_kappa = sum(res["kappa"] for res in results) / len(results)
    print("average accuracy: {}, average kappa: {}".format(round(average_accuracy, 3), round(average_kappa, 3)))

    return results, average_accuracy, average_kappa

In [None]:
def passenger_features(r):
    """Build the feature dict for one passenger record.

    Args:
        r: Mapping with at least the keys ``"Sex"``, ``"Pclass"`` and
            ``"Embarked"``.

    Returns:
        A dict with the lower-case keys ``"sex"``, ``"pclass"`` and
        ``"embarked"`` holding the corresponding values from ``r``.
    """
    # (feature name, source column) pairs, in the order the features appear.
    feature_columns = (("sex", "Sex"), ("pclass", "Pclass"), ("embarked", "Embarked"))
    return {feature: r[column] for feature, column in feature_columns}

# 10-fold cross-validation on the sex / class / embarkation features.
# cross_validate prints the averaged scores; the per-fold results and the
# two averages are kept here for later inspection.
results, average_accuracy, average_kappa = cross_validate(10, prepare_feature_sets(passenger_features))

## Classification algorithms

In general, there are two ways we can make our classifiers better. One is "feature engineering": improving the feature sets that we feed into an algorithm. The second is choosing the right algorithm. Naive Bayes is just one of the many algorithms that exist.

Once we have set up the machinery above, using a different algorithm can often be very easy. To use the decision tree algorithm we just have to change the first line in our `test_classifier` procedure.

### Decision Tree Classifiers

In [None]:
def test_classifier(train_set, test_set):
    classif = nltk.DecisionTreeClassifier.train(train_set)  # This is key line
    print(classif.pseudocode(depth=4))
    accuracy = nltk.classify.accuracy(classif, test_set)
    gold_list = [t[1] for t in test_set]
    test_list = [classif.classify(t[0]) for t in test_set]
    kappa = cohen_kappa_score(gold_list, test_list)
    return {"accuracy": accuracy, "kappa": kappa}

In [None]:
def passenger_features(r):
    """Build the feature dict for one passenger record.

    Args:
        r: Mapping with at least the keys ``"Sex"``, ``"Pclass"`` and
            ``"Embarked"``.

    Returns:
        A dict with the lower-case keys ``"sex"``, ``"pclass"`` and
        ``"embarked"`` holding the corresponding values from ``r``.
    """
    # (feature name, source column) pairs, in the order the features appear.
    feature_columns = (("sex", "Sex"), ("pclass", "Pclass"), ("embarked", "Embarked"))
    return {feature: r[column] for feature, column in feature_columns}

# Same 10-fold run as before; cross_validate looks up test_classifier at call
# time, so this now evaluates the decision tree version defined above.
results, average_accuracy, average_kappa = cross_validate(10, prepare_feature_sets(passenger_features))

### Support Vector Machine Classifiers

NLTK has a limited number of classification algorithms built in. But it has functionality that
makes it relatively easy to use classifiers from other libraries. 

In particular, it can connect to [scikit-learn](https://scikit-learn.org/stable/supervised_learning.html), often abbreviated *sklearn*.

Here we'll use the **Support Vector Machine** algorithm from sklearn. Again, this requires changing the first line in `test_classifier`. We also have to import the needed machinery from NLTK and sklearn.

In [None]:
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

def test_classifier(train_set, test_set):
    classif = SklearnClassifier(SVC(), sparse=False).train(train_set)
    accuracy = nltk.classify.accuracy(classif, test_set)
    gold_list = [t[1] for t in test_set]
    test_list = [classif.classify(t[0]) for t in test_set]
    kappa = cohen_kappa_score(gold_list, test_list)
    return {"accuracy": accuracy, "kappa": kappa}

In [None]:
def passenger_features(r):
    """Build the feature dict for one passenger record.

    Args:
        r: Mapping with at least the keys ``"Sex"``, ``"Pclass"`` and
            ``"Embarked"``.

    Returns:
        A dict with the lower-case keys ``"sex"``, ``"pclass"`` and
        ``"embarked"`` holding the corresponding values from ``r``.
    """
    # (feature name, source column) pairs, in the order the features appear.
    feature_columns = (("sex", "Sex"), ("pclass", "Pclass"), ("embarked", "Embarked"))
    return {feature: r[column] for feature, column in feature_columns}

# Same 10-fold run as before; cross_validate looks up test_classifier at call
# time, so this now evaluates the SVM version defined above.
results, average_accuracy, average_kappa = cross_validate(10, prepare_feature_sets(passenger_features))