In [None]:
# from helper.functions import get_dataset_from_json
# X_dict, y_str = get_dataset_from_json("featuresQuixote_cohesive.json")

In [1]:
import nltk
import json
import numpy as np
import random
from random import shuffle
# Seed Python's global `random` generator. The `shuffle` used inside
# run_experiment_nltk draws from it, so the train/test splits only repeat when
# the cells are re-run in the same order starting from this seed.
random.seed(42)

In [2]:
def load_dataset_nltk(json_filename, folder=None):
    """Read a JSON dataset file and return the decoded Python object.

    Parameters
    ----------
    json_filename : str
        Name of the JSON file inside ``folder`` (a file name, not a full path).
    folder : str or os.PathLike, optional
        Directory that contains the file. When omitted, the ``auxfiles/json``
        directory relative to the current working directory is used, which is
        the location the previous hard-coded Windows-style prefix pointed at.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file.

    Raises
    ------
    FileNotFoundError
        If the file does not exist in ``folder``.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Local import: the notebook's import cell does not bring in pathlib.
    from pathlib import Path

    if folder is None:
        # Build the default from components so the separator matches the host
        # OS; a literal backslash-joined prefix is only a directory path on
        # Windows and becomes part of the file name elsewhere.
        folder = Path("auxfiles") / "json"

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(Path(folder) / json_filename, "r", encoding="utf-8") as f:
        return json.load(f)

In [5]:
def run_experiment_nltk(filename, folds=2):
    """Train and score one NLTK Naive Bayes classifier on a shuffled hold-out split.

    The dataset loaded from ``filename`` is shuffled in place. Its first
    ``len(dataset) // folds`` items form the test set and the remaining items
    form the training set. Only this single split is evaluated per call, so
    ``folds`` controls the size of the held-out part rather than a number of
    cross-validation rounds.

    Parameters
    ----------
    filename : str
        File name handed to ``load_dataset_nltk``.
    folds : int, default 2
        Divisor used to size the test split; must be greater than zero.

    Returns
    -------
    dict or None
        ``{"classifier": <trained classifier>, "accuracy": <test accuracy>}``,
        or ``None`` (after printing a message) when ``folds`` is not > 0.
    """
    samples = load_dataset_nltk(filename)
    # The shuffle happens before the folds check, so the global RNG state
    # advances even for a rejected call.
    shuffle(samples)

    if not folds > 0:
        print("folds must be > 0")
        return None

    holdout_size = len(samples) // folds
    test_set = samples[:holdout_size]
    train_set = samples[holdout_size:]

    model = nltk.NaiveBayesClassifier.train(train_set)
    score = nltk.classify.accuracy(model, test_set)
    return {
        "classifier": model,
        "accuracy": score,
    }

# Quixote

## First Experiment

Cohesive markers

In [38]:
# One repetition; folds=10 makes run_experiment_nltk hold out
# len(dataset) // 10 shuffled items as the test set.
filename1 = "featuresQuixote_cohesive_tf.json"
experiment1 = [
    run_experiment_nltk(filename1, folds=10)
    for _ in range(1)
]

# Report the held-out accuracy of each run.
for run in experiment1:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 1.0


In [39]:
# Mean and variance of the accuracies collected in experiment1.
accs = np.array([result["accuracy"] for result in experiment1])
print(np.mean(accs), np.var(accs))

1.0 0.0


In [40]:
# Collect the trained classifier from every run of experiment1.
classifiers1 = [run["classifier"] for run in experiment1]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only added a stray "None" line after each table.
for classifier in classifiers1:
    classifier.show_most_informative_features()

Most Informative Features
                     yet = None           Ormsby : Shelto =     22.1 : 1.0
                 however = None           Shelto : Ormsby =      4.2 : 1.0
                   since = None           Ormsby : Jarvis =      3.9 : 1.0
                in short = None           Shelto : Jarvis =      3.7 : 1.0
                although = None           Jarvis : Shelto =      3.1 : 1.0
               therefore = None           Ormsby : Jarvis =      2.8 : 1.0
                     now = None           Shelto : Ormsby =      2.7 : 1.0
                likewise = None           Ormsby : Shelto =      2.4 : 1.0
                   there = None           Jarvis : Ormsby =      2.3 : 1.0
                    thus = None           Ormsby : Shelto =      2.2 : 1.0
None


## Second Experiment

Cohesive markers and punctuation

In [70]:
# One repetition; folds=2 makes run_experiment_nltk hold out
# len(dataset) // 2 shuffled items as the test set.
filename2 = "featuresQuixote_cohesive_punctuation_tfidf.json"
experiment2 = [
    run_experiment_nltk(filename2, folds=2)
    for _ in range(1)
]

# Report the held-out accuracy of each run.
for run in experiment2:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 1.0


In [71]:
# Mean and variance of the accuracies collected in experiment2.
accs = np.array([result["accuracy"] for result in experiment2])
print(np.mean(accs), np.var(accs))

1.0 0.0


In [72]:
# Collect the trained classifier from every run of experiment2.
classifiers2 = [run["classifier"] for run in experiment2]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only added a stray "None" line after each table.
for classifier in classifiers2:
    classifier.show_most_informative_features()

Most Informative Features
                     but = None           Ormsby : Shelto =      7.4 : 1.0
                   , yet = None           Ormsby : Shelto =      6.5 : 1.0
                  ; and, = None           Ormsby : Jarvis =      6.5 : 1.0
                  , and, = None           Ormsby : Shelto =      4.7 : 1.0
                   . and = None           Ormsby : Shelto =      4.1 : 1.0
                   ; but = None           Ormsby : Jarvis =      3.9 : 1.0
                    'and = None           Jarvis : Shelto =      3.9 : 1.0
                    "and = None           Shelto : Ormsby =      3.7 : 1.0
                    'but = None           Jarvis : Shelto =      3.1 : 1.0
                    here = None           Jarvis : Shelto =      2.9 : 1.0
None


## Third Experiment

Plain unigrams

In [67]:
# One repetition; folds=2 makes run_experiment_nltk hold out
# len(dataset) // 2 shuffled items as the test set.
# NOTE: `filename` and `experiment` are rebound by later cells as well.
filename = "featuresQuixote_unigrams.json"
experiment = [
    run_experiment_nltk(filename, folds=2)
    for _ in range(1)
]

# Report the held-out accuracy of each run.
for run in experiment:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 0.8783068783068783


In [68]:
# Mean and variance of the accuracies in the current `experiment` list.
accs = np.array([result["accuracy"] for result in experiment])
print(np.mean(accs), np.var(accs))

0.8783068783068783 0.0


In [69]:
# Collect the trained classifier from every run in `experiment`. That name is
# rebound by several cells, so this inspects whichever experiment ran last.
classifiers = [run["classifier"] for run in experiment]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only added a stray "None" line after each table.
for classifier in classifiers:
    classifier.show_most_informative_features()

Most Informative Features
                    hath = None           Jarvis : Shelto =     37.3 : 1.0
                     has = None           Shelto : Ormsby =     25.4 : 1.0
                 replied = None           Shelto : Ormsby =     21.5 : 1.0
                     yet = None           Ormsby : Shelto =     21.1 : 1.0
                scarcely = 1              Jarvis : Shelto =     14.3 : 1.0
                   quoth = None           Ormsby : Shelto =     13.6 : 1.0
               exclaimed = 1              Ormsby : Jarvis =     12.4 : 1.0
                  whilst = 1              Shelto : Ormsby =     11.3 : 1.0
                   about = None           Shelto : Ormsby =     11.3 : 1.0
                   guess = 1              Jarvis : Ormsby =     10.8 : 1.0
None


In [76]:
# One repetition; folds=5 makes run_experiment_nltk hold out
# len(dataset) // 5 shuffled items as the test set.
# NOTE: `filename` and `experiment` are rebound by other cells as well.
filename = "featuresQuixote_trigrams_pos_punct_tfidf.json"
experiment = [
    run_experiment_nltk(filename, folds=5)
    for _ in range(1)
]

# Report the held-out accuracy of each run.
for run in experiment:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 1.0


In [77]:
# Mean and variance of the accuracies in the current `experiment` list.
accs = np.array([result["accuracy"] for result in experiment])
print(np.mean(accs), np.var(accs))

1.0 0.0


In [78]:
# Collect the trained classifier from every run in `experiment`. That name is
# rebound by several cells, so this inspects whichever experiment ran last.
classifiers = [run["classifier"] for run in experiment]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only added a stray "None" line after each table.
for classifier in classifiers:
    classifier.show_most_informative_features()

Most Informative Features
                NOUN . " = None           Shelto : Ormsby =     48.1 : 1.0
                , " VERB = None           Shelto : Ormsby =     31.3 : 1.0
         NOUN CCONJ VERB = None           Jarvis : Ormsby =     16.6 : 1.0
                , ' VERB = None           Jarvis : Shelto =     16.5 : 1.0
               PROPN , " = None           Shelto : Ormsby =     15.4 : 1.0
            " VERB PROPN = None           Shelto : Ormsby =     14.5 : 1.0
                , " PRON = None           Shelto : Ormsby =     12.8 : 1.0
                   . " " = None           Shelto : Ormsby =     12.0 : 1.0
              ; ADP PRON = None           Ormsby : Jarvis =     11.3 : 1.0
              " PRON AUX = None           Shelto : Jarvis =     11.1 : 1.0
None


## Fourth Experiment

Plain bigrams

In [14]:
# Twenty repetitions, each reloading and reshuffling the file; folds=5 makes
# run_experiment_nltk hold out len(dataset) // 5 items as the test set.
# NOTE: `filename` and `experiment` are rebound by other cells as well.
filename = "featuresQuixote_bigrams.json"
experiment = [
    run_experiment_nltk(filename, folds=5)
    for _ in range(20)
]

# Report the held-out accuracy of each run.
for run in experiment:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 0.7466666666666667
Accuracy: 0.6133333333333333
Accuracy: 0.72
Accuracy: 0.72
Accuracy: 0.68
Accuracy: 0.6933333333333334
Accuracy: 0.6533333333333333
Accuracy: 0.6933333333333334
Accuracy: 0.7733333333333333
Accuracy: 0.6933333333333334
Accuracy: 0.7333333333333333
Accuracy: 0.6666666666666666
Accuracy: 0.68
Accuracy: 0.6933333333333334
Accuracy: 0.64
Accuracy: 0.64
Accuracy: 0.6666666666666666
Accuracy: 0.64
Accuracy: 0.72
Accuracy: 0.6533333333333333


In [15]:
# Mean and variance of the accuracies in the current `experiment` list.
accs = np.array([result["accuracy"] for result in experiment])
print(np.mean(accs), np.var(accs))

0.6859999999999999 0.0015906666666666667


In [None]:
# Collect the trained classifier from every run in `experiment`. That name is
# rebound by several cells, so this inspects whichever experiment ran last.
classifiers = [run["classifier"] for run in experiment]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would only add a stray "None" line after each table.
for classifier in classifiers:
    classifier.show_most_informative_features()

In [39]:
# Twenty repetitions, each reloading and reshuffling the file; folds=5 makes
# run_experiment_nltk hold out len(dataset) // 5 items as the test set.
# NOTE: `filename` and `experiment` are rebound by other cells as well.
filename = "featuresQuixote_bigrams_punct.json"
experiment = [
    run_experiment_nltk(filename, folds=5)
    for _ in range(20)
]

# Report the held-out accuracy of each run.
for run in experiment:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 0.9066666666666666
Accuracy: 0.8666666666666667
Accuracy: 0.9066666666666666
Accuracy: 0.9066666666666666
Accuracy: 0.88
Accuracy: 0.92
Accuracy: 0.88
Accuracy: 0.92
Accuracy: 0.9333333333333333
Accuracy: 0.9466666666666667
Accuracy: 0.8933333333333333
Accuracy: 0.9466666666666667
Accuracy: 0.8933333333333333
Accuracy: 0.92
Accuracy: 0.8666666666666667
Accuracy: 0.92
Accuracy: 0.8
Accuracy: 0.96
Accuracy: 0.8933333333333333
Accuracy: 0.88


In [40]:
# Mean and variance of the accuracies in the current `experiment` list.
accs = np.array([result["accuracy"] for result in experiment])
print(np.mean(accs), np.var(accs))

0.9020000000000001 0.0012137777777777772


## Fifth Experiment
Plain trigrams

In [41]:
# Twenty repetitions, each reloading and reshuffling the file; folds=5 makes
# run_experiment_nltk hold out len(dataset) // 5 items as the test set.
# NOTE: `filename` and `experiment` are rebound by other cells as well.
filename = "featuresQuixote_trigrams.json"
experiment = [
    run_experiment_nltk(filename, folds=5)
    for _ in range(20)
]

# Report the held-out accuracy of each run.
for run in experiment:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 0.30666666666666664
Accuracy: 0.24
Accuracy: 0.16
Accuracy: 0.24
Accuracy: 0.12
Accuracy: 0.14666666666666667
Accuracy: 0.16
Accuracy: 0.09333333333333334
Accuracy: 0.16
Accuracy: 0.17333333333333334
Accuracy: 0.21333333333333335
Accuracy: 0.18666666666666668
Accuracy: 0.30666666666666664
Accuracy: 0.2
Accuracy: 0.30666666666666664
Accuracy: 0.21333333333333335
Accuracy: 0.18666666666666668
Accuracy: 0.12
Accuracy: 0.25333333333333335
Accuracy: 0.18666666666666668


In [42]:
# Mean and variance of the accuracies in the current `experiment` list.
accs = np.array([result["accuracy"] for result in experiment])
print(np.mean(accs), np.var(accs))

0.19866666666666669 0.0036782222222222225


In [None]:
# Collect the trained classifier from every run in `experiment`. That name is
# rebound by several cells, so this inspects whichever experiment ran last.
classifiers = [run["classifier"] for run in experiment]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would only add a stray "None" line after each table.
for classifier in classifiers:
    classifier.show_most_informative_features()

In [43]:
# Twenty repetitions, each reloading and reshuffling the file; folds=5 makes
# run_experiment_nltk hold out len(dataset) // 5 items as the test set.
# NOTE: `filename` and `experiment` are rebound by other cells as well.
filename = "featuresQuixote_trigrams_punct.json"
experiment = [
    run_experiment_nltk(filename, folds=5)
    for _ in range(20)
]

In [45]:
# Print each run's held-out accuracy rounded to two decimals. `experiment` is
# whatever list the most recently executed experiment cell produced.
for run in experiment:
    print(f"Accuracy: {run['accuracy']:.2f}")

Accuracy: 0.76
Accuracy: 0.65
Accuracy: 0.64
Accuracy: 0.67
Accuracy: 0.68
Accuracy: 0.65
Accuracy: 0.79
Accuracy: 0.65
Accuracy: 0.72
Accuracy: 0.56
Accuracy: 0.56
Accuracy: 0.72
Accuracy: 0.64
Accuracy: 0.73
Accuracy: 0.67
Accuracy: 0.72
Accuracy: 0.67
Accuracy: 0.73
Accuracy: 0.69
Accuracy: 0.63


In [46]:
# Mean and variance of the accuracies in the current `experiment` list.
accs = np.array([result["accuracy"] for result in experiment])
print(np.mean(accs), np.var(accs))

0.6766666666666665 0.0032866666666666643


# Ibsen

## First Experiment

In [8]:
# Twenty repetitions, each reloading and reshuffling the file; folds=10 makes
# run_experiment_nltk hold out len(dataset) // 10 items as the test set.
filename3 = "featuresIbsen_cohesive.json"
experiment3 = [
    run_experiment_nltk(filename3, folds=10)
    for _ in range(20)
]

# Report the held-out accuracy of each run.
for run in experiment3:
    print(f"Accuracy: {run['accuracy']}")

Accuracy: 0.6521739130434783
Accuracy: 0.8260869565217391
Accuracy: 0.8260869565217391
Accuracy: 0.7391304347826086
Accuracy: 0.6086956521739131
Accuracy: 0.6956521739130435
Accuracy: 0.782608695652174
Accuracy: 0.8260869565217391
Accuracy: 0.6956521739130435
Accuracy: 0.782608695652174
Accuracy: 0.6956521739130435
Accuracy: 0.782608695652174
Accuracy: 0.6086956521739131
Accuracy: 0.6521739130434783
Accuracy: 0.6086956521739131
Accuracy: 0.8695652173913043
Accuracy: 0.7391304347826086
Accuracy: 0.6086956521739131
Accuracy: 0.6086956521739131
Accuracy: 0.6956521739130435


In [5]:
# Mean and variance of the accuracies collected in experiment3.
accs = np.array([result["accuracy"] for result in experiment3])
print(np.mean(accs), np.var(accs))

0.7138297872340426 0.004638976912630147


In [None]:
# Collect the trained classifier from every run of experiment3.
classifiers3 = [run["classifier"] for run in experiment3]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would only add a stray "None" line after each table.
for classifier in classifiers3:
    classifier.show_most_informative_features()

In [9]:
# Twenty repetitions, each reloading and reshuffling the file; folds=10 makes
# run_experiment_nltk hold out len(dataset) // 10 items as the test set.
filename4 = "featuresIbsen_cohesive_punctuation.json"
experiment4 = [
    run_experiment_nltk(filename4, folds=10)
    for _ in range(20)
]

# Report the held-out accuracy of each run.
for run in experiment4:
    print("Accuracy:", run["accuracy"])

Accuracy: 0.9565217391304348
Accuracy: 0.9565217391304348
Accuracy: 0.8260869565217391
Accuracy: 0.7391304347826086
Accuracy: 0.9565217391304348
Accuracy: 0.8695652173913043
Accuracy: 0.9565217391304348
Accuracy: 1.0
Accuracy: 0.8260869565217391
Accuracy: 0.9130434782608695
Accuracy: 0.8695652173913043
Accuracy: 0.8695652173913043
Accuracy: 0.8695652173913043
Accuracy: 0.8260869565217391
Accuracy: 0.9130434782608695
Accuracy: 0.8695652173913043
Accuracy: 0.8695652173913043
Accuracy: 0.8260869565217391
Accuracy: 1.0
Accuracy: 0.8260869565217391


In [11]:
# Mean and standard deviation of the accuracies collected in experiment4.
# NOTE: the other summary cells in this notebook print the variance instead.
accs = np.array([result["accuracy"] for result in experiment4])
print(np.mean(accs), np.std(accs))

0.8869565217391303 0.0665089501772972


In [None]:
# Collect the trained classifier from every run of experiment4.
classifiers4 = [run["classifier"] for run in experiment4]

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would only add a stray "None" line after each table.
for classifier in classifiers4:
    classifier.show_most_informative_features()