In [180]:
from collections import defaultdict
from helper.analysis import get_dataset_from_json
from helper.analysis import JSON_FOLDER
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.utils.multiclass import unique_labels
from typing import Dict
import numpy as np

In [33]:
# Collect every entry of the JSON folder whose name starts with "features".
# Path.iterdir() yields entries in arbitrary order, so the result is sorted to keep
# any later positional indexing reproducible across runs and platforms.
features_files = sorted(file for file in JSON_FOLDER.iterdir() if file.name.startswith("features"))

In [183]:
# Keep only the feature files whose name contains every one of the required tokens.
filtered_files = [
    path
    for path in features_files
    if all(token in path.name for token in ("Quixote", "cohesive", "punct"))
]

In [184]:
# Display the matched paths to confirm what the name filter selected.
filtered_files

[WindowsPath('auxfiles/json/featuresQuixote_cohesive_punct.json')]

In [185]:
# Fail with an explicit message rather than a bare "list index out of range"
# when no feature file passed the name filter.
if not filtered_files:
    raise IndexError("filtered_files is empty: no feature file matched the name filter")
# Use the first match.
file = filtered_files[0]

In [186]:
# Load the dataset from the selected JSON file.
# Presumably X_dict holds one feature mapping per sample and y_str the matching string
# labels (they are passed to DictVectorizer / LabelEncoder) — confirm against the helper.
X_dict, y_str = get_dataset_from_json(file)

In [187]:
# Vectorizer for the feature dicts (sparse output) and encoder for the string labels.
v, encoder = DictVectorizer(sparse=True), LabelEncoder()

In [188]:
# Fit each transformer on the full dataset and keep its encoded output.
X = v.fit_transform(X_dict)
y = encoder.fit_transform(y_str)

In [189]:
# Shuffle features and labels together; the fixed seed keeps the permutation reproducible.
X_, y_ = shuffle(X, y, random_state=24)

# Shape of the feature matrix: m rows by n columns; `dimension` is the column count.
m, n = X.shape
dimension = n

In [197]:
# Score a fresh logistic regression with 10-fold cross-validation on the shuffled data.
log_model = LogisticRegression()
cv = cross_val_score(estimator=log_model, X=X_, y=y_, cv=10)
# Summarise the per-fold accuracies as mean and standard deviation.
print(f"Cross-validation mean accuracy: {cv.mean():.2%} with a standard deviation: {cv.std():.2%}")

Cross-validation mean accuracy: 97.37% with a standard deviation: 2.61%


In [199]:
# Collect out-of-fold predictions from 10-fold cross-validation with a fresh model.
log_model = LogisticRegression()
y_pred = cross_val_predict(estimator=log_model, X=X_, y=y_, cv=10)
# Overall accuracy of the out-of-fold predictions, as a fraction.
print(accuracy_score(y_true=y_, y_pred=y_pred, normalize=True))
# Confusion matrix over the labels present in y_ (displayed as the cell output).
confusion_matrix(y_true=y_, y_pred=y_pred, labels=unique_labels(y_))

0.9735449735449735


array([[122,   2,   2],
       [  2, 124,   0],
       [  3,   1, 122]], dtype=int64)

In [192]:
# Fit a fresh logistic regression on the whole shuffled dataset.
log_model = LogisticRegression()
clf = log_model.fit(X=X_, y=y_)

In [169]:
from helper.utils import return_n_most_important

In [170]:
# Number of features to request per label.
# NOTE(review): this rebinds `n`, which an earlier cell set to the column count of X.
n = 30
# The result is indexed by label further down; the helper's internals are not
# visible here — presumably it ranks features by the fitted model's coefficients.
most_relevant = return_n_most_important(
    clf=clf,
    v=v,
    encoder=encoder,
    n=n,
)

In [171]:
# Description of the feature set: the file stem without its first "_"-separated chunk.
feature_kind = " ".join(file.stem.split("_")[1:])
for label in unique_labels(y_str):
    # Features for this label, joined with "|" for compact printing.
    top_features = "|".join(most_relevant[label])
    print(f"The {n} most relevant {feature_kind} features for {label} are:\n\n {top_features}\n\n")

The 30 most relevant cohesive punct features for Archer are:

 too|. but,|: and|. that is|: but|) and|now.|, before|. now|before)|. there|again)|that is|. and,|! but|, after|too.|) now|--but|; and then|here|, then?|here.|, then.|still,|. certainly.|. indeed|(after|here--|: but,


The 30 most relevant cohesive punct features for Sharp are:

 ] and|] but|. then|] then|then,|--then|] now|[after|] there|before.|yet|[still|. and then|last.|, far|again!|first|then.|] here!|! and|; and|there.|now!|then?|certainly|then|, indeed.|to the left.|third|--there


