<a href="https://colab.research.google.com/github/ccaballeroh/Translator-Attribution/blob/master/03Most_important.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from pathlib import Path
import sys

# True when the `google.colab` module is already loaded in this interpreter;
# the next cell uses this flag to choose between the Colab and local setup.
IN_COLAB = "google.colab" in sys.modules

In [0]:
# Define ROOT for the current environment.
if not IN_COLAB:
    # Local run: the project root is provided by the helper package.
    from helper.analysis import ROOT
else:
    import warnings

    from google.colab import drive

    # Mount Google Drive, where the project folder is expected to live.
    drive.mount('/content/drive/')
    # NOTE(review): this path is relative, so it only lines up with the mount
    # point above when the working directory is /content -- confirm.
    ROOT = Path(r"./drive/My Drive/Translator-Attribution")
    # Put the project folder first on the import path, presumably so that the
    # `helper` imports in the following cells resolve.
    sys.path.insert(0,f"{ROOT}/")
    # Silence every warning for the rest of the session.
    warnings.filterwarnings("ignore")

In [0]:
from collections import defaultdict
from helper.analysis import get_dataset_from_json
from helper.analysis import JSON_FOLDER
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.utils.multiclass import unique_labels
from typing import Dict
import numpy as np

In [0]:
# Every entry of JSON_FOLDER whose file name begins with "features".
features_files = [
    path
    for path in JSON_FOLDER.iterdir()
    if path.name.startswith("features")
]

In [0]:
# Keep only the feature files whose name contains all three keywords.
filtered_files = [
    path
    for path in features_files
    if all(keyword in path.name for keyword in ("Quixote", "cohesive", "punct"))
]

In [0]:
# Bare expression: shows the selected files as the notebook cell's output.
filtered_files

In [0]:
# Work with the first matching file only.
# NOTE: raises IndexError when no file matched the filter above.
file = filtered_files[0]

In [0]:
# Load the data set stored in the selected JSON file.
# Presumably X_dict holds one feature dictionary per sample and y_str the
# matching string labels (they are fed to DictVectorizer and LabelEncoder
# below) -- confirm against helper.analysis.get_dataset_from_json.
X_dict, y_str = get_dataset_from_json(file)

In [0]:
# Label encoder for the string targets and a sparse-output vectorizer for the
# feature dictionaries; both are fitted in the next step.
encoder = LabelEncoder()
v = DictVectorizer(sparse=True)

In [0]:
# Fit both transformers and turn the raw data into a feature matrix and
# integer class labels.
X = v.fit_transform(X_dict)
y = encoder.fit_transform(y_str)

In [0]:
# Shape of the feature matrix: m rows and n columns.
m, n = X.shape
dimension = n

# Shuffle samples and labels together with a fixed seed.
X_, y_ = shuffle(X, y, random_state=24)

In [0]:
# 10-fold cross-validation of a default logistic regression model.
log_model = LogisticRegression()
cv = cross_val_score(estimator=log_model, X=X_, y=y_, cv=10)

# Summarise the per-fold scores.
mean_accuracy, accuracy_std = cv.mean(), cv.std()
print(f"Cross-validation mean accuracy: {mean_accuracy:.2%} with a standard deviation: {accuracy_std:.2%}")

In [0]:
# Fresh model: collect the cross-validated prediction for every sample.
log_model = LogisticRegression()
y_pred = cross_val_predict(estimator=log_model, X=X_, y=y_, cv=10)

# Overall accuracy of those predictions.
print(accuracy_score(y_true=y_, y_pred=y_pred, normalize=True))

# Confusion matrix with one row/column per label present in y_.
cm = confusion_matrix(y_true=y_, y_pred=y_pred, labels=unique_labels(y_))

In [0]:
import matplotlib.pyplot as plt
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current pyplot figure.

    Parameters
    ----------
    cm : array-like
        Two-dimensional confusion matrix, indexed as ``cm[i, j]``; rows are
        labelled "True label" and columns "Predicted label" on the plot.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, default False
        When True, every row is divided by its sum before the matrix is
        printed and plotted, and the cell labels show two decimals.
    title : str, default 'Confusion matrix'
        Title of the plot.
    cmap : colormap, default ``plt.cm.Blues``
        Colormap handed to ``plt.imshow``.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so that the image, the colour bar, the printed
    # matrix and the cell labels all show the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row without any sample sums to zero; leave it at 0 instead of
        # producing NaN, which would also break the colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Two decimals keep normalised labels readable; un-normalised values
        # are passed through unchanged, exactly as before.
        cell_label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

# Plot the raw (un-normalised) confusion matrix, using the encoder's class
# names as tick labels.
plot_confusion_matrix(cm, encoder.classes_)

In [0]:
# Train the classifier on the whole shuffled data set; the fitted model is
# inspected below for its most important features.
clf = log_model.fit(X_, y_)

In [0]:
from helper.utils import return_n_most_important

In [0]:
# Number of features to report per class. This rebinds `n`, which earlier
# held the number of columns of X.
n = 30
most_relevant = return_n_most_important(
    n=n,
    encoder=encoder,
    v=v,
    clf=clf,
)

In [0]:
# Feature description taken from the file name: everything after the first
# underscore-separated token of the stem, joined with spaces.
feature_kind = ' '.join(file.stem.split('_')[1:])

# One report per distinct label, with its features separated by "|".
for label in unique_labels(y_str):
    relevant = '|'.join(most_relevant[label])
    print(f"The {n} most relevant {feature_kind} features for {label} are:\n\n {relevant}\n\n")