<a href="https://colab.research.google.com/github/ccaballeroh/Translator-Attribution/blob/master/03Most_important.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path
import sys

IN_COLAB = "google.colab" in sys.modules

In [2]:
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')
    ROOT = Path(r"./drive/My Drive/Translator-Attribution")
    sys.path.insert(0,f"{ROOT}/")
    import warnings
    warnings.filterwarnings("ignore")
else:
    from helper.analysis import ROOT

In [18]:
from collections import defaultdict
from helper.analysis import get_dataset_from_json
from helper.analysis import JSON_FOLDER
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.utils.multiclass import unique_labels
from typing import Dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(font_scale=1.4)

In [26]:
RESULTS_FOLDER = Path(r"./results/")
if not RESULTS_FOLDER.exists():
    RESULTS_FOLDER.mkdir()

FIGS_FOLDER = RESULTS_FOLDER / "figs"
if not FIGS_FOLDER.exists():
    FIGS_FOLDER.mkdir()


features_files = [file for file in JSON_FOLDER.iterdir() if file.name.startswith("features")]

In [25]:
for author in ["Ibsen", "Quixote"]:
    for file in [file for file in features_files if author in file.name]:
        X_dict, y_str = get_dataset_from_json(file)
        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()
        
        X, y = v.fit_transform(X_dict), encoder.fit_transform(y_str)
        
        X_, y_ = shuffle(X, y, random_state=24)

        #log_model = LogisticRegression()
        #cv = cross_val_score(log_model, X_, y_, cv=10)
        
        print(
            f"{author} {' '.join(file.stem.split('_')[1:])}\t"
            f"Cross-validation mean accuracy: {cv.mean():.2%}\t" 
            f"with a standard deviation: {cv.std():.2%}"
            )
        
        log_model = LogisticRegression()
        y_pred = cross_val_predict(log_model, X_, y_, cv=10)
        cm = confusion_matrix(y_, y_pred, labels=unique_labels(y_))

        df = pd.DataFrame(cm, index=encoder.classes_, columns=encoder.classes_)

        cm_plot = sns.heatmap(df, annot=True, cbar=None, cmap="Blues", fmt="d", annot_kws={"size":18})
        plt.title(f"{' '.join(file.stem.split('_')[1:])}")
        plt.tight_layout()
        plt.ylabel("True translator")
        plt.xlabel("Predicted translator")
        plt.savefig(FIGS_FOLDER/f"cm_{file.stem}.png", bbox_inches="tight", )
        plt.clf()
        

Ibsen syntactic n2	Cross-validation mean accuracy: 98.71%	with a standard deviation: 1.97%
Ibsen syntactic n3	Cross-validation mean accuracy: 98.71%	with a standard deviation: 1.97%


<Figure size 432x288 with 0 Axes>

In [6]:
clf = log_model.fit(X_, y_)

In [7]:
from helper.utils import return_n_most_important

In [8]:
n = 30
most_relevant = return_n_most_important(clf=clf, v=v, encoder=encoder, n=n)

In [9]:
for label in unique_labels(y_str):
    print(f"The {n} most relevant {' '.join(file.stem.split('_')[1:])} features for {label} are:\n\n {'|'.join(most_relevant[label])}\n\n")

The 30 most relevant 2grams features for Jarvis are:

 answered sancho|the priest|answered don|in short|answered the|signor don|your worship|you are|a mind|de la|do not|quixote de|so much|upon the|of your|by the|mind to|sancho panza|to be|i will|that of|of my|sancho answered|you have|i am|an account|and therefore|priest and|and though|those who


The 30 most relevant 2grams features for Ormsby are:

 said sancho|on the|do n't|said don|of la|at once|will be|the curate|said the|your worship|quixote of|to him|senor don|has been|no doubt|it was|such a|to say|as he|i 'm|did not|there was|and as|that is|to me|have been|as the|thou art|wo n't|as a


The 30 most relevant 2grams features for Shelton are:

 quoth don|quoth the|that he|quoth sancho|that i|he that|that which|he hath|unto him|hath been|of them|and that|and therefore|sancho quoth|waiting women|those that|that thou|that hath|that you|as he|which i|i 'll|it were|that they|thou shalt|i would|of all|he might|signior don|so that


