<a href="https://colab.research.google.com/github/ccaballeroh/Translator-Attribution/blob/master/03Most_important.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path
import sys

# True when the `google.colab` module is already loaded in this interpreter;
# the notebook uses that as the signal that it is running on Google Colab.
IN_COLAB = "google.colab" in sys.modules

In [2]:
if IN_COLAB:
    import warnings

    from google.colab import drive

    # Mount Google Drive and derive the project root from the absolute mount
    # point, so ROOT and the sys.path entry below stay valid even if the
    # working directory changes later in the session.
    DRIVE_MOUNT_POINT = "/content/drive"
    drive.mount(DRIVE_MOUNT_POINT)
    ROOT = Path(DRIVE_MOUNT_POINT) / "My Drive" / "Translator-Attribution"

    # Make the project's `helper` package (stored in the Drive checkout)
    # importable by the cells that follow.
    sys.path.insert(0, str(ROOT))

    # NOTE(review): this hides every warning on Colab, including any that the
    # models fitted later in the notebook may emit.
    warnings.filterwarnings("ignore")
else:
    # Outside Colab the `helper` package must already be importable; it
    # supplies the project root.
    from helper.analysis import ROOT

In [3]:
from collections import defaultdict
from helper.analysis import get_dataset_from_json
from helper.analysis import JSON_FOLDER
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from sklearn.utils.multiclass import unique_labels
from typing import Dict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
# Output tree created under <ROOT>/results:
#   tables/    -> tables exported as CSV / LaTeX / HTML
#   figs/cm/   -> confusion-matrix heatmaps
#   figs/most/ -> bar plots of the most relevant features
RESULTS_FOLDER = Path(ROOT) / "results"
TABLES_FOLDER = RESULTS_FOLDER / "tables"
FIGS_FOLDER = RESULTS_FOLDER / "figs"
CONF_MAT_FOLDER = FIGS_FOLDER / "cm"
MOST_RELEVANT_FOLDER = FIGS_FOLDER / "most"

# Creating the three leaf folders with parents=True also creates
# RESULTS_FOLDER and FIGS_FOLDER (and any missing ancestor of ROOT/results);
# exist_ok=True makes re-running this cell a no-op.
for folder in (TABLES_FOLDER, CONF_MAT_FOLDER, MOST_RELEVANT_FOLDER):
    folder.mkdir(parents=True, exist_ok=True)

# iterdir() yields entries in arbitrary order; sort so the files are always
# processed in the same sequence.
features_files = sorted(
    file for file in JSON_FOLDER.iterdir() if file.name.startswith("features")
)

# Confusion Matrices

In [5]:
# For every features file of each author: cross-validate a logistic
# regression, build the confusion matrix of the out-of-fold predictions and
# save it as a heatmap in CONF_MAT_FOLDER.
sns.set(font_scale=1.4)
for author in ["Ibsen", "Quixote"]:
    for file in [file for file in features_files if author in file.name]:
        X_dict, y_str = get_dataset_from_json(file)
        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()

        X, y = v.fit_transform(X_dict), encoder.fit_transform(y_str)

        # An integer `cv` gives unshuffled folds, so shuffle the samples once
        # up front with a fixed seed to keep the run reproducible.
        X_, y_ = shuffle(X, y, random_state=24)

        log_model = LogisticRegression()

        y_pred = cross_val_predict(log_model, X_, y_, cv=10)
        cm = confusion_matrix(y_, y_pred, labels=unique_labels(y_))

        # Rows are true labels, columns are predicted labels; both are shown
        # with the original translator names instead of the encoded integers.
        df = pd.DataFrame(cm, index=encoder.classes_, columns=encoder.classes_)

        # Draw on a dedicated figure instead of pyplot's implicit current one.
        fig, ax = plt.subplots()
        sns.heatmap(
            df,
            annot=True,
            cbar=False,
            cmap="Blues",
            fmt="d",
            annot_kws={"size": 18},
            ax=ax,
        )
        ax.set_title(" ".join(file.stem.split("_")[1:]))
        ax.set_ylabel("True translator")
        ax.set_xlabel("Predicted translator")
        # Lay out only after the title and labels exist so they are accounted for.
        fig.tight_layout()
        fig.savefig(CONF_MAT_FOLDER / f"cm_{file.stem}.png", bbox_inches="tight")
        # Close rather than clear, so no empty figure is left as cell output.
        plt.close(fig)
        

<Figure size 432x288 with 0 Axes>

# Most important features per translator

In [6]:
from helper.utils import return_n_most_important

# Switch seaborn's global plot style to "whitegrid" for the figures drawn
# from this point on.
sns.set_style("whitegrid")

def plot_most_relevant(
    *, data: pd.DataFrame, translator: str, model: str, file: Path
) -> None:
    """Save a bar plot of the most relevant features for one translator.

    The plot is drawn on its own figure, written to MOST_RELEVANT_FOLDER as
    ``{file.stem}_{translator}_{model}.png`` and then closed, so any figure
    that was current before the call is left untouched.

    Parameters
    ----------
    data
        Container indexed by translator name; ``data[translator]`` must
        expose a "Weight" and a "Feature" column.
    translator
        Key into ``data``; also used in the title and the file name.
    model
        Display name of the model; used in the title and the file name.
    file
        Features file the data came from. Its stem is used in the file name,
        and the stem minus its first ``_``-separated token is used in the
        title.
    """
    # pyplot is re-imported locally so the function does not depend on the
    # name being bound by another cell.
    import matplotlib.pyplot as plt

    ranking = data[translator]
    fig, ax = plt.subplots()
    try:
        sns.barplot(
            x=ranking["Weight"], y=ranking["Feature"], palette="cividis", ax=ax,
        )
        features = " ".join(file.stem.split("_")[1:])
        ax.set(title=f"{translator} - {model} - {features}")
        fig.savefig(
            MOST_RELEVANT_FOLDER / f"{file.stem}_{translator}_{model}.png",
            bbox_inches="tight",
        )
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

def save_tables(*, df:pd.DataFrame, translator:str, file:Path, model_name:str)-> None:
    """Export ``df`` to TABLES_FOLDER as CSV, LaTeX and HTML.

    The three files share the stem ``{file.stem}_{translator}_{model_name}``
    and differ only in extension (``.csv``, ``.tex``, ``.html``). Floats are
    written with four decimal places. The ``.tex`` and ``.html`` files are
    written as UTF-8 explicitly, so non-ASCII feature strings are handled the
    same way on every platform.

    Parameters
    ----------
    df
        Table to export.
    translator
        Translator name, used in the output file names.
    file
        Features file the table was derived from; only its stem is used.
    model_name
        Display name of the model, used in the output file names.
    """
    stem = f"{file.stem}_{translator}_{model_name}"

    df.to_csv(TABLES_FOLDER / f"{stem}.csv", float_format='%.4f')

    latex = df.to_latex(float_format=lambda x: '%.4f' % x)
    (TABLES_FOLDER / f"{stem}.tex").write_text(latex, encoding="utf-8")

    html = df.to_html(float_format='%.4f')
    (TABLES_FOLDER / f"{stem}.html").write_text(html, encoding="utf-8")

In [7]:
# Estimator class for each model display name. The display names also appear
# in plot titles and output file names, so they must not be altered.
model_factories = {
    "LogisticRegression": LogisticRegression,
    "SVM": LinearSVC,
    "Naïve Bayes": MultinomialNB,
}
model_names = list(model_factories)

for author in ["Ibsen", "Quixote"]:
    for file in [file for file in features_files if author in file.name]:
        # Loading, vectorising and shuffling do not depend on the model, so
        # they are done once per file rather than once per (model, file) pair.
        X_dict, y_str = get_dataset_from_json(file)

        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()

        X, y = v.fit_transform(X_dict), encoder.fit_transform(y_str)

        X_, y_ = shuffle(X, y, random_state=24)

        for model_name in model_names:
            # Fresh, default-configured estimator for every fit.
            model = model_factories[model_name]()
            clf = model.fit(X_, y_)

            # NOTE(review): `v` and `encoder` are now shared by the three
            # models of a file; this assumes return_n_most_important only
            # reads them -- confirm against helper.utils.
            most_relevant = return_n_most_important(clf=clf, v=v, encoder=encoder, n=15)

            # One plot and one set of tables per translator.
            for translator in encoder.classes_:
                plot_most_relevant(data=most_relevant, translator=translator, model=model_name, file=file)
                df = most_relevant[translator]
                save_tables(df=df, translator=translator, file=file, model_name=model_name)
            

<Figure size 432x288 with 0 Axes>