<a href="https://colab.research.google.com/github/ccaballeroh/Translator-Attribution/blob/master/02Experiments_Ibsen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (Optional)

If running in Google Colab, execute the following cells.

In [None]:
from pathlib import Path
import sys

# True when the `google.colab` module is already loaded, i.e. the notebook runs inside Google Colab.
IN_COLAB = "google.colab" in sys.modules

In [None]:
if IN_COLAB:
    from google.colab import drive

    # Mount Google Drive and derive the project root from the (absolute) mount
    # point, so that neither ROOT nor the sys.path entry depends on the
    # current working directory.
    MOUNT_POINT = "/content/drive/"
    drive.mount(MOUNT_POINT, force_remount=True)
    ROOT = Path(MOUNT_POINT) / "My Drive" / "Translator-Attribution"
    # Make the project's `helper` package importable.
    sys.path.insert(0, f"{ROOT}/")

In [None]:
if IN_COLAB:
  !pip install spacy==2.2.2
  !python -m spacy download en_core_web_md
else:
  try:
    import spacy
    nlp = spacy.load("en_core_web_md")
  except:
    !python -m spacy download en_core_web_md

# Retrieving Processed Documents from Disk

We can resume the pipeline at this step by loading the already-processed documents from disk.

If you just processed the documents, you can skip this cell and go straight to the feature-extraction sections below.

In [None]:
from helper import ROOT
from pathlib import Path
import pickle
import platform


# Folder that holds the pickled, already-processed documents.
PICKLE = Path(fr"{ROOT}/auxfiles/pickle/")

author = "Ibsen"
# The pickle file name carries both the author and the operating-system name.
pickle_file = PICKLE / f"{author}_{platform.system()}.pickle"
with pickle_file.open("rb") as f:
    doc_data = f.read()
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by this project.
docs = pickle.loads(doc_data)

## Extract features from parallel corpus (i.e., *Ghosts*)

In [None]:
from helper.analysis import save_dataset_to_json


# Documents of the parallel corpus, selected once and reused for every feature set below.
ghosts_docs = [doc for doc in docs if "Ghosts" in doc.filename]

# syntactic n-grams with n in {2, 3}
for n in range(2, 4):
    FILE_TEMPLATE = f"Ibsen_Ghosts_syntactic_n{n}"
    save_dataset_to_json(
        [
            (doc.n_grams_syntactic(n=n, propn=False), doc.translator)
            for doc in ghosts_docs
        ],
        FILE_TEMPLATE,
    )

for punct in [True, False]:
    # word n-grams with and without punctuation with n in {1, 2, 3}
    for n in range(1, 4):
        FILE_TEMPLATE = f"Ibsen_Ghosts_{n}grams{'_punct' if punct else ''}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=False, propn=False), doc.translator)
                for doc in ghosts_docs
            ],
            FILE_TEMPLATE,
        )
    # POS n-grams with and without punctuation with n in {2, 3}
    for n in range(2, 4):
        FILE_TEMPLATE = f"Ibsen_Ghosts_{n}gramsPOS{'_punct' if punct else ''}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in ghosts_docs
            ],
            FILE_TEMPLATE,
        )
    # Cohesive markers with and without punctuation (a single feature set per
    # `punct` value, so no inner loop is needed)
    FILE_TEMPLATE = f"Ibsen_Ghosts_cohesive{'_punct' if punct else ''}"
    save_dataset_to_json(
        [
            (doc.cohesive(punct=punct), doc.translator)
            for doc in ghosts_docs
        ],
        FILE_TEMPLATE,
    )

## Extract features from non-parallel corpus (i.e., the other plays)

In [None]:
from helper.analysis import save_dataset_to_json


# Documents of the non-parallel corpus (every play except *Ghosts*), selected
# once and reused for every feature set below.
others_docs = [doc for doc in docs if "Ghosts" not in doc.filename]

# syntactic n-grams with n in {2, 3}
for n in range(2, 4):
    FILE_TEMPLATE = f"Ibsen_Others_syntactic_n{n}"
    save_dataset_to_json(
        [
            (doc.n_grams_syntactic(n=n, propn=False), doc.translator)
            for doc in others_docs
        ],
        FILE_TEMPLATE,
    )

for punct in [True, False]:
    # word n-grams with and without punctuation with n in {1, 2, 3}
    for n in range(1, 4):
        FILE_TEMPLATE = f"Ibsen_Others_{n}grams{'_punct' if punct else ''}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=False, propn=False), doc.translator)
                for doc in others_docs
            ],
            FILE_TEMPLATE,
        )
    # POS n-grams with and without punctuation with n in {2, 3}
    for n in range(2, 4):
        FILE_TEMPLATE = f"Ibsen_Others_{n}gramsPOS{'_punct' if punct else ''}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in others_docs
            ],
            FILE_TEMPLATE,
        )
    # Cohesive markers with and without punctuation (a single feature set per
    # `punct` value, so no inner loop is needed)
    FILE_TEMPLATE = f"Ibsen_Others_cohesive{'_punct' if punct else ''}"
    save_dataset_to_json(
        [
            (doc.cohesive(punct=punct), doc.translator)
            for doc in others_docs
        ],
        FILE_TEMPLATE,
    )

## Experiments

In [None]:
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import sys
import warnings
warnings.filterwarnings("ignore")

In [None]:
from helper.analysis import get_dataset_from_json
from helper.analysis import JSON_FOLDER

In [None]:
# Feature-set files for Ibsen. `Path.iterdir` yields entries in arbitrary order,
# so the list is sorted to make processing order (and result-table row order)
# reproducible across runs and platforms.
features_files = sorted(
    file for file in JSON_FOLDER.iterdir()
    if file.name.startswith("Ibsen")
)

In [None]:
# Mean 10-fold cross-validation score of each classifier on each feature set,
# collected into one DataFrame per corpus.
results_all_corpora = {}

for corpus in ["Ghosts", "Others"]:

    indexes = []  # feature-set names (taken from the file names), used as row labels
    cols = [
        "Dimension",
        "SVC",
        "Naïve Bayes",
        "Decision Tree",
        "Logistic Regression",
    ]
    results = []  # one row per feature set, laid out like `cols`

    corpus_files = [file for file in features_files if corpus in file.name]
    for file in corpus_files:

        # Load the feature dictionaries and their labels from the JSON file
        X_dict, y_str = get_dataset_from_json(file)

        # Convert feature dicts to a sparse matrix and labels to integers
        dict_vect = DictVectorizer(sparse=True)
        encoder = LabelEncoder()
        X = dict_vect.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)

        # No feature selection is applied here: every feature is kept
        dimension = X.shape[1]

        # Same shuffled 10-fold split for every model (fixed seed)
        kf = KFold(n_splits=10, shuffle=True, random_state=42)

        # One estimator per score column; the linear models are scaled first
        # (with_mean=False keeps the matrix sparse)
        models = {
            "SVC": Pipeline(
                [
                    ("scaler", StandardScaler(with_mean=False)),
                    ("scv", LinearSVC(random_state=42)),
                ]
            ),
            "Naïve Bayes": MultinomialNB(),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "Logistic Regression": Pipeline(
                [
                    ("scaler", StandardScaler(with_mean=False)),
                    ("lrc", LogisticRegression(random_state=42)),
                ]
            ),
        }
        mean_scores = {
            name: cross_val_score(model, X, y, cv=kf).mean()
            for name, model in models.items()
        }

        # Row for this feature set, in the same order as `cols`
        results.append([dimension] + [mean_scores[name] for name in cols[1:]])
        # Feature-set name: everything after the author and corpus parts of the file name
        indexes.append(" ".join(file.stem.split("_")[2:]))

    # All feature sets of this corpus
    results_all_corpora[corpus] = pd.DataFrame(
        np.array(results), index=indexes, columns=cols
    )

## Save results to CSV, $\LaTeX$, and HTML

In [None]:
# Folder where the result tables are written. It is created up front so the
# later `open(..., "w")` calls cannot fail because the directory is missing.
RESULTS_FOLDER = Path(fr"{ROOT}/results/")
RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)

In [None]:
# Display the cross-validation results table for the "Ghosts" corpus.
results_all_corpora["Ghosts"]

In [None]:
# Display the cross-validation results table for the "Others" corpus.
results_all_corpora["Others"]

In [None]:
# Write each corpus' results table, sorted by feature-set name, as CSV, LaTeX and HTML.
for corpus in ["Ghosts", "Others"]:
    df = results_all_corpora[corpus].sort_index()

    df.to_csv(f"{RESULTS_FOLDER/(corpus + '_20200521.csv')}", float_format='%.4f')

    # The column headers contain non-ASCII text ("Naïve Bayes"), so the text
    # files are written explicitly as UTF-8 instead of relying on the
    # platform's default encoding.
    latex = df.to_latex(float_format=lambda x: '%.4f' % x)
    with open(RESULTS_FOLDER/(corpus + "_20200521.tex"), "w", encoding="utf-8") as f:
        f.write(latex)

    html = df.to_html(float_format='%.4f')
    with open(RESULTS_FOLDER/(corpus + "_20200521.html"), "w", encoding="utf-8") as f:
        f.write(html)

# Most important features extraction

In [None]:
from helper.features import convert_data, plot_most_relevant, plot_confusion_matrix, train_extract_most_relevant, save_tables

These are the files to process. They are the entirety of the feature sets obtained using [01Processing](./01Processing.ipynb).

In [None]:
from helper.analysis import JSON_FOLDER
# Feature-set files for Ibsen, sorted because `Path.iterdir` yields entries in arbitrary order.
features_files = sorted(file for file in JSON_FOLDER.iterdir() if file.name.startswith("Ibsen"))

## Most Relevant Features

The next cells use the helper functions imported above to generate and save the bar plots and tabular data of the $n=15$ most relevant features in the classification process for each translator and each feature set using three classifiers: Logistic Regression, Linear Support Vector Machine, and Naïve Bayes.

To do feature selection using the $\chi^2$ statistic, leave `feature_selection` in the following cell set to `True`. Otherwise, change it to `False`.



In [None]:
# Switch forwarded as the `feature_selection` argument of `train_extract_most_relevant` below.
feature_selection = True

In [None]:
# For every classifier, corpus and feature set: train the model, then plot and
# save the most relevant features of each translator.
for model_name in ["LogisticRegression", "SVM", "NaiveBayes"]:
    for corpus in ["Ghosts", "Others"]:
        corpus_files = [file for file in features_files if corpus in file.name]
        for file in corpus_files:
            data = convert_data(file=file)
            exp_results = train_extract_most_relevant(
                model_name=model_name,
                X=data["X"],
                y=data["y"],
                encoder=data["encoder"],
                dict_vectorizer=data["dict_vectorizer"],
                feature_selection=feature_selection,
            )
            most_relevant = exp_results["most_relevant"]

            # One plot and one table per translator (the encoder's classes)
            for translator in data["encoder"].classes_:
                plot_most_relevant(
                    data=most_relevant,
                    translator=translator,
                    model=model_name,
                    file=file,
                )
                df = most_relevant[translator]
                save_tables(
                    df=df,
                    translator=translator,
                    file=file,
                    model_name=model_name,
                )
            

## Confusion Matrices

The following code generates the Confusion Matrices for all the feature sets using a logistic regression classifier.

In [None]:
# Plot one confusion matrix per feature set. `features_files` only contains
# files whose names start with "Ibsen", so no per-author filtering is needed
# (the former extra pass over "Quixote" never matched a file and only
# overwrote the global `author`).
for file in features_files:
    data = convert_data(file=file)
    X = data["X"]
    y = data["y"]
    encoder = data["encoder"]
    plot_confusion_matrix(X=X, y=y, encoder=encoder, file=file)

## Mixed Corpora

In [None]:
def run_all_classifiers(file_train: Path, file_test: Path, k: int = 45):
    """Train four classifiers on one feature-set file and score them on another.

    Parameters
    ----------
    file_train : Path
        JSON feature-set file used to fit the vectorizer, the label encoder,
        the optional feature selector and the classifiers.
    file_test : Path
        JSON feature-set file used only for evaluation.
    k : int, default 45
        Number of features kept by chi-squared selection. Selection is applied
        only when the vectorized training data has at least ``k`` features.

    Returns
    -------
    list
        ``[n_features, svc_acc, logreg_acc, nb_acc, tree_acc]`` where
        ``n_features`` is the number of training features after the optional
        selection. Note the order: logistic regression comes right after the
        SVC, before Naïve Bayes and the decision tree.
    """
    train_features, train_labels = get_dataset_from_json(file_train)
    test_features, test_labels = get_dataset_from_json(file_test)

    # Vectorizer and label encoder are fitted on the training file only
    vectorizer = DictVectorizer(sparse=True)
    label_encoder = LabelEncoder()
    X_train = vectorizer.fit_transform(train_features)
    y_train = label_encoder.fit_transform(train_labels)
    X_test = vectorizer.transform(test_features)
    y_test = label_encoder.transform(test_labels)

    # Feature selection: train and test matrices share the vectorizer's
    # columns, so a single check on the training matrix covers both
    if X_train.shape[1] >= k:
        selector = SelectKBest(chi2, k=k)
        X_train = selector.fit_transform(X_train, y_train)
        X_test = selector.transform(X_test)

    # Shuffle the training samples with a fixed seed
    X_shuffled, y_shuffled = shuffle(X_train, y_train, random_state=24)

    # Models, listed in the order their scores are returned
    classifiers = [
        Pipeline(
            [
                ("scaler", StandardScaler(with_mean=False)),
                ("scv", LinearSVC(random_state=42)),
            ]
        ),
        Pipeline(
            [
                ("scaler", StandardScaler(with_mean=False)),
                ("lr", LogisticRegression(random_state=42)),
            ]
        ),
        MultinomialNB(),
        DecisionTreeClassifier(random_state=42),
    ]

    accuracies = [
        accuracy_score(y_test, clf.fit(X_shuffled, y_shuffled).predict(X_test))
        for clf in classifiers
    ]

    return [X_shuffled.shape[1], *accuracies]

In [None]:
from helper.analysis import JSON_FOLDER, get_dataset_from_json
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from helper import ROOT

# Feature-set files for Ibsen, sorted because `Path.iterdir` yields entries in arbitrary order.
features_files = sorted(file for file in JSON_FOLDER.iterdir() if file.name.startswith("Ibsen"))

In [None]:
# Split the feature-set files in one pass: files whose stem contains "Ghosts"
# versus all remaining files.
ghosts, others = [], []
for file in features_files:
    (ghosts if "Ghosts" in file.stem else others).append(file)

In [None]:
def _feature_set_name(file):
    """Return the feature-set part of a file stem, e.g. 'Ibsen_Ghosts_2grams_punct' -> '2grams punct'."""
    return " ".join(file.stem.split("_")[2:])


# Pair each "Ghosts" file with the "Others" file of the same feature set.
# The pairing is done by feature-set name rather than by list position:
# positional `zip` silently dropped or mismatched pairs whenever the two lists
# were not listed in the same order.
others_by_feature = {_feature_set_name(file): file for file in others}
features = [
    (train, others_by_feature[_feature_set_name(train)])
    for train in ghosts
    if _feature_set_name(train) in others_by_feature
]

In [None]:
# Column layout of the result tables.
columns = [
    "Dimension",
    "SVC",
    "Naïve Bayes",
    "Decision Tree",
    "Logistic Regression",
]

# `run_all_classifiers` returns its values in THIS order, which differs from
# `columns`. The values are labelled with the returned order first and then
# reordered, so that no score ends up under the wrong classifier's name.
returned_order = [
    "Dimension",
    "SVC",
    "Logistic Regression",
    "Naïve Bayes",
    "Decision Tree",
]

# Row labels: the feature-set part of each training file name
indexes = [" ".join(train.stem.split("_")[2:]) for train, test in features]

# "parallel": train on the first file of each pair, test on the second
results_parallel = [run_all_classifiers(train, test) for train, test in features]
results_parallel_df = pd.DataFrame(
    np.array(results_parallel), index=indexes, columns=returned_order
)[columns]

# "inverse": the roles of the two files in each pair are swapped
results_inverse = [run_all_classifiers(train, test) for test, train in features]
results_inverse_df = pd.DataFrame(
    np.array(results_inverse), index=indexes, columns=returned_order
)[columns]

RESULTS_FOLDER = Path(fr"{ROOT}/results/")

d = {"parallel": results_parallel_df, "inverse": results_inverse_df}

In [None]:
# Display the results table stored under "parallel".
d["parallel"]

In [None]:
# Display the results table stored under "inverse".
d["inverse"]

In [None]:
# Write each mixed-corpora results table as LaTeX and HTML.
for exp in d:
    df = d[exp]

    # The column headers contain non-ASCII text ("Naïve Bayes"), so the files
    # are written explicitly as UTF-8 instead of relying on the platform's
    # default encoding.
    latex = df.to_latex(float_format=lambda x: "%.4f" % x)
    with open((RESULTS_FOLDER / f"{exp}_20200521.tex"), "w", encoding="utf-8") as f:
        f.write(latex)

    html = df.to_html(float_format="%.4f")
    with open((RESULTS_FOLDER / f"{exp}_20200521.html"), "w", encoding="utf-8") as f:
        f.write(html)