<a href="https://colab.research.google.com/github/ccaballeroh/MCPR-2021/blob/main/03Experiments_Ibsen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pathlib import Path
import sys

IN_COLAB = "google.colab" in sys.modules

In [2]:
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/', force_remount=True)
    ROOT = Path(r"./drive/My Drive/Translator-Attribution")
    sys.path.insert(0,f"{ROOT}/")
else:
    from helper import ROOT

Mounted at /content/drive/


# Experiments

## Load modules

In [3]:
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from helper.analysis import JSON_FOLDER, get_dataset_from_json

warnings.filterwarnings("ignore")

In colab!


## Experiments for the Ibsen corpus

In [4]:
features_files = [
    file for file in JSON_FOLDER.iterdir() if file.name.startswith("Ibsen")
]

Experiments performing feature selection

In [6]:
results_all_corpora = {}

for corpus in ["Ghosts", "Others"]:

    indexes = []  # file names as indices
    cols = ["Dimension", "SVC", "Naïve Bayes", "Decision Tree", "Logistic Regression"]
    results = []  # Where to hold the results per corpus

    for file in [file for file in features_files if corpus in file.name]:

        # Import data from JSON files
        X_dict, y_str = get_dataset_from_json(file)

        # Transformers to numpy arrays
        dict_vect = DictVectorizer(sparse=True)
        encoder = LabelEncoder()

        # Numeric conversion
        X = dict_vect.fit_transform(X_dict,)
        y = encoder.fit_transform(y_str)

        # Number of features
        k = 18  # number of features to select
        dimension = k  # comment out this line and uncomment next for full data set
       # dimension = X.shape[1]

        # K-fold to ingest cross-validation
        kf = KFold(n_splits=10, shuffle=True, random_state=42)

        # Models

        ## SVM
        svm_model = Pipeline(
            [
                ("feat-sel", SelectKBest(chi2, k=k)),
                ("scaler", StandardScaler(with_mean=False)),
                ("scv", LinearSVC(random_state=42)),
            ]
        )
        cv_svm = cross_val_score(svm_model, X, y, cv=kf)

        ## Logistic regresssion
        log_model = Pipeline(
            [
                ("feat-sel", SelectKBest(chi2, k=k)),
                ("scaler", StandardScaler(with_mean=False)),
                ("lrc", LogisticRegression(random_state=42)),
            ]
        )
        cv_log = cross_val_score(log_model, X, y, cv=kf)

        ## Naïve Bayes
        nb_model = Pipeline(
            [
                ("feat-sel", SelectKBest(chi2, k=k)),
                ("nb", MultinomialNB()),
            ]
        )
        cv_nb = cross_val_score(nb_model, X, y, cv=kf)

        ## Decision Tree
        dt_model = Pipeline(
            [
                ("feat-sel", SelectKBest(chi2, k=k)),
                ("dt", DecisionTreeClassifier(random_state=42)),
            ]
        )
        cv_dt = cross_val_score(dt_model, X, y, cv=kf)

        # Results of cross-val for each feature set
        result_per_featureset = [
            dimension,
            cv_svm.mean(),
            cv_nb.mean(),
            cv_dt.mean(),
            cv_log.mean(),
        ]

        # Overall results for each author
        results.append(result_per_featureset)
        indexes.append(" ".join(file.stem.split("_")[2:]))  # features from file name

    # All features for all authors
    results_all_corpora[corpus] = pd.DataFrame(
        np.array(results), index=indexes, columns=cols
    )

## Save results $\LaTeX$

In [7]:
results_all_corpora["Ghosts"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
syntactic n2,18.0,0.695,0.61,0.68,0.715
1grams punct,18.0,0.88,0.77,0.65,0.92
syntactic n3,18.0,0.58,0.53,0.64,0.51
3gramsPOS punct,18.0,1.0,1.0,0.915,1.0
cohesive punct,18.0,0.575,0.66,0.635,0.615
2grams punct,18.0,0.94,0.98,0.955,0.98
2grams,18.0,0.835,0.855,0.72,0.815
3grams punct,18.0,0.98,1.0,0.955,1.0
1grams,18.0,0.735,0.81,0.59,0.815
2gramsPOS punct,18.0,0.98,0.98,0.955,0.955


In [8]:
results_all_corpora["Others"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
syntactic n2,18.0,0.867647,0.872876,0.849346,0.878431
1grams punct,18.0,0.918954,0.79183,0.872222,0.918954
syntactic n3,18.0,0.84902,0.861111,0.836928,0.866993
3grams punct,18.0,0.988235,0.988235,0.98268,0.988235
2gramsPOS punct,18.0,0.994118,0.988235,0.976797,0.988235
2grams punct,18.0,0.994118,0.994118,0.98268,0.994118
cohesive punct,18.0,0.849673,0.843791,0.843464,0.86732
3gramsPOS punct,18.0,0.994118,0.976471,0.988235,0.994118
1grams,18.0,0.890523,0.913399,0.803268,0.914052
2grams,18.0,0.924183,0.935948,0.826797,0.947712


In [None]:
RESULTS_FOLDER = Path(fr"{ROOT}/results/")

for corpus in ["Ghosts", "Others"]:
    df = results_all_corpora[corpus].sort_index()
    
    latex = df.to_latex(float_format=lambda x: '%.4f' % x)
    with open(RESULTS_FOLDER/(corpus + ".tex"), "w") as f:
        f.write(latex)

## Mixed Corpora

In [10]:
def run_all_classifiers(file_train: Path, file_test: Path, k: int = 18):

    X_train_dict, y_train_str = get_dataset_from_json(file_train)
    X_test_dict, y_test_str = get_dataset_from_json(file_test)

    dict_vectorizer = DictVectorizer(sparse=True)
    encoder = LabelEncoder()

    X_train, y_train = (
        dict_vectorizer.fit_transform(X_train_dict),
        encoder.fit_transform(y_train_str),
    )

    X_test, y_test = (
        dict_vectorizer.transform(X_test_dict),
        encoder.transform(y_test_str),
    )

    # Shuffle
    X_train_, y_train_ = shuffle(
        X_train, y_train, random_state=24
    )
    # Models
    svm_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("scaler", StandardScaler(with_mean=False)),
            ("scv", LinearSVC(random_state=42)),
        ]
    )
    
    log_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("scaler", StandardScaler(with_mean=False)),
            ("lr", LogisticRegression(random_state=42)),
        ]
    )

    nb_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("nb", MultinomialNB()),
        ])
    
    dt_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("dt", DecisionTreeClassifier(random_state=42)),
        ]) 
    
    

    svm_model.fit(X_train_, y_train_)
    log_model.fit(X_train_, y_train_)
    nb_model.fit(X_train_, y_train_)
    dt_model.fit(X_train_, y_train_)

    return [
        k,
        accuracy_score(y_test, svm_model.predict(X_test)),
        accuracy_score(y_test, log_model.predict(X_test)),
        accuracy_score(y_test, nb_model.predict(X_test)),
        accuracy_score(y_test, dt_model.predict(X_test)),
    ]

In [18]:
from helper.analysis import JSON_FOLDER, get_dataset_from_json
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from helper import ROOT

features_files = [file for file in JSON_FOLDER.iterdir() if file.name.startswith("Ibsen")]

In [19]:
features_files

[PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_syntactic_n2.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_1grams_punct.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_syntactic_n3.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_3gramsPOS_punct.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_cohesive_punct.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_2grams_punct.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_2grams.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_3grams_punct.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_1grams.json'),
 PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_2gramsPOS_punct.json'),
 PosixPath('drive/My Dri

In [20]:
ghosts = [file for file in features_files if "Ghosts" in file.stem]
others = [file for file in features_files if not "Ghosts" in file.stem]

In [21]:
features = [(train, test) for train, test in zip(ghosts, others) if " ".join(train.stem.split("_")[2:]) == " ".join(test.stem.split("_")[2:])]

In [22]:
features

[(PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_syntactic_n2.json'),
  PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Others_syntactic_n2.json')),
 (PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_1grams_punct.json'),
  PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Others_1grams_punct.json')),
 (PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_syntactic_n3.json'),
  PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Others_syntactic_n3.json')),
 (PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_2grams_punct.json'),
  PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Others_2grams_punct.json')),
 (PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Ghosts_1grams.json'),
  PosixPath('drive/My Drive/Translator-Attribution/auxfiles/json/Ibsen_Others_1grams.json')),
 (PosixPath('driv

In [14]:
columns = [
    "Dimension",
    "SVC",
    "Naïve Bayes",
    "Decision Tree",
    "Logistic Regression",
]

indexes = [" ".join(train.stem.split("_")[2:]) for train, test in features]

results_parallel = [run_all_classifiers(train, test) for train, test in features]
results_parallel_df = pd.DataFrame(
    np.array(results_parallel), index=indexes, columns=columns
)

results_inverse = [run_all_classifiers(train, test) for test, train in features]
results_inverse_df = pd.DataFrame(
    np.array(results_inverse), index=indexes, columns=columns
)

RESULTS_FOLDER = Path(fr"{ROOT}/results/")

d = {"parallel": results_parallel_df, "inverse": results_inverse_df}

In [15]:
d["parallel"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
syntactic n2,18.0,0.485549,0.526012,0.526012,0.508671
1grams punct,18.0,0.606936,0.699422,0.867052,0.693642
syntactic n3,18.0,0.450867,0.531792,0.421965,0.358382
2grams punct,18.0,0.982659,0.965318,0.988439,0.797688
1grams,18.0,0.653179,0.791908,0.745665,0.630058
3grams,18.0,0.624277,0.630058,0.65896,0.572254
2gramsPOS,18.0,0.66474,0.722543,0.739884,0.595376


In [16]:
d["inverse"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
syntactic n2,18.0,0.571429,0.591837,0.571429,0.612245
1grams punct,18.0,0.632653,0.612245,0.591837,0.693878
syntactic n3,18.0,0.530612,0.530612,0.510204,0.530612
2grams punct,18.0,0.897959,0.836735,0.897959,0.836735
1grams,18.0,0.591837,0.612245,0.591837,0.489796
3grams,18.0,0.469388,0.469388,0.489796,0.489796
2gramsPOS,18.0,0.571429,0.55102,0.55102,0.612245


In [None]:
for exp in d:
    df = d[exp].sort_index()
    
    latex = df.to_latex(float_format=lambda x: "%.4f" % x)

    with open((RESULTS_FOLDER / f"{exp}.tex"), "w") as f:
        f.write(latex)