<a href="https://colab.research.google.com/github/ccaballeroh/MCPR-2021/blob/main/03Experiments_Ibsen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import sys

# Flag telling the rest of the notebook whether the `google.colab` module has
# been loaded into this interpreter (used below to decide how ROOT is set up).
IN_COLAB = "google.colab" in sys.modules

In [None]:
if IN_COLAB:
    from google.colab import drive

    # Mount Google Drive so the project folder is reachable from the runtime.
    drive.mount('/content/drive/', force_remount=True)
    # Anchor ROOT at the absolute mount point used above. A path relative to
    # the current working directory would stop resolving (both here and in
    # sys.path) as soon as the working directory changes.
    ROOT = Path("/content/drive/My Drive/Translator-Attribution")
    # Make the project's `helper` package importable.
    sys.path.insert(0, f"{ROOT}/")
else:
    # Outside Colab the project root comes from the local `helper` package.
    from helper import ROOT

# Experiments

## Load modules

In [None]:
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from helper.analysis import JSON_FOLDER, get_dataset_from_json

warnings.filterwarnings("ignore")

## Experiments for the Ibsen corpus

In [None]:
# Feature files of the Ibsen corpus. `Path.iterdir()` yields entries in
# arbitrary order, so sort by file name to keep the order of the files (and of
# everything derived from it, e.g. table rows) reproducible across platforms.
features_files = sorted(
    (file for file in JSON_FOLDER.iterdir() if file.name.startswith("Ibsen")),
    key=lambda file: file.name,
)

Cross-validation experiments with chi-squared feature selection (the best $k = 25$ features of each feature set are kept).

In [None]:
%%time
results_all_corpora = {}

for corpus in ["Ghosts", "Others"]:

    # Table layout: one row per feature file, one column per classifier.
    cols = ["Dimension", "SVC", "Naïve Bayes", "Decision Tree", "Logistic Regression"]
    indexes = []  # row labels, derived from the file names
    results = []  # one list of scores per feature file of this corpus

    corpus_files = [file for file in features_files if corpus in file.name]

    for file in corpus_files:

        # Load the feature dictionaries and the string labels from the JSON file
        X_dict, y_str = get_dataset_from_json(file)

        # Turn them into a sparse feature matrix and integer-encoded labels
        dict_vect = DictVectorizer(sparse=True)
        encoder = LabelEncoder()
        X = dict_vect.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)

        # Number of features kept by the chi-squared selector; it is also the
        # value reported in the "Dimension" column. Report `X.shape[1]`
        # instead when running on the full feature set.
        k = 25
        dimension = k

        # Same shuffled 10-fold split for every classifier
        kf = KFold(n_splits=10, shuffle=True, random_state=42)

        # One pipeline per classifier, keyed by its column name in `cols`.
        # The two linear models get a scaling step (without centering, so the
        # sparse matrix stays sparse); Naïve Bayes and the tree do not.
        pipelines = {
            "SVC": Pipeline(
                [
                    ("feat-sel", SelectKBest(chi2, k=k)),
                    ("scaler", StandardScaler(with_mean=False)),
                    ("scv", LinearSVC(random_state=42)),
                ]
            ),
            "Logistic Regression": Pipeline(
                [
                    ("feat-sel", SelectKBest(chi2, k=k)),
                    ("scaler", StandardScaler(with_mean=False)),
                    ("lrc", LogisticRegression(random_state=42)),
                ]
            ),
            "Naïve Bayes": Pipeline(
                [
                    ("feat-sel", SelectKBest(chi2, k=k)),
                    ("nb", MultinomialNB()),
                ]
            ),
            "Decision Tree": Pipeline(
                [
                    ("feat-sel", SelectKBest(chi2, k=k)),
                    ("dt", DecisionTreeClassifier(random_state=42)),
                ]
            ),
        }

        # Mean cross-validation score of every classifier on this feature set
        mean_scores = {
            name: cross_val_score(pipeline, X, y, cv=kf).mean()
            for name, pipeline in pipelines.items()
        }

        # Row of the table, in the order given by `cols`
        results.append([dimension] + [mean_scores[name] for name in cols[1:]])
        # Row label: the feature description encoded in the file name
        indexes.append(" ".join(file.stem.split("_")[2:]))

    # Results of all feature sets for this corpus
    results_all_corpora[corpus] = pd.DataFrame(
        np.array(results), index=indexes, columns=cols
    )

Wall time: 2min 3s


## Save results as $\LaTeX$ tables

In [None]:
# Mean cross-validation score of each classifier per feature set ("Ghosts" files).
results_all_corpora["Ghosts"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
1grams,25.0,0.68,0.815,0.61,0.84
1grams punct,25.0,0.9,0.815,0.745,0.855
2grams,25.0,0.755,0.755,0.72,0.78
2gramsPOS,25.0,0.62,0.515,0.575,0.595
2gramsPOS punct,25.0,1.0,1.0,0.935,0.975
2grams punct,25.0,0.94,0.98,0.955,0.98
3grams,25.0,0.595,0.6,0.47,0.575
3gramsPOS,25.0,0.24,0.225,0.37,0.28
3gramsPOS punct,25.0,1.0,1.0,0.935,1.0
3grams punct,25.0,1.0,1.0,0.935,1.0


In [None]:
# Mean cross-validation score of each classifier per feature set ("Others" files).
results_all_corpora["Others"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
1grams,25.0,0.890523,0.919281,0.803268,0.902288
1grams punct,25.0,0.918627,0.797712,0.866013,0.930392
2grams,25.0,0.930719,0.931046,0.814706,0.942484
2gramsPOS,25.0,0.838235,0.855882,0.728758,0.862092
2gramsPOS punct,25.0,0.988235,0.994118,0.98268,0.994118
2grams punct,25.0,0.988235,0.994118,0.98268,0.994118
3grams,25.0,0.90098,0.90719,0.808497,0.894771
3gramsPOS,25.0,0.844444,0.873856,0.677451,0.838562
3gramsPOS punct,25.0,0.994118,0.959477,0.970588,0.994118
3grams punct,25.0,0.988235,0.994118,0.98268,0.988235


In [None]:
RESULTS_FOLDER = Path(fr"{ROOT}/results/")
# Create the output folder on first use instead of failing with
# FileNotFoundError when it does not exist yet.
RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)

# Write one LaTeX table per corpus, rows sorted by feature-set name.
for corpus in ["Ghosts", "Others"]:
    df = results_all_corpora[corpus].sort_index()

    latex = df.to_latex(float_format=lambda x: '%.4f' % x)
    # The header contains non-ASCII text ("Naïve Bayes"); fix the encoding so
    # the .tex file does not depend on the platform's default locale.
    with open(RESULTS_FOLDER/(corpus + ".tex"), "w", encoding="utf-8") as f:
        f.write(latex)

## Mixed Corpora

In [None]:
def run_all_classifiers(file_train: Path, file_test: Path, k: int = 25):
    """Train four classifiers on one feature file and score them on another.

    Both files are loaded with `get_dataset_from_json`. The vectorizer and the
    label encoder are fitted on the training file only and then applied to the
    test file. Every classifier is preceded by a chi-squared selection of the
    `k` best features.

    Parameters
    ----------
    file_train : Path
        JSON feature file used to fit the models.
    file_test : Path
        JSON feature file used to evaluate the fitted models.
    k : int, default 25
        Number of features kept by `SelectKBest`.

    Returns
    -------
    list
        `[k, svc_accuracy, naive_bayes_accuracy, decision_tree_accuracy,
        logistic_regression_accuracy]`. The order matches the column layout
        used by the results tables in this notebook ("Dimension", "SVC",
        "Naïve Bayes", "Decision Tree", "Logistic Regression"); returning the
        scores in any other order mislabels the table columns.
    """
    X_train_dict, y_train_str = get_dataset_from_json(file_train)
    X_test_dict, y_test_str = get_dataset_from_json(file_test)

    dict_vectorizer = DictVectorizer(sparse=True)
    encoder = LabelEncoder()

    # Fit the transformers on the training data only ...
    X_train = dict_vectorizer.fit_transform(X_train_dict)
    y_train = encoder.fit_transform(y_train_str)

    # ... and reuse them, unchanged, on the test data.
    X_test = dict_vectorizer.transform(X_test_dict)
    y_test = encoder.transform(y_test_str)

    # Shuffle the training samples (fixed seed for reproducibility)
    X_train_, y_train_ = shuffle(X_train, y_train, random_state=24)

    # Models. The linear ones get a scaling step without centering, so the
    # sparse matrix stays sparse.
    svm_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("scaler", StandardScaler(with_mean=False)),
            ("scv", LinearSVC(random_state=42)),
        ]
    )

    log_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("scaler", StandardScaler(with_mean=False)),
            ("lr", LogisticRegression(random_state=42)),
        ]
    )

    nb_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("nb", MultinomialNB()),
        ]
    )

    dt_model = Pipeline(
        [
            ("feat-sel", SelectKBest(chi2, k=k)),
            ("dt", DecisionTreeClassifier(random_state=42)),
        ]
    )

    svm_model.fit(X_train_, y_train_)
    log_model.fit(X_train_, y_train_)
    nb_model.fit(X_train_, y_train_)
    dt_model.fit(X_train_, y_train_)

    # Order: Dimension, SVC, Naïve Bayes, Decision Tree, Logistic Regression
    return [
        k,
        accuracy_score(y_test, svm_model.predict(X_test)),
        accuracy_score(y_test, nb_model.predict(X_test)),
        accuracy_score(y_test, dt_model.predict(X_test)),
        accuracy_score(y_test, log_model.predict(X_test)),
    ]

In [None]:
from helper.analysis import JSON_FOLDER, get_dataset_from_json
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from helper import ROOT

# Feature files of the Ibsen corpus. `Path.iterdir()` yields entries in
# arbitrary order, so sort by file name to make the file order (and anything
# that depends on it) deterministic.
features_files = sorted(
    (file for file in JSON_FOLDER.iterdir() if file.name.startswith("Ibsen")),
    key=lambda file: file.name,
)

In [None]:
# Split the Ibsen feature files in two groups: those whose stem mentions
# "Ghosts" and all the remaining ones.
ghosts = [path for path in features_files if "Ghosts" in path.stem]
others = [path for path in features_files if "Ghosts" not in path.stem]

In [None]:
# Pair every "Ghosts" feature file with the "Others" file built from the same
# feature set (the part of the stem after the second underscore). Pairing by
# name instead of by position keeps the pairs correct even when the two lists
# are not in the same order or do not have the same length.
others_by_feature = {" ".join(file.stem.split("_")[2:]): file for file in others}
features = [
    (train, others_by_feature[" ".join(train.stem.split("_")[2:])])
    for train in ghosts
    if " ".join(train.stem.split("_")[2:]) in others_by_feature
]

In [None]:
# Column layout shared by both results tables.
columns = ["Dimension", "SVC", "Naïve Bayes", "Decision Tree", "Logistic Regression"]

# Row labels: the feature description encoded in the first file of each pair.
indexes = [" ".join(first.stem.split("_")[2:]) for first, _second in features]

# "Parallel": fit on the first file of each pair, evaluate on the second.
results_parallel = [
    run_all_classifiers(first, second) for first, second in features
]
results_parallel_df = pd.DataFrame(
    np.array(results_parallel),
    index=indexes,
    columns=columns,
)

# "Inverse": same pairs with the roles swapped (fit on the second file,
# evaluate on the first).
results_inverse = [
    run_all_classifiers(second, first) for first, second in features
]
results_inverse_df = pd.DataFrame(
    np.array(results_inverse),
    index=indexes,
    columns=columns,
)

RESULTS_FOLDER = Path(fr"{ROOT}/results/")

# Both tables, keyed by experiment name.
d = dict(parallel=results_parallel_df, inverse=results_inverse_df)

In [None]:
# Scores when fitting on the first file of each pair and testing on the second.
d["parallel"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
1grams,25.0,0.66474,0.722543,0.780347,0.537572
1grams punct,25.0,0.393064,0.439306,0.861272,0.653179
2grams,25.0,0.554913,0.566474,0.606936,0.514451
2gramsPOS,25.0,0.595376,0.67052,0.589595,0.66474
2gramsPOS punct,25.0,0.947977,0.930636,0.959538,0.780347
2grams punct,25.0,0.780347,0.774566,0.942197,0.797688
3grams,25.0,0.630058,0.687861,0.653179,0.462428
3gramsPOS,25.0,0.531792,0.537572,0.537572,0.554913
3gramsPOS punct,25.0,0.976879,0.971098,0.947977,0.791908
3grams punct,25.0,0.942197,0.971098,0.988439,0.768786


In [None]:
# Scores with the roles swapped: fitting on the second file, testing on the first.
d["inverse"]

Unnamed: 0,Dimension,SVC,Naïve Bayes,Decision Tree,Logistic Regression
1grams,25.0,0.632653,0.653061,0.55102,0.612245
1grams punct,25.0,0.632653,0.612245,0.612245,0.653061
2grams,25.0,0.734694,0.734694,0.591837,0.571429
2gramsPOS,25.0,0.591837,0.591837,0.571429,0.510204
2gramsPOS punct,25.0,0.877551,0.877551,0.979592,0.836735
2grams punct,25.0,0.918367,0.857143,0.897959,0.836735
3grams,25.0,0.530612,0.530612,0.510204,0.428571
3gramsPOS,25.0,0.591837,0.55102,0.55102,0.469388
3gramsPOS punct,25.0,0.816327,0.836735,0.959184,1.0
3grams punct,25.0,0.897959,0.877551,0.918367,0.836735


In [None]:
# Create the output folder on first use instead of failing with
# FileNotFoundError when it does not exist yet.
RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)

# Write one LaTeX table per experiment, rows sorted by feature-set name.
for exp in d:
    df = d[exp].sort_index()

    latex = df.to_latex(float_format=lambda x: "%.4f" % x)

    # The header contains non-ASCII text ("Naïve Bayes"); fix the encoding so
    # the .tex file does not depend on the platform's default locale.
    with open((RESULTS_FOLDER / f"{exp}.tex"), "w", encoding="utf-8") as f:
        f.write(latex)