<a href="https://colab.research.google.com/github/ccaballeroh/Translator-Attribution/blob/master/02Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiments

This notebook contains the code to reproduce the results of the experiments. The experiments consist of performing a 10-fold cross-validation with four different classifiers (a linear support vector machine, a logistic regression, a naïve Bayes classifier, and a decision tree) on all the feature sets obtained in [01Processing](./01Processing.ipynb). All the classifiers were trained using their default values (except for the support vector machine, which showed improvement when the data were scaled to standard deviation $\sigma = 1$). The results are stored in a `DataFrame` for convenience and later saved to disk in three formats: CSV, HTML, and $\LaTeX$.

**Note:** Some of the feature sets are really big, and one of the classifiers does not support the use of sparse matrices, so it takes a lot of memory. The recommendation is to run this notebook on Google Colab when not doing `feature_selection`.

In [None]:
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import sys

In [None]:
# True when the notebook is executing inside Google Colab, detected by the
# presence of the `google.colab` package among the already-loaded modules.
IN_COLAB = "google.colab" in sys.modules

In [None]:
if not IN_COLAB:
    # Outside Colab the project root comes straight from the local helper package.
    from helper.analysis import ROOT
else:
    import warnings
    from google.colab import drive

    # Mount Google Drive and point ROOT at the project folder stored there.
    drive.mount("/content/drive/")
    ROOT = Path(r"./drive/My Drive/Translator-Attribution/")

    # Put the project folder first on the import path; presumably so that the
    # `helper` package imported in the following cells is found there.
    sys.path.insert(0, f"{ROOT}/")

    # Silence every warning for the remainder of the Colab session.
    warnings.filterwarnings("ignore")

In [None]:
from helper.analysis import get_dataset_from_json
from helper.analysis import JSON_FOLDER

In [None]:
# Every entry of JSON_FOLDER whose file name begins with "features".
features_files = [
    entry for entry in JSON_FOLDER.iterdir() if entry.name.startswith("features")
]

In [None]:
# Experiment configuration, gathered in one place so it is easy to tune.
K_BEST = 45    # maximum number of features kept by the chi-squared selection
N_SPLITS = 10  # number of cross-validation folds
SEED = 42      # random state shared by the fold shuffling and the classifiers

# Column order of every results table.
cols = [
    "Dimension",
    "SVC",
    "Naïve Bayes",
    "Decision Tree",
    "Logistic Regression",
]

# Corpus name -> DataFrame with one row per feature set holding the number of
# features and the mean cross-validation score of each classifier.
results_all_corpora = {}

for author in ["Quixote", "Ibsen"]:

    indexes = []  # feature-set names (from the file names), used as row labels
    results = []  # one record (dict keyed by column name) per feature set

    for file in [file for file in features_files if author in file.name]:

        # Import data from JSON files
        X_dict, y_str = get_dataset_from_json(file)

        # Numeric conversion: sparse feature matrix and integer-encoded labels
        X = DictVectorizer(sparse=True).fit_transform(X_dict)
        y = LabelEncoder().fit_transform(y_str)

        # Feature selection using chi-squared. `k` is capped at the number of
        # available features so that small feature sets do not make
        # SelectKBest fail.
        # NOTE(review): the selector is fitted on the full data set *before*
        # cross-validation, so the held-out folds influence which features are
        # kept. Moving it into each model's Pipeline would remove that leakage
        # but would also change the reported numbers, so it is left as is.
        X = SelectKBest(chi2, k=min(K_BEST, X.shape[1])).fit_transform(X, y)

        # K-fold splitter shared by all classifiers so they see the same folds
        kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

        # Models, keyed by the results column they fill. Keying by column name
        # keeps scores and headers aligned without relying on list positions.
        models = {
            # with_mean=False: scale to unit variance without centering, which
            # keeps the sparse matrix sparse.
            "SVC": Pipeline([("scaler", StandardScaler(with_mean=False)),
                             ("svc", LinearSVC(random_state=SEED))]),
            # MultinomialNB accepts sparse input, so X is passed as is instead
            # of being densified with `.toarray()`.
            "Naïve Bayes": MultinomialNB(),
            "Decision Tree": DecisionTreeClassifier(random_state=SEED),
            "Logistic Regression": LogisticRegression(random_state=SEED),
        }

        # Results of cross-val for each feature set
        result_per_featureset = {"Dimension": X.shape[1]}
        for model_name, model in models.items():
            scores = cross_val_score(model, X, y, cv=kf)
            result_per_featureset[model_name] = scores.mean()

        # Overall results for each author
        results.append(result_per_featureset)
        indexes.append(
            " ".join(file.stem.split("_")[2:])  # features from file name
        )

    # All features for all authors. Building the frame from records (rather
    # than from a single float ndarray) keeps "Dimension" an integer column.
    results_all_corpora[author] = pd.DataFrame(results, index=indexes, columns=cols)

## Save results to CSV, $\LaTeX$, and HTML

In [None]:
# Folder, directly under the project root, where the result tables are written.
RESULTS_FOLDER = Path(ROOT) / "results"

In [None]:
DATE_STAMP = "20200506"  # suffix shared by every output file name

# Make sure the output folder exists (e.g. on a fresh checkout); this is a
# no-op when it is already there.
RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)

for author in ["Quixote", "Ibsen"]:
    # Rows sorted by feature-set name so the tables have a stable order.
    df = results_all_corpora[author].sort_index()
    stem = f"{author}_{DATE_STAMP}"

    # CSV
    df.to_csv(RESULTS_FOLDER / f"{stem}.csv", float_format='%.4f')

    # LaTeX. Written explicitly as UTF-8 because the column names contain
    # non-ASCII characters ("Naïve Bayes") and the platform default encoding
    # is not UTF-8 everywhere.
    latex = df.to_latex(float_format=lambda x: '%.4f' % x)
    (RESULTS_FOLDER / f"{stem}.tex").write_text(latex, encoding="utf-8")

    # HTML (same encoding consideration as above)
    html = df.to_html(float_format='%.4f')
    (RESULTS_FOLDER / f"{stem}.html").write_text(html, encoding="utf-8")