<a href="https://colab.research.google.com/github/ccaballeroh/Translator-Attribution/blob/master/02Experiments_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from collections import defaultdict
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
import numpy as np
import os
import pandas as pd
import sys

In [0]:
# Flag telling the rest of the notebook whether it runs inside Google Colab:
# true only when the `google.colab` package is already loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules

In [31]:
# Resolve the project root (`FOLDER_thesis`) for the current environment.
if IN_COLAB:
    import warnings

    from google.colab import drive

    drive.mount('/content/drive/')
    # Anchor the project folder to the absolute mount point used above. The
    # previous relative form ("./drive/...") only resolved correctly while the
    # working directory happened to be /content.
    FOLDER_thesis = Path("/content/drive/My Drive/00Tesis/")
    # Make the project's `helper` package importable from Drive.
    sys.path.insert(0, f"{FOLDER_thesis}/")
    # NOTE(review): this silences *every* warning, but only on Colab, so any
    # warnings raised by the estimators are hidden there while still showing
    # up in local runs.
    warnings.filterwarnings("ignore")
else:
    # Local run: the notebook is expected to be started from the project root.
    FOLDER_thesis = Path(r"./")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
from helper.analysis import get_dataset_from_json
from helper.analysis import JSON_FOLDER

In [0]:
# Alias for the JSON_FOLDER location imported from helper.analysis; this is
# the directory scanned below for the "features*" files.
FOLDER = JSON_FOLDER

In [0]:
# Keep only the entries of FOLDER whose file name begins with "features".
features_files = list(
    filter(lambda entry: entry.name.startswith("features"), FOLDER.iterdir())
)

In [0]:
# Cross-validate four classifiers on every feature set of every corpus and
# collect, per corpus, a table with one row per feature set:
#   Dimension (number of vectorized features) + mean 10-fold CV score per model.
results_all_corpora = defaultdict(pd.DataFrame)

# Column layout shared by all result tables; the classifier labels below must
# match these names.
cols = ["Dimension", "SVC", "Naïve Bayes", "Decision Tree", "Logistic Regression"]

for translator in ["Quixote", "Ibsen"]:

    indexes = []  # feature-set names, used as row labels
    results = []  # one row of [dimension, scores...] per feature set

    # A file belongs to this corpus when the first "_"-separated chunk of its
    # name ends with the corpus name.
    corpus_files = [file for file in features_files
                    if file.name.split("_")[0].endswith(translator)]

    for file in corpus_files:

        X_dict, y_str = get_dataset_from_json(file)

        v = DictVectorizer(sparse=True)
        encoder = LabelEncoder()

        X = v.fit_transform(X_dict)
        y = encoder.fit_transform(y_str)

        dimension = X.shape[1]

        # Same seeded splitter for every model, so all of them are scored on
        # identical folds.
        kf = KFold(n_splits=10, shuffle=True, random_state=42)

        # Fresh estimators per feature set. Only the SVC is preceded by
        # scaling; with_mean=False keeps the sparse matrix sparse.
        classifiers = {
            "SVC": Pipeline([("scaler", StandardScaler(with_mean=False)),
                             ("scv", LinearSVC(random_state=42))]),
            # MultinomialNB accepts sparse input, so X is passed as-is instead
            # of being densified with .toarray().
            "Naïve Bayes": MultinomialNB(),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "Logistic Regression": LogisticRegression(random_state=42),
        }

        # Mean of the per-fold scores (estimator's default scorer).
        mean_scores = {
            label: cross_val_score(model, X, y, cv=kf).mean()
            for label, model in classifiers.items()
        }

        results.append([dimension] + [mean_scores[label] for label in cols[1:]])
        indexes.append(" ".join(file.stem.split("_")[1:]))

    # Build the frame straight from the row list: going through np.array()
    # would coerce the integer Dimension column to float, and would fail when
    # no feature file matched this corpus (empty `results`).
    results_all_corpora[translator] = pd.DataFrame(results, index=indexes, columns=cols)

In [0]:
# Copy the accumulated tables into a plain dict, so that looking up an
# unknown corpus raises KeyError instead of silently creating an empty frame.
final = {corpus: table for corpus, table in results_all_corpora.items()}

## Save results to CSV, $\LaTeX$, and HTML

In [0]:
# Directory, under the project root, that receives the exported result tables.
RESULTS_FOLDER = FOLDER_thesis.joinpath("results/")

In [0]:
# Export every corpus' result table (rows sorted by feature-set name) as CSV,
# LaTeX and HTML, with floats rendered to four decimals in all three formats.

# Neither to_csv() nor open() creates missing directories.
RESULTS_FOLDER.mkdir(parents=True, exist_ok=True)


def _four_decimals(value):
    """Render a float with exactly four decimal places."""
    return '%.4f' % value


for translator in ["Quixote", "Ibsen"]:
    df = final[translator].sort_index()
    stem = translator + "_scaled_20200429_"

    # to_csv() takes the float format as a %-style string.
    df.to_csv(f"{RESULTS_FOLDER/(stem + '.csv')}", float_format='%.4f')

    # The column label "Naïve Bayes" is non-ASCII, so the text files are
    # written with an explicit encoding instead of the platform default.
    latex = df.to_latex(float_format=_four_decimals)
    with open(RESULTS_FOLDER/(stem + ".tex"), "w", encoding="utf-8") as f:
        f.write(latex)

    # to_html() documents float_format as a one-parameter function, so it
    # gets the same callable as to_latex() rather than a format string.
    html = df.to_html(float_format=_four_decimals)
    with open(RESULTS_FOLDER/(stem + ".html"), "w", encoding="utf-8") as f:
        f.write(html)