<a href="https://colab.research.google.com/github/ccaballeroh/Translator-Attribution/blob/master/04PCA_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PCA Visualization

This notebook generates 2-D visualizations of the corpora using Principal Component Analysis (PCA) for different feature sets.

The plots are saved in `./results/figs/pca/`

In [0]:
from pathlib import Path
import sys

# True when the `google.colab` module is already loaded in this interpreter;
# the next cell uses this flag to choose between the Colab and the local setup.
IN_COLAB = "google.colab" in sys.modules

In [0]:
if IN_COLAB:
    # On Colab the project is read from Google Drive: mount the drive and put
    # the repository on sys.path so the `helper` package can be imported.
    from google.colab import drive

    drive.mount('/content/drive/')
    # Anchor ROOT at the absolute mount point. The relative "./drive/..." form
    # only resolves while the working directory happens to be /content.
    ROOT = Path("/content/drive/My Drive/Translator-Attribution")
    # Guard the insertion so re-running this cell does not add duplicates.
    if str(ROOT) not in sys.path:
        sys.path.insert(0, str(ROOT))
    import warnings

    # Suppress every warning for the rest of the session (Colab only).
    warnings.filterwarnings("ignore")
else:
    # Outside Colab the helper package supplies the project root.
    from helper.analysis import ROOT

In [0]:
import pickle
import platform

from helper.utils import plot_pca

First, we load the analyzed corpora from the `./auxfiles/pickle/` folder into a dictionary keyed by corpus name (`Quixote` and `Ibsen`).

In [0]:
# Folder, under the project root, that holds the pickled corpora.
PICKLE = Path(ROOT) / "auxfiles" / "pickle"

In [0]:
# Load each analysed corpus from its pickle file; the file name carries the
# value of platform.system(), so a separate pickle is expected per OS.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project.
docs = {}
for corpus_name in ["Quixote", "Ibsen"]:
    pickle_path = PICKLE / f"{corpus_name}_{platform.system()}.pickle"
    with open(pickle_path, "rb") as pickle_file:
        docs[corpus_name] = pickle.load(pickle_file)

## Ibsen Corpus

The Ibsen corpus comprises two kinds of documents: one set consists of translations of the same play, *Ghosts*; the other consists of different plays translated by two translators, with no overlap between their translations.

So we can visualize each feature set for the parallel texts (i.e., *Ghosts*), for the other plays, and for all.

### First the parallel works:

#### Punctuation *n*-grams

In [0]:
# Punctuation unigrams of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=1, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

# Feature selection is switched off for this feature set.
plot_pca(
    features=features,
    title="Punctuation unigrams Ibsen Ghosts",
    feature_selection=False,
    k=45,
)

In [0]:
# Punctuation bigrams of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=2, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Punctuation bigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

In [0]:
# Punctuation trigrams of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=3, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Punctuation trigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

#### Word *n*-grams

In [0]:
# Word unigrams of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=1, punct=False, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Word unigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

In [0]:
# Word bigrams of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=2, punct=False, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Word bigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

#### POS *n*-grams

In [0]:
# POS bigrams (no punctuation) of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=2, punct=False, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="POS bigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams (no punctuation) of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=3, punct=False, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="POS trigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

In [0]:
# POS bigrams including punctuation of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=2, punct=True, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="POS bigrams with punctuation Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams including punctuation of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams(n=3, punct=True, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="POS trigrams with punctuation Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

#### Syntactic *n*-grams

In [0]:
# Syntactic bigrams of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams_syntactic(n=2, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Syntactic bigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

In [0]:
# Syntactic trigrams of each "Ghosts" translation, paired with its translator.
features = [
    (play.n_grams_syntactic(n=3, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Syntactic trigrams Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

#### Cohesive markers

In [0]:
# Cohesive markers (no punctuation) of each "Ghosts" translation, paired with its translator.
features = [
    (play.cohesive(punct=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Cohesive markers Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

In [0]:
# Cohesive markers plus punctuation of each "Ghosts" translation, paired with its translator.
features = [
    (play.cohesive(punct=True), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" in play.filename
]

plot_pca(
    features=features,
    title="Cohesive markers with punctuation Ibsen Ghosts",
    feature_selection=True,
    k=45,
)

### Now, the other plays:

#### Punctuation *n*-grams

In [0]:
# Punctuation unigrams of every Ibsen play other than "Ghosts", paired with
# its translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams(n=1, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

# Feature selection is switched off for this feature set.
plot_pca(
    features=features,
    title="Punctuation unigrams Ibsen other plays",
    feature_selection=False,
    k=45,
)

In [0]:
# Punctuation bigrams of every Ibsen play other than "Ghosts", paired with
# its translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams(n=2, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Punctuation bigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Punctuation trigrams of every Ibsen play other than "Ghosts", paired with
# its translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams(n=3, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Punctuation trigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

#### Word *n*-grams

In [0]:
# Word unigrams of every Ibsen play other than "Ghosts", paired with its
# translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams(n=1, punct=False, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Word unigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Word bigrams of every Ibsen play other than "Ghosts", paired with its
# translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams(n=2, punct=False, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Word bigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

#### POS *n*-grams

In [0]:
# POS bigrams (no punctuation) of every Ibsen play other than "Ghosts", paired
# with its translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams(n=2, punct=False, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="POS bigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams (no punctuation) of every Ibsen play other than "Ghosts", paired
# with its translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams(n=3, punct=False, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="POS trigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

In [0]:
# POS bigrams including punctuation of every Ibsen play other than "Ghosts",
# paired with its translator. The filter uses the idiomatic `not in` test.
features = [
    (play.n_grams(n=2, punct=True, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="POS bigrams with punctuation Ibsen other plays",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams including punctuation of every Ibsen play other than "Ghosts",
# paired with its translator. The filter uses the idiomatic `not in` test.
features = [
    (play.n_grams(n=3, punct=True, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="POS trigrams with punctuation Ibsen other plays",
    feature_selection=True,
    k=45,
)

#### Syntactic *n*-grams

In [0]:
# Syntactic bigrams of every Ibsen play other than "Ghosts", paired with its
# translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams_syntactic(n=2, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Syntactic bigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Syntactic trigrams of every Ibsen play other than "Ghosts", paired with its
# translator. The filter uses the idiomatic `not in` membership test.
features = [
    (play.n_grams_syntactic(n=3, propn=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Syntactic trigrams Ibsen other plays",
    feature_selection=True,
    k=45,
)

#### Cohesive markers

In [0]:
# Cohesive markers (no punctuation) of every Ibsen play other than "Ghosts",
# paired with its translator. The filter uses the idiomatic `not in` test.
features = [
    (play.cohesive(punct=False), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Cohesive markers Ibsen other plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Cohesive markers plus punctuation of every Ibsen play other than "Ghosts",
# paired with its translator. The filter uses the idiomatic `not in` test.
features = [
    (play.cohesive(punct=True), play.translator)
    for play in docs["Ibsen"]
    if "Ghosts" not in play.filename
]

plot_pca(
    features=features,
    title="Cohesive markers with punctuation Ibsen other plays",
    feature_selection=True,
    k=45,
)

### Lastly, all the plays together:

#### Punctuation *n*-grams

In [0]:
# Punctuation unigrams of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=1, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
]

# Feature selection is switched off for this feature set.
plot_pca(
    features=features,
    title="Punctuation unigrams Ibsen all plays",
    feature_selection=False,
    k=45,
)

In [0]:
# Punctuation bigrams of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=2, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Punctuation bigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Punctuation trigrams of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=3, punct=True, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Punctuation trigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

#### Word *n*-grams

In [0]:
# Word unigrams of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=1, punct=False, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Word unigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Word bigrams of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=2, punct=False, pos=False, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Word bigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

#### POS *n*-grams

In [0]:
# POS bigrams (no punctuation) of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=2, punct=False, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="POS bigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams (no punctuation) of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=3, punct=False, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="POS trigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

In [0]:
# POS bigrams including punctuation of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=2, punct=True, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="POS bigrams with punctuation Ibsen all plays",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams including punctuation of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams(n=3, punct=True, pos=True, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="POS trigrams with punctuation Ibsen all plays",
    feature_selection=True,
    k=45,
)

#### Syntactic *n*-grams

In [0]:
# Syntactic bigrams of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams_syntactic(n=2, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Syntactic bigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Syntactic trigrams of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.n_grams_syntactic(n=3, propn=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Syntactic trigrams Ibsen all plays",
    feature_selection=True,
    k=45,
)

#### Cohesive markers

In [0]:
# Cohesive markers (no punctuation) of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.cohesive(punct=False), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Cohesive markers Ibsen all plays",
    feature_selection=True,
    k=45,
)

In [0]:
# Cohesive markers plus punctuation of every document in the Ibsen corpus, paired with its translator.
features = [
    (play.cohesive(punct=True), play.translator)
    for play in docs["Ibsen"]
]

plot_pca(
    features=features,
    title="Cohesive markers with punctuation Ibsen all plays",
    feature_selection=True,
    k=45,
)

## *Don Quixote* corpus

This corpus comprises three parallel translations for the two parts of *Don Quixote*. So, we can just generate the PCA visualizations for all the feature sets.

#### Punctuation *n*-grams

In [0]:
# Punctuation unigrams of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=1, punct=True, pos=False, propn=False), text.translator)
    for text in docs["Quixote"]
]

# Feature selection is switched off for this feature set.
plot_pca(
    features=features,
    title="Punctuation unigrams Quixote",
    feature_selection=False,
    k=45,
)

In [0]:
# Punctuation bigrams of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=2, punct=True, pos=False, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Punctuation bigrams Quixote",
    feature_selection=True,
    k=45,
)

In [0]:
# Punctuation trigrams of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=3, punct=True, pos=False, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Punctuation trigrams Quixote",
    feature_selection=True,
    k=45,
)

#### Word *n*-grams

In [0]:
# Word unigrams of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=1, punct=False, pos=False, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Word unigrams Quixote",
    feature_selection=True,
    k=45,
)

In [0]:
# Word bigrams of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=2, punct=False, pos=False, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Word bigrams Quixote",
    feature_selection=True,
    k=45,
)

#### POS *n*-grams

In [0]:
# POS bigrams (no punctuation) of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=2, punct=False, pos=True, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="POS bigrams Quixote",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams (no punctuation) of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=3, punct=False, pos=True, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="POS trigrams Quixote",
    feature_selection=True,
    k=45,
)

In [0]:
# POS bigrams including punctuation of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=2, punct=True, pos=True, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="POS bigrams with punctuation Quixote",
    feature_selection=True,
    k=45,
)

In [0]:
# POS trigrams including punctuation of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams(n=3, punct=True, pos=True, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="POS trigrams with punctuation Quixote",
    feature_selection=True,
    k=45,
)

#### Syntactic *n*-grams

In [0]:
# Syntactic bigrams of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams_syntactic(n=2, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Syntactic bigrams Quixote",
    feature_selection=True,
    k=45,
)

In [0]:
# Syntactic trigrams of every document in the Quixote corpus, paired with its translator.
features = [
    (text.n_grams_syntactic(n=3, propn=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Syntactic trigrams Quixote",
    feature_selection=True,
    k=45,
)

#### Cohesive markers

In [0]:
# Cohesive markers (no punctuation) of every document in the Quixote corpus, paired with its translator.
features = [
    (text.cohesive(punct=False), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Cohesive markers Quixote",
    feature_selection=True,
    k=45,
)

In [0]:
# Cohesive markers plus punctuation of every document in the Quixote corpus, paired with its translator.
features = [
    (text.cohesive(punct=True), text.translator)
    for text in docs["Quixote"]
]

plot_pca(
    features=features,
    title="Cohesive markers with punctuation Quixote",
    feature_selection=True,
    k=45,
)