# Feature Extraction

This notebook extracts several sets of features from the processed corpora.

In [None]:
from pathlib import Path
import sys

# True when the `google.colab` module is already loaded in this interpreter;
# the next cell uses this flag to decide whether to mount Google Drive.
IN_COLAB = "google.colab" in sys.modules

In [None]:
# Resolve the project root (ROOT) and make the `helper` package importable.
if IN_COLAB:
    from google.colab import drive

    # Mount Google Drive and derive ROOT from the same absolute mount point,
    # so ROOT and the sys.path entry stay valid even if the working directory
    # is later changed away from /content.
    MOUNT_POINT = "/content/drive/"
    drive.mount(MOUNT_POINT, force_remount=True)
    ROOT = Path(MOUNT_POINT) / "My Drive" / "translator-attribution"
    # The project folder on Drive contains the `helper` package imported below.
    sys.path.insert(0, f"{ROOT}/")
else:
    # Outside Colab the project root is provided by the local `helper` package.
    from helper import ROOT

## Retrieving Processed Documents from Disk

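We can resume the pipeline at this step by loading the processed documents from disk. Because the pickled documents store `Path` objects as attributes, and Windows and POSIX (e.g., Linux) paths are pickled as different classes, a separate pickle file is kept for each operating system; the cell below loads the one whose name matches `platform.system()`.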

In [None]:
from pathlib import Path
import pickle
import platform

# Folder holding one pickle per corpus and per operating system.
PICKLE = Path(fr"{ROOT}/auxfiles/pickle/")

# Pickle files are named "<corpus>_<platform.system()>.pickle".
system_name = platform.system()

# Maps corpus name -> unpickled object holding that corpus' processed documents.
# NOTE: unpickling executes arbitrary code; only load pickles produced by this
# project's own preprocessing step.
docs = {}
for author in ("Quixote", "Ibsen"):
    pickle_path = PICKLE / f"{author}_{system_name}.pickle"
    with pickle_path.open("rb") as handle:
        docs[author] = pickle.load(handle)

### For the *Quixote* corpus

In [None]:
from helper.analysis import save_dataset_to_json

# Extract every feature set for the Quixote corpus. Each dataset is a list of
# (features, translator) pairs, one per document, handed to
# `save_dataset_to_json` together with the dataset's name.
author = "Quixote"
corpus = docs[author]

# Syntactic n-grams with n in {2, 3}.
for n in (2, 3):
    save_dataset_to_json(
        [(doc.n_grams_syntactic(n=n), doc.translator) for doc in corpus],
        f"{author}_syntactic_n{n}",
    )

# All remaining feature sets are produced twice: with and without punctuation.
for punct in (True, False):
    suffix = "_punct" if punct else ""

    # Word n-grams with n in {1, 2, 3}.
    for n in (1, 2, 3):
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=False), doc.translator)
                for doc in corpus
            ],
            f"{author}_{n}grams{suffix}",
        )

    # POS n-grams with n in {2, 3}.
    for n in (2, 3):
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in corpus
            ],
            f"{author}_{n}gramsPOS{suffix}",
        )

    # Cohesive markers (no n parameter, so a single dataset per setting).
    save_dataset_to_json(
        [(doc.cohesive(punct=punct), doc.translator) for doc in corpus],
        f"{author}_cohesive{suffix}",
    )

### For the Ibsen corpus

#### Extract Features from Parallel Corpus (i.e., *Ghosts*)

In [None]:
from helper.analysis import save_dataset_to_json

# Extract every feature set for the parallel part of the Ibsen corpus, i.e. the
# documents whose filename contains "Ghosts". Each dataset is a list of
# (features, translator) pairs handed to `save_dataset_to_json` with its name.
author = "Ibsen"

# Select the *Ghosts* documents once instead of re-filtering for every dataset.
ghosts_docs = [doc for doc in docs[author] if "Ghosts" in doc.filename]

# Syntactic n-grams with n in {2, 3}.
for n in (2, 3):
    save_dataset_to_json(
        [(doc.n_grams_syntactic(n=n), doc.translator) for doc in ghosts_docs],
        f"{author}_Ghosts_syntactic_n{n}",
    )

# All remaining feature sets are produced twice: with and without punctuation.
for punct in (True, False):
    suffix = "_punct" if punct else ""

    # Word n-grams with n in {1, 2, 3}, called with propn=False.
    for n in (1, 2, 3):
        save_dataset_to_json(
            [
                (
                    doc.n_grams(n=n, punct=punct, pos=False, propn=False),
                    doc.translator,
                )
                for doc in ghosts_docs
            ],
            f"{author}_Ghosts_{n}grams{suffix}",
        )

    # POS n-grams with n in {2, 3}.
    for n in (2, 3):
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in ghosts_docs
            ],
            f"{author}_Ghosts_{n}gramsPOS{suffix}",
        )

    # Cohesive markers (no n parameter, so a single dataset per setting).
    save_dataset_to_json(
        [(doc.cohesive(punct=punct), doc.translator) for doc in ghosts_docs],
        f"{author}_Ghosts_cohesive{suffix}",
    )

#### Extract Features from Non-Parallel Corpus (i.e., the other plays)

In [None]:
from helper.analysis import save_dataset_to_json

# Extract every feature set for the non-parallel part of the Ibsen corpus, i.e.
# the documents whose filename does not contain "Ghosts". Each dataset is a
# list of (features, translator) pairs handed to `save_dataset_to_json` with
# its name.
author = "Ibsen"

# Select the non-*Ghosts* documents once instead of re-filtering per dataset.
other_docs = [doc for doc in docs[author] if "Ghosts" not in doc.filename]

# Syntactic n-grams with n in {2, 3}.
# NOTE(review): this dataset name uses the prefix "_Other_" while every other
# dataset in this cell uses "_Others_". The name is left unchanged because
# downstream code may load the file by this exact name -- confirm and unify.
# NOTE(review): propn=False is passed here but not in the *Ghosts* syntactic
# extraction -- confirm the difference is intentional.
for n in (2, 3):
    save_dataset_to_json(
        [
            (doc.n_grams_syntactic(n=n, propn=False), doc.translator)
            for doc in other_docs
        ],
        f"{author}_Other_syntactic_n{n}",
    )

# All remaining feature sets are produced twice: with and without punctuation.
for punct in (True, False):
    suffix = "_punct" if punct else ""

    # Word n-grams with n in {1, 2, 3}, called with propn=False.
    for n in (1, 2, 3):
        save_dataset_to_json(
            [
                (
                    doc.n_grams(n=n, punct=punct, pos=False, propn=False),
                    doc.translator,
                )
                for doc in other_docs
            ],
            f"{author}_Others_{n}grams{suffix}",
        )

    # POS n-grams with n in {2, 3}.
    for n in (2, 3):
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in other_docs
            ],
            f"{author}_Others_{n}gramsPOS{suffix}",
        )

    # Cohesive markers (no n parameter, so a single dataset per setting).
    save_dataset_to_json(
        [(doc.cohesive(punct=punct), doc.translator) for doc in other_docs],
        f"{author}_Others_cohesive{suffix}",
    )

## Cleaning (Optional)

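Optionally, we can delete from disk the intermediate files generated during the preprocessing and syntactic feature extraction steps. These files live in the folders `Corpora/Proc_{author}` and `auxfiles/txt/{author}`, and are removed with the custom function `clean_files` from the `helper.utils` submodule. Run the cell below only if those files are no longer needed, since the deletion cannot be undone from this notebook.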

In [None]:
from helper.utils import clean_files

# Optional and destructive: per the description above, this removes the
# intermediate files in `Corpora/Proc_{author}` and `auxfiles/txt/{author}`.
clean_files()