# Feature Extraction

With the processed documents stored in memory in a dictionary, we can generate feature JSON files using the custom function `save_dataset_to_json` available in the `analysis` submodule in the `helper` module. 

# 0. (Optional)

If running in Google Colab, execute the following cells.

In [None]:
from pathlib import Path
import sys

# True when the `google.colab` module is already loaded in this interpreter;
# the cells below use it as the signal that the notebook is running in Colab.
IN_COLAB = "google.colab" in sys.modules

In [None]:
if IN_COLAB:
    from google.colab import drive

    # Mount Google Drive under /content/drive/ (remounting if it is already mounted).
    drive.mount('/content/drive/', force_remount=True)

    # Anchor the project root at the mount point with an absolute path, so that
    # ROOT and the sys.path entry below stay valid even if the working
    # directory changes later in the session.
    ROOT = Path("/content/drive/My Drive/Translator-Attribution")

    # Put the project root first on the import path so `helper` can be imported.
    sys.path.insert(0, f"{ROOT}/")

In [None]:
if IN_COLAB:
  !pip install spacy==2.2.2
  !python -m spacy download en_core_web_md
else:
  try:
    import spacy
    nlp = spacy.load("en_core_web_md")
  except:
    !python -m spacy download en_core_web_md

# 1. Retrieving Processed Documents from Disk

We can pick up the process from this step by retrieving the processed documents from disk.

In [None]:
from helper import ROOT
from pathlib import Path
import pickle
import platform


# Directory that holds the pickled, already-processed corpora.
PICKLE = Path(fr"{ROOT}/auxfiles/pickle/")

# Maps a corpus name to the object unpickled from its file.
docs = {}

# The file name embeds platform.system(), so only a pickle named for the
# current operating system is looked up.
# NOTE: unpickling can execute arbitrary code -- only load files you generated yourself.
for author in ("Quixote", "Ibsen"):
    pickle_path = PICKLE / f"{author}_{platform.system()}.pickle"
    with pickle_path.open("rb") as handle:
        docs[author] = pickle.load(handle)

## 1.1 For the *Quixote* corpus

In [None]:
from helper.analysis import save_dataset_to_json

# Build every feature set for the Quixote corpus and save each one to its own
# JSON file. A dataset is a list of (features, translator) pairs, one pair per
# processed document.
author = "Quixote"
corpus = docs[author]

# Syntactic n-grams with n in {2, 3}.
for n in range(2, 4):
    FILE_TEMPLATE = f"features_{author}_syntactic_n{n}"
    save_dataset_to_json(
        [(doc.n_grams_syntactic(n=n), doc.translator) for doc in corpus],
        FILE_TEMPLATE,
    )

# The remaining feature families are generated twice: with and without punctuation.
for punct in [True, False]:
    # File-name suffix that marks the punctuation-preserving variant.
    suffix = "_punct" if punct else ""

    # Word n-grams with n in {1, 2, 3}.
    for n in range(1, 4):
        FILE_TEMPLATE = f"features_{author}_{n}grams{suffix}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=False), doc.translator)
                for doc in corpus
            ],
            FILE_TEMPLATE,
        )

    # POS n-grams with n in {2, 3}.
    for n in range(2, 4):
        FILE_TEMPLATE = f"features_{author}_{n}gramsPOS{suffix}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in corpus
            ],
            FILE_TEMPLATE,
        )

    # Cohesive markers: a single dataset per punctuation setting, so no inner
    # loop is needed.
    FILE_TEMPLATE = f"features_{author}_cohesive{suffix}"
    save_dataset_to_json(
        [(doc.cohesive(punct=punct), doc.translator) for doc in corpus],
        FILE_TEMPLATE,
    )

## 1.2 For the Ibsen corpus

### 1.2.1 Extract features from parallel corpus (i.e., *Ghosts*)

In [None]:
from helper.analysis import save_dataset_to_json

# Build every feature set for the parallel part of the Ibsen corpus -- the
# documents whose file name contains "Ghosts" -- and save each one to its own
# JSON file. A dataset is a list of (features, translator) pairs, one pair per
# selected document.
author = "Ibsen"

# Select the parallel sub-corpus once instead of re-filtering for every feature set.
ghosts_docs = [doc for doc in docs[author] if "Ghosts" in doc.filename]

# Syntactic n-grams with n in {2, 3}.
for n in range(2, 4):
    FILE_TEMPLATE = f"Ibsen_Ghosts_syntactic_n{n}"
    save_dataset_to_json(
        [
            (doc.n_grams_syntactic(n=n, propn=False), doc.translator)
            for doc in ghosts_docs
        ],
        FILE_TEMPLATE,
    )

# The remaining feature families are generated twice: with and without punctuation.
for punct in [True, False]:
    # File-name suffix that marks the punctuation-preserving variant.
    suffix = "_punct" if punct else ""

    # Word n-grams with n in {1, 2, 3}.
    for n in range(1, 4):
        FILE_TEMPLATE = f"Ibsen_Ghosts_{n}grams{suffix}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=False, propn=False), doc.translator)
                for doc in ghosts_docs
            ],
            FILE_TEMPLATE,
        )

    # POS n-grams with n in {2, 3}.
    # NOTE(review): `propn` is not passed here, unlike for the word n-grams
    # above -- confirm this is intentional.
    for n in range(2, 4):
        FILE_TEMPLATE = f"Ibsen_Ghosts_{n}gramsPOS{suffix}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in ghosts_docs
            ],
            FILE_TEMPLATE,
        )

    # Cohesive markers: a single dataset per punctuation setting, so no inner
    # loop is needed.
    FILE_TEMPLATE = f"Ibsen_Ghosts_cohesive{suffix}"
    save_dataset_to_json(
        [(doc.cohesive(punct=punct), doc.translator) for doc in ghosts_docs],
        FILE_TEMPLATE,
    )

### 1.2.2 Extract features from non-parallel corpus (i.e., the other plays)

In [None]:
from helper.analysis import save_dataset_to_json

# Build every feature set for the non-parallel part of the Ibsen corpus -- the
# documents whose file name does NOT contain "Ghosts" -- and save each one to
# its own JSON file. A dataset is a list of (features, translator) pairs, one
# pair per selected document.
author = "Ibsen"

# Select the non-parallel sub-corpus once instead of re-filtering for every feature set.
other_docs = [doc for doc in docs[author] if "Ghosts" not in doc.filename]

# Syntactic n-grams with n in {2, 3}.
for n in range(2, 4):
    FILE_TEMPLATE = f"Ibsen_Others_syntactic_n{n}"
    save_dataset_to_json(
        [
            (doc.n_grams_syntactic(n=n, propn=False), doc.translator)
            for doc in other_docs
        ],
        FILE_TEMPLATE,
    )

# The remaining feature families are generated twice: with and without punctuation.
for punct in [True, False]:
    # File-name suffix that marks the punctuation-preserving variant.
    suffix = "_punct" if punct else ""

    # Word n-grams with n in {1, 2, 3}.
    for n in range(1, 4):
        FILE_TEMPLATE = f"Ibsen_Others_{n}grams{suffix}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=False, propn=False), doc.translator)
                for doc in other_docs
            ],
            FILE_TEMPLATE,
        )

    # POS n-grams with n in {2, 3}.
    # NOTE(review): `propn` is not passed here, unlike for the word n-grams
    # above -- confirm this is intentional.
    for n in range(2, 4):
        FILE_TEMPLATE = f"Ibsen_Others_{n}gramsPOS{suffix}"
        save_dataset_to_json(
            [
                (doc.n_grams(n=n, punct=punct, pos=True), doc.translator)
                for doc in other_docs
            ],
            FILE_TEMPLATE,
        )

    # Cohesive markers: a single dataset per punctuation setting, so no inner
    # loop is needed.
    FILE_TEMPLATE = f"Ibsen_Others_cohesive{suffix}"
    save_dataset_to_json(
        [(doc.cohesive(punct=punct), doc.translator) for doc in other_docs],
        FILE_TEMPLATE,
    )

## 2. Cleaning (Optional)

We can delete from disk the files generated during the preprocessing and syntactic feature extraction steps in the folders `Corpora/Proc_{author}` and `auxfiles/txt/{author}` using the custom function `clean_files` in the `utils` submodule in the `helper` module. 

In [None]:
from helper.utils import clean_files

# Optional and destructive: per the description above, this removes the
# intermediate files produced by the preprocessing and syntactic feature
# extraction steps. Run it only once the feature JSON files have been written.
clean_files()