# Corpus preprocessing

The corpus files are located in the raw corpus folder (*Corpora/Raw_Ibsen*, see `raw_corpus_folder` below). They were downloaded from the Project Gutenberg website.

In [1]:
import os
import nltk
from nltk import tokenize

# Folder holding the raw Project Gutenberg texts (Windows-style relative
# path ending in a separator).
raw_corpus_folder = ".\\Corpora\\Raw_Ibsen\\"
# Folder that receives the processed versions of those texts.
proc_corpus_folder = ".\\Corpora\\Proc_Ibsen\\"

# Names of every entry found in the raw corpus folder.
files_list = os.listdir(raw_corpus_folder)

In [2]:
def remove_front_back_matter(input_folder, filename, output_folder):
    """Remove legal information from Project Gutenberg files.

    Reads the file 'filename' from 'input_folder' and writes a copy to
    'output_folder' whose name is the original name with "_proc" inserted
    before a ".txt" extension. Only the lines found after a line starting
    with "*** START OF" and before the first following line starting with
    "*** END OF" are kept; the marker lines themselves are dropped.

    The output file is overwritten on every call, so running the function
    again for the same file does not duplicate its contents.

    :input_folder  'String' - name of the input folder
    :filename      'String' - name of the file to process
    :output_folder 'String' - name of the output folder

    It returns None
    """
    in_body = False
    body_lines = []
    with open(os.path.join(input_folder, filename), "r", encoding="UTF-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped.startswith("*** START OF"):
                in_body = True
            elif stripped.startswith("*** END OF"):
                # Everything after the end marker is back matter.
                break
            elif in_body:
                body_lines.append(line)

    # splitext drops whatever extension the file has instead of blindly
    # cutting the last four characters of the name.
    stem = os.path.splitext(filename)[0]
    out_path = os.path.join(output_folder, stem + "_proc.txt")
    # "w" (not "a"): the output must contain exactly one copy of the text.
    with open(out_path, "w", encoding="UTF-8") as g:
        g.writelines(body_lines)
    return None

We remove the front and back matter from each file in the raw corpus folder (`raw_corpus_folder`) and write the results to the processed corpus folder (`proc_corpus_folder`).

In [3]:
# Strip the Project Gutenberg front and back matter from every raw file
# and write the result to the processed-corpus folder.
for file in files_list:
    remove_front_back_matter(raw_corpus_folder, file, proc_corpus_folder)

In [3]:
def chunks(input_folder, filename, CHUNK_SIZE=5000):
    """Generator that yields the file as consecutive chunks of whole lines.

    Each chunk is a string made of complete lines of the file 'filename'
    in the folder 'input_folder'. Lines are accumulated until their total
    length exceeds 'CHUNK_SIZE', so a chunk is usually slightly longer than
    'CHUNK_SIZE'; the last chunk holds whatever is left and may be shorter.

    The generator stops at end of file. It never yields an empty string,
    and a file shorter than 'CHUNK_SIZE' is yielded as a single chunk.

    :input_folder  'String' - name of input folder
    :filename      'String' - name of file to process
    :CHUNK_SIZE    'Integer' - approximate size of a chunk, in characters

    yields a 'String' of roughly 'CHUNK_SIZE' characters
    """
    with open(os.path.join(input_folder, filename), "r", encoding="UTF-8") as f:
        while True:
            # readlines(hint) returns whole lines until the hint is exceeded,
            # and an empty list once the file is exhausted.
            lines = f.readlines(CHUNK_SIZE)
            if not lines:
                return
            yield "".join(lines)

In [4]:
def remover_signos(strTexto):
    """Strip non-alphanumeric symbols and digits from a text.

    Every run of non-word characters and every run of digits is replaced
    by a single space, and the result is split on whitespace. Underscores
    count as word characters and are therefore kept.

    Parameters
    --------------
    strTexto : str
                Text whose symbols and digits are to be removed.

    Returns
    --------------
    list
                The remaining words of the text, in order.
    """
    import re
    # one or more non-word characters, or one or more digits
    separators = re.compile(r"([^\w]+|[\d]+)")
    # blank out the separators, then cut the text on whitespace
    return separators.sub(r" ", strTexto).split()

def calcula_ngramas(lstTokens, n):
    """Build the n-grams of a list of tokens.

    Each n-gram is the space-joined string of 'n' consecutive tokens.
    N-grams are returned in the order in which they start in the list.

    Parameters
    -------------
    lstTokens : list
                Each element is a token (a string).
    n : int
        Size of the n-gram.

    Returns
    -------------
    list
              Each element is one n-gram."""
    # number of starting positions that still leave room for n tokens
    window_count = len(lstTokens) + 1 - n
    return [
        " ".join(lstTokens[pos] for pos in range(start, start + n))
        for start in range(window_count)
    ]

def frecuencias_ngramas(strTexto, n, tokenizer=remover_signos):
    """Count the occurrences of each n-gram in a text.

    The text is split into tokens by 'tokenizer', the n-grams of those
    tokens are built, and each n-gram is lower-cased before counting, so
    the count is case-insensitive. With the default tokenizer, punctuation,
    symbols and digits are removed first.

    Parameters
    ------------------
    strTexto : str
                Text in which to count.

    n : int
                Size of the n-gram.

    tokenizer : callable
                Function that takes the text and returns its list of tokens.

    Returns
    -----------------
    dict
    The keys are the lower-cased n-grams and the values are the number
    of occurrences of each n-gram.
    """
    tokens = tokenizer(strTexto)

    counts = {}
    for gram in calcula_ngramas(tokens, n):
        key = gram.lower()
        # a missing key starts from zero, then every occurrence adds one
        counts[key] = counts.get(key, 0) + 1
    return counts

In [5]:
def extract_features(text, n):
    """Extract n-gram frequency features from a text string.

    The text is tokenized with NLTK's 'word_tokenize' and the n-gram
    counts are computed by 'frecuencias_ngramas'.

    :text  'String' - contains the text from where to extract features
    :n     'Integer' - size of the n-grams to count

    Returns a dictionary mapping each n-gram to its number of occurrences.
    """
    return frecuencias_ngramas(text, n, tokenizer=tokenize.word_tokenize)

In [6]:
def get_translator(filename):
    """Get name of translator from file name.

    The filename must follow the convention '{Translator}_{Work}[_proc].txt';
    the translator is everything before the first underscore. A name without
    any underscore is returned unchanged.

    :filename  'String' - Name of file from which to fetch the translator name

    Returns a 'String' with the translator name.
    """
    translator, _, _ = filename.partition("_")
    return translator

In [8]:
def create_dataset(proc_corpus_folder, word="proc", n=1):
    """Create dataset from collections of files with a word in common in the name.

    Every file in 'proc_corpus_folder' whose name contains 'word' is read
    chunk by chunk with 'chunks'. Each chunk becomes one sample: a tuple
    whose first element is the features extracted from the chunk and whose
    second element is the translator name taken from the file name.

    :proc_corpus_folder  'String' - folder containing the files to analyze.
    :word                'String' - word that the files must have in common
                                    "proc" is a default common word of all
                                    processed text files.
    :n                   'Integer' - size of the n-grams used as features.

    Returns a list of the form [({features}, translator), ...]"""
    dataset = []
    for name in os.listdir(proc_corpus_folder):
        if word not in name:
            continue
        translator = get_translator(name)
        for text in chunks(proc_corpus_folder, name):
            dataset.append((extract_features(text, n), translator))
    return dataset

# SpaCy test

In [7]:
import spacy

# Load the spaCy pipeline named "en_core_web_md".
nlp = spacy.load("en_core_web_md")

In [8]:
# Names of every entry in the processed-corpus folder.
proc_files_list = os.listdir(proc_corpus_folder)

In [9]:
# Take the first listed entry as the sample file (os.listdir returns
# entries in arbitrary order).
file = proc_files_list[0]

In [10]:
# Read the sample file lazily, in small chunks (size 500).
generator = chunks(proc_corpus_folder, file, CHUNK_SIZE=500)

In [16]:
# Pull the next chunk out of the generator and display it.
string = next(generator)
print(string)

This passage is interesting as showing clearly the point of view from
which Ibsen conceived the character of Manders. In the next paragraph
of the same letter he discusses the attitude of "the so-called Liberal
press"; but as the paragraph contains the germ of _An Enemy of the
People_, it may most fittingly be quoted in the introduction to that
play.

Three days later (January 6) Ibsen wrote to Schandorph, the Danish
novelist: "I was quite prepared for the hubbub. If certain of our
Scandinavian reviewers have no talent for anything else, they have



In [17]:
# Run the spaCy pipeline over the sample chunk.
doc = nlp(string)

In [18]:
import csv

# Dump one row per token (text, coarse POS, lemma, fine-grained tag and
# dependency label) as tab-separated values. csv.writer quotes any field
# containing a tab, newline or quote character, so such a token text can
# no longer break a row apart.
# newline="" lets the csv module control the line endings itself.
with open(".\\auxfiles\\csv\\prueba.csv", "w", encoding="UTF-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["TEXT", "POS", "LEMMA", "TAG", "DEP"])
    for token in doc:
        writer.writerow([token.text, token.pos_, token.lemma_, token.tag_, token.dep_])

In [20]:
import json

# Parse a short example sentence and save its JSON representation.
doc = nlp("Mary loves John, but she does not love him back.")
with open(".\\auxfiles\\json\\prueba.json", "w") as f:
    # json.dump serializes the dictionary properly: strings are escaped and
    # booleans/None are written as true/false/null. Swapping quotes in the
    # dict's repr breaks as soon as the text contains an apostrophe.
    json.dump(doc.to_json(), f)
    f.write("\n")