# Preprocessing

## Quixote

Remove bracketed numbers such as `[12]` or `(12)` from the Quixote corpus. Punctuation is left untouched.

In [37]:
import re
import os
from pathlib import Path

In [23]:
# Root folder of all corpora; the relative path is resolved against the
# current working directory.
CORPORA = Path(r"./Corpora/")

In [23]:
# Raw Quixote texts are read from INPUT_FOLDER; the cleaned copies are
# written to OUTPUT_FOLDER.
INPUT_FOLDER = CORPORA/"Raw_Quixote/"
OUTPUT_FOLDER = CORPORA/"Proc_Quixote/"

# exist_ok=True creates the folder only when it is missing, without a
# separate exists() check that could race with another process.
OUTPUT_FOLDER.mkdir(exist_ok=True)

Find the non-ASCII characters used in the corpus so that they can be replaced with ASCII equivalents.

In [25]:
# Gather every distinct character that occurs anywhere in the input files.
chars = set()
for file in INPUT_FOLDER.iterdir():
    text = file.read_text()
    chars = chars | set(text)

# Code points above 127 are outside ASCII; these are the ones to replace.
special = [char for char in chars if ord(char) > 127]

In [26]:
special

['ü', '—', 'è', '‘', '“', 'ù', 'û', '«', 'é', '’', 'â', 'ç', '”', 'ë', 'à']

In [27]:
# ASCII substitute for every non-ASCII character found in the Quixote corpus.
# Insertion order is kept because the replacements are applied one by one.
REPLACE = {
    'à': 'a',
    'é': 'e',
    '’': "'",
    '«': '"',
    'ë': 'e',
    '“': '"',
    '‘': "'",
    'ù': 'u',
    'ü': 'u',
    '”': '"',
    '—': '-',
    'û': 'u',
    'â': 'a',
    'ç': 'z',
    'è': 'e',
}
REPLACE

{'à': 'a',
 'é': 'e',
 '’': "'",
 '«': '"',
 'ë': 'e',
 '“': '"',
 '‘': "'",
 'ù': 'u',
 'ü': 'u',
 '”': '"',
 '—': '-',
 'û': 'u',
 'â': 'a',
 'ç': 'z',
 'è': 'e'}

In [28]:
def remove_numbers(text):
    """Replace bracketed numbers with a single space.

    A match is a run of digits wrapped in square brackets or in
    parentheses, e.g. "[12]" or "(3)", together with an optional hyphen
    directly before and/or after it.

    :text    'String' - text to clean

    It returns the cleaned 'String'
    """
    return re.sub(r"((-?\[\d+\]-?)|(-?\(\d+\)-?))", r" ", text)

def collapse_spaces(text):
    """Replace every run of whitespace with a single space.

    Newlines and tabs count as whitespace too, so the result is a
    single line. Leading and trailing whitespace is collapsed to one
    space, not stripped.

    :text    'String' - text to clean

    It returns the cleaned 'String'
    """
    return re.sub(r"\s+", r" ", text)


def remove_special(text, REPLACE):
    """Apply the substitutions of a replacement table to a text.

    The entries are applied one after another in the iteration order of
    the table, so a later entry also sees the output of earlier ones.
    The lower-cased form of each key is what is searched for; other
    casings of the character are left untouched.

    :text       'String' - text to clean
    :REPLACE    'Dict'   - maps each character to its substitute

    It returns the cleaned 'String'
    """
    cleaned = text
    for original in REPLACE:
        cleaned = cleaned.replace(original.lower(), REPLACE[original])
    return cleaned

In [29]:
# Clean every file in the input folder and store the result in the output
# folder as "<stem>_proc.txt".
for filename in INPUT_FOLDER.iterdir():
    file_content = filename.read_text()
    file_content = remove_numbers(file_content)
    file_content = collapse_spaces(file_content)
    file_content = remove_special(file_content, REPLACE)
    target = OUTPUT_FOLDER/(filename.stem + "_proc.txt")
    target.write_text(file_content, encoding="UTF-8")

## Ibsen

Preprocessing of Ibsen corpus

In [40]:
def remove_front_back_matter(filename, output_folder):
    """Remove legal information from Project Gutenberg files.

    Reads the file 'filename' and keeps only the lines between the line
    starting with "*** START OF" and the line starting with "*** END OF"
    (both marker lines are dropped). The result is written to
    'output_folder' under the original file stem followed by
    "_proc.txt". An existing output file is overwritten, so running the
    function again does not duplicate the text.

    :filename         'Path' - file to process
    :output_folder    'Path' - folder that receives the processed file

    It returns None
    """
    lines = []
    write = False
    with open(filename, "r", encoding="UTF-8") as f:
        for line in f:
            marker = line.strip()
            if marker.startswith("*** START OF"):
                write = True
            elif marker.startswith("*** END OF"):
                # Everything after the end marker is back matter.
                break
            elif write:
                lines.append(line)

    # Mode "w" rather than "a": appending would add the same text again
    # every time the file is processed.
    with open(output_folder/(filename.stem + "_proc.txt"), "w", encoding="UTF-8") as g:
        g.writelines(lines)
    return None


def chunks(filename, CHUNK_SIZE=5000):
    """Generator that yields the content of a file chunk by chunk.

    Every chunk is made of whole lines. Lines are collected until their
    total length reaches 'CHUNK_SIZE', so a chunk can be slightly longer
    than 'CHUNK_SIZE' and the last one can be shorter. The chunks
    joined together give back the complete file; empty chunks are never
    yielded.

    :filename      'Path'    - name of file to process
    :CHUNK_SIZE    'Integer' - approximate size of a chunk in characters

    yields a 'String' with the next chunk of the file
    """
    with open(filename, "r", encoding="UTF-8") as f:
        while True:
            # Read until the file is exhausted instead of guessing the
            # number of chunks from the file size: the guess skipped
            # files smaller than CHUNK_SIZE and could drop the tail.
            lines = f.readlines(CHUNK_SIZE)
            if not lines:
                break
            yield "".join(lines)

In [41]:
# Raw Ibsen texts are read from RAW_IBSEN_FOLDER; the stripped and chunked
# files are written to PROC_IBSEN_FOLDER.
RAW_IBSEN_FOLDER = CORPORA/"Raw_Ibsen/"
PROC_IBSEN_FOLDER = CORPORA/"Proc_Ibsen_/"

# exist_ok=True creates the folder only when it is missing, without a
# separate exists() check that could race with another process.
PROC_IBSEN_FOLDER.mkdir(exist_ok=True)

In [42]:
# Strip the Project Gutenberg front/back matter from every entry of the raw
# Ibsen folder; the results are written to PROC_IBSEN_FOLDER.
for file in RAW_IBSEN_FOLDER.iterdir():
    remove_front_back_matter(file, PROC_IBSEN_FOLDER)

In [45]:
# Split every stripped "*proc.txt" file into numbered part files.
# Only stems ending in "proc" are selected: the parts are written into this
# same folder, so a plain ".txt" filter would chunk the parts again on a
# re-run. The file list is built before the loop for the same reason.
# NOTE: the parts are written with the platform default encoding.
for file in [file for file in PROC_IBSEN_FOLDER.iterdir()
             if file.suffix == ".txt" and file.stem.endswith("proc")]:
    for num, chunk in enumerate(chunks(file, CHUNK_SIZE=5000), start=1):
        with open(PROC_IBSEN_FOLDER/(file.stem + f"_part{num:03}.txt"), "w") as f:
            f.write(chunk)

In [55]:
# The final cleaning step reads the chunked Ibsen files and writes its
# results to a separate folder.
INPUT_FOLDER = PROC_IBSEN_FOLDER
OUTPUT_FOLDER = CORPORA/"Proc_Ibsen_final_/"

# exist_ok=True creates the folder only when it is missing, without a
# separate exists() check that could race with another process.
OUTPUT_FOLDER.mkdir(exist_ok=True)

In [56]:
# Gather every distinct character that occurs in the chunked text files.
chars = set()
for file in INPUT_FOLDER.iterdir():
    # Only ".txt" files whose stem does not end in "proc" are scanned.
    if file.suffix != ".txt" or file.stem.endswith("proc"):
        continue
    text = file.read_text()
    chars = chars | set(text)

# Code points above 127 are outside ASCII; these are the ones to replace.
special = [char for char in chars if ord(char) > 127]

In [57]:
special

['ê', 'ü', 'é', 'â', 'ú', 'ó', 'ö', 'ë']

In [58]:
# ASCII substitute for every non-ASCII character found in the Ibsen corpus.
REPLACE = {
    'ê': 'e',
    'ü': 'u',
    'é': 'e',
    'â': 'a',
    'ú': 'u',
    'ó': 'o',
    'ö': 'o',
    'ë': 'e',
}

In [60]:
# Clean every chunked text file and store the result in the output folder
# as "<stem>_proc.txt".
for file in INPUT_FOLDER.iterdir():
    # Only ".txt" files whose stem does not end in "proc" are processed.
    if file.suffix != ".txt" or file.stem.endswith("proc"):
        continue
    file_content = file.read_text()
    file_content = remove_numbers(file_content)
    file_content = collapse_spaces(file_content)
    file_content = remove_special(file_content, REPLACE)
    target = OUTPUT_FOLDER/(file.stem + "_proc.txt")
    target.write_text(file_content, encoding="UTF-8")