# Preprocessing

Remove bracketed and parenthesised numbers (e.g. `[12]`, `(12)`) from the Quixote corpus, leaving the punctuation intact.

In [9]:
import os
import re

In [2]:
# Source and destination folders for the Quixote corpus (Windows-style
# relative paths). The trailing separator matters: file paths are built by
# plain string concatenation (folder + file name).
INPUT_FOLDER = '.\\Corpora\\Raw_Quixote\\'
OUTPUT_FOLDER = '.\\Corpora\\Proc_Quixote\\'
# Names of every entry in the raw folder.
filenames = os.listdir(INPUT_FOLDER)

Find the special (non-ASCII) characters used in the corpus so that they can be replaced with ASCII equivalents.

In [6]:
# Collect every distinct character that occurs anywhere in the raw files.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used -- confirm that it matches how the raw files are encoded.
chars = set()
for file in filenames:
    with open(INPUT_FOLDER + file, "r") as f:
        text = f.read()
    chars |= set(text)

# Keep only the non-ASCII characters (code point above 127).
special = [char for char in chars if ord(char) > 127]

In [4]:
# Display the non-ASCII characters found in the corpus.
special

['’', 'ü', '“', '«', 'è', 'ë', 'ù', 'à', '”', 'û', 'ç', 'é', '‘', '—', 'â']

In [5]:
# ASCII substitute for each non-ASCII character found in the Quixote corpus.
# NOTE(review): 'ç' maps to 'z' rather than 'c' -- presumably deliberate;
# confirm.
REPLACE = {
    'à': 'a',
    'é': 'e',
    '’': "'",
    '«': '"',
    'ë': 'e',
    '“': '"',
    '‘': "'",
    'ù': 'u',
    'ü': 'u',
    '”': '"',
    '—': '-',
    'û': 'u',
    'â': 'a',
    'ç': 'z',
    'è': 'e',
}
REPLACE

{'à': 'a',
 'é': 'e',
 '’': "'",
 '«': '"',
 'ë': 'e',
 '“': '"',
 '‘': "'",
 'ù': 'u',
 'ü': 'u',
 '”': '"',
 '—': '-',
 'û': 'u',
 'â': 'a',
 'ç': 'z',
 'è': 'e'}

In [29]:
def remove_numbers(text):
    """Replace bracketed or parenthesised numbers with a single space.

    A marker is a run of digits enclosed in square brackets or in
    parentheses, e.g. "[12]" or "(12)". A hyphen directly before and/or
    directly after the marker is removed together with it.

    :text 'String' - text to clean

    It returns the cleaned 'String'; text without markers is unchanged.
    """
    return re.sub(r"((-?\[\d+\]-?)|(-?\(\d+\)-?))", " ", text)

def collapse_spaces(text):
    """Replace every run of whitespace with a single space.

    Spaces, tabs and line breaks all count as whitespace. Leading and
    trailing whitespace is reduced to one space, not stripped.

    :text 'String' - text to clean

    It returns the cleaned 'String'
    """
    return re.sub(r"\s+", " ", text)


def remove_special(text, REPLACE):
    """Replace each special character in 'text' with its substitute.

    Keys are matched exactly as written (case-sensitively), so upper-case
    keys such as 'Ã' are replaced too. Replacements are applied one after
    another in the dictionary's order, so a substitute that contains a
    later key is itself replaced again.

    :text     'String' - text to clean
    :REPLACE  'Dict'   - maps each string to find to its replacement

    It returns the cleaned 'String'
    """
    for char, subs in REPLACE.items():
        text = text.replace(char, subs)
    return text

In [8]:
# Clean every raw Quixote file and save the result as <name>_proc.txt.
# NOTE(review): the input is read with the platform default encoding while
# the output is written as UTF-8 -- confirm the raw files' encoding.
for filename in filenames:
    source_path = INPUT_FOLDER + filename
    # filename[:-4] drops the last four characters (the ".txt" extension).
    target_path = OUTPUT_FOLDER + filename[:-4] + "_proc.txt"

    with open(source_path, "r") as file:
        file_content = file.read()

    # Order matters: markers are turned into spaces first, then whitespace
    # runs are collapsed, then the non-ASCII characters are substituted.
    file_content = remove_numbers(file_content)
    file_content = collapse_spaces(file_content)
    file_content = remove_special(file_content, REPLACE)

    with open(target_path, "w", encoding="UTF-8") as file:
        file.write(file_content)

# Ibsen

Preprocessing of Ibsen corpus

In [10]:
def remove_front_back_matter(input_folder, filename, output_folder):
    """Remove legal information from Project Gutenberg files.

    Reads the file 'filename' in 'input_folder' and writes, to
    'output_folder', a file with the same name plus "_proc" before the
    ".txt" extension. Only the lines strictly between the first line
    starting with "*** START OF" and the next line starting with
    "*** END OF" are kept; the marker lines themselves are dropped.
    If no start marker is found the output file is empty.

    The output file is overwritten on every call, so running the
    function again does not duplicate the text.

    Folder names must end with a path separator because paths are built
    by plain concatenation. The last four characters of 'filename' are
    assumed to be the extension.

    :input_folder  'String' - name of the input folder
    :filename      'String' - name of the file to process
    :output_folder 'String' - name of the output folder

    It returns None
    """
    body = []
    inside = False
    with open(input_folder + filename, "r", encoding="UTF-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped.startswith("*** START OF"):
                inside = True
            elif stripped.startswith("*** END OF"):
                # Everything after the end marker is back matter.
                break
            elif inside:
                body.append(line)

    out_path = "".join([output_folder, filename[:-4], "_proc.txt"])
    # "w" rather than "a": appending would double the content on a re-run.
    with open(out_path, "w", encoding="UTF-8") as g:
        g.writelines(body)
    return None


def chunks(input_folder, filename, CHUNK_SIZE=5000):
    """Generator that yields the file as consecutive chunks of whole lines.

    Each chunk is produced by 'readlines(CHUNK_SIZE)': whole lines are
    read until their combined length passes CHUNK_SIZE, so a chunk never
    splits a line and is usually a little longer than CHUNK_SIZE. The
    last chunk holds whatever is left and may be shorter.

    Reading continues until the end of the file, so no text is dropped
    (also for files shorter than CHUNK_SIZE) and no empty chunk is
    yielded. An empty file yields nothing. Per the io documentation a
    CHUNK_SIZE of 0 or less is treated as "no limit", which gives a
    single chunk holding the whole file.

    :input_folder  'String' - name of input folder (with trailing separator)
    :filename      'String' - name of file to process
    :CHUNK_SIZE    'Integer' - approximate size of a chunk, in characters

    yields a 'String' made of whole lines, roughly 'CHUNK_SIZE' long
    """
    with open(input_folder + filename, "r", encoding="UTF-8") as f:
        while True:
            lines = f.readlines(CHUNK_SIZE)
            if not lines:
                # End of file reached.
                break
            yield "".join(lines)

In [11]:
# Folders holding the raw Ibsen texts and their processed versions. The
# trailing separator is required: paths are built by string concatenation.
RAW_IBSEN_FOLDER = ".\\Corpora\\Raw_Ibsen\\"
PROC_IBSEN_FOLDER = ".\\Corpora\\Proc_Ibsen\\"

# Names of every entry in the raw folder.
file_list = os.listdir(RAW_IBSEN_FOLDER)

In [12]:
# Display the raw Ibsen file names.
file_list

['Archer_Ghosts.txt',
 'Archer_John_Gabriel_Borkman.txt',
 'Archer_Little_Eyolf.txt',
 'Archer_When_We_Dead_Awaken.txt',
 'Sharp_An_Enemy_Of_The_People.txt',
 'Sharp_Ghosts.txt',
 'Sharp_Pillars_Of_Society.txt',
 'Sharp_Rosmersholm.txt']

In [13]:
# Strip the Project Gutenberg front/back matter from every raw Ibsen file;
# each result is written to PROC_IBSEN_FOLDER as <name>_proc.txt.
for file in file_list:
    remove_front_back_matter(RAW_IBSEN_FOLDER, file, PROC_IBSEN_FOLDER)

In [15]:
# List and display the contents of the processed Ibsen folder.
proc_file_list = os.listdir(PROC_IBSEN_FOLDER)
proc_file_list

['Archer_Ghosts_proc.txt',
 'Archer_John_Gabriel_Borkman_proc.txt',
 'Archer_Little_Eyolf_proc.txt',
 'Archer_When_We_Dead_Awaken_proc.txt',
 'Sharp_An_Enemy_Of_The_People_proc.txt',
 'Sharp_Ghosts_proc.txt',
 'Sharp_Pillars_Of_Society_proc.txt',
 'Sharp_Rosmersholm_proc.txt']

In [26]:
# Split every processed file into numbered part files
# (<name>_part001.txt, <name>_part002.txt, ...) in the same folder.
# NOTE(review): the part files are written with the platform default encoding,
# whereas chunks() reads its input as UTF-8 -- confirm this is intended.
for file in proc_file_list:
    str_gen = chunks(PROC_IBSEN_FOLDER, file, CHUNK_SIZE=5000)
    num = 0
    for num, chunk in enumerate(str_gen, start=1):
        part_path = PROC_IBSEN_FOLDER + f"{file[:-4]}_part{num:03}.txt"
        with open(part_path, "w") as f:
            f.write(chunk)

In [30]:
# Re-list the processed folder and show the first ten names whose stem does
# not end in "proc", i.e. the part files rather than the whole *_proc.txt files.
file_list = os.listdir(PROC_IBSEN_FOLDER)
[file for file in file_list if not file[:-4].endswith("proc")][:10]

['Archer_Ghosts_proc_part001.txt',
 'Archer_Ghosts_proc_part002.txt',
 'Archer_Ghosts_proc_part003.txt',
 'Archer_Ghosts_proc_part004.txt',
 'Archer_Ghosts_proc_part005.txt',
 'Archer_Ghosts_proc_part006.txt',
 'Archer_Ghosts_proc_part007.txt',
 'Archer_Ghosts_proc_part008.txt',
 'Archer_Ghosts_proc_part009.txt',
 'Archer_Ghosts_proc_part010.txt']

In [33]:
# Re-point the generic INPUT_FOLDER / filenames / OUTPUT_FOLDER names (first
# set for the Quixote corpus) at the Ibsen data.
INPUT_FOLDER = ".\\Corpora\\Proc_Ibsen\\"
filenames = os.listdir(INPUT_FOLDER)

OUTPUT_FOLDER = ".\\Corpora\\Proc_Ibsen_final\\"

In [34]:
# Collect every distinct character that occurs in the processed Ibsen files.
# NOTE(review): open() is called without an explicit encoding. The characters
# this scan reports ('Ã', '©', '¶', ...) look like UTF-8 text decoded with a
# single-byte default codec -- confirm which encoding these files really use.
chars = set()
for file in filenames:
    with open(INPUT_FOLDER + file, "r") as f:
        text = f.read()
    chars |= set(text)

# Keep only the non-ASCII characters (code point above 127).
special = [char for char in chars if ord(char) > 127]

In [35]:
# Display the non-ASCII characters found in the Ibsen files.
special

['¶',
 '¼',
 'â',
 'ü',
 'ó',
 'ª',
 '«',
 '³',
 '¢',
 'ë',
 'Ã',
 '©',
 'ö',
 'é',
 'º',
 'ê',
 'ú']

In [36]:
# Substitute for each non-ASCII character found in the Ibsen files; a
# character mapped to '' is dropped. This rebinds the REPLACE name that was
# used earlier for the Quixote corpus.
REPLACE = {
    '¶': '',
    '¼': '',
    'â': 'a',
    'ü': 'u',
    'ó': 'o',
    'ª': 'a.',
    '«': '"',
    '³': '',
    '¢': '',
    'ë': 'e',
    'Ã': 'A',
    '©': '',
    'ö': 'o',
    'é': 'e',
    'º': 'o.',
    'ê': 'e',
    'ú': 'u',
}
REPLACE

{'¶': '',
 '¼': '',
 'â': 'a',
 'ü': 'u',
 'ó': 'o',
 'ª': 'a.',
 '«': '"',
 '³': '',
 '¢': '',
 'ë': 'e',
 'Ã': 'A',
 '©': '',
 'ö': 'o',
 'é': 'e',
 'º': 'o.',
 'ê': 'e',
 'ú': 'u'}

In [37]:
# Clean every Ibsen part file (names whose stem does not end in "proc") and
# save the result in OUTPUT_FOLDER as <name>_proc.txt.
# NOTE(review): the input is read with the platform default encoding while
# the output is written as UTF-8 -- confirm the part files' encoding.
part_files = [name for name in file_list if not name[:-4].endswith("proc")]
for filename in part_files:
    source_path = INPUT_FOLDER + filename
    target_path = OUTPUT_FOLDER + filename[:-4] + "_proc.txt"

    with open(source_path, "r") as file:
        file_content = file.read()

    # Order matters: markers are turned into spaces first, then whitespace
    # runs are collapsed, then the non-ASCII characters are substituted.
    file_content = remove_numbers(file_content)
    file_content = collapse_spaces(file_content)
    file_content = remove_special(file_content, REPLACE)

    with open(target_path, "w", encoding="UTF-8") as file:
        file.write(file_content)