# 1. Clean the text

In [None]:
# Input (raw OCR/transcribed text) and output (cleaned text) locations.
# Keeping them in variables guarantees the final message names the file
# that was actually written.
input_path = 'dirty_txt/eduardo_l_holmberg_horacio_kaliban.txt'
output_path = 'clean_first_txt/eduardo_l_holmberg_horacio_kaliban_cor.txt'

# Read the file as a list of lines
with open(input_path, 'r', encoding='utf-8') as file:
    text = file.readlines()

# Remove unnecessary line breaks and join split words.
# A line ending in "¬" marks a word hyphenated across a line break: the marker
# is dropped and the next line is glued on without a space. Every other line
# is followed by a single space.
pieces = []
for line in text:
    line = line.strip()
    if line.endswith("¬"):
        pieces.append(line[:-1])
    else:
        pieces.append(line + " ")
# Join once at the end instead of growing a string with += inside the loop.
text_modified = "".join(pieces)

# Write the cleaned text to the output file
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(text_modified)

print("Process completed. The modified text has been saved in '" + output_path + "'.")

# 2. Tokenization with spaCy

In [1]:
# Tokenization

import spacy

# Load the spaCy Spanish model
nlp = spacy.load("es_core_news_sm")

# Read the cleaned text
with open('clean_first_txt/eduardo_l_holmberg_horacio_kaliban_cor.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Process the text with spaCy
doc = nlp(text)

# Tokens.
# Whitespace-only tokens are skipped: written one-per-line they would become
# blank lines, which the NER step below turns into empty sentences and
# malformed " O" rows in the annotated file.
tokens = [token.text for token in doc if not token.is_space]

# Save tokens in a file, one token per line
with open('spaCy_and_Flair/eduardo_l_holmberg_horacio_kaliban_tokenized.txt', 'w', encoding='utf-8') as f:
    for token in tokens:
        f.write(token + '\n')

print("The result has been saved in spaCy_and_Flair")

The result has been saved in spaCy_and_Flair


# 3. NER with Flair

In [1]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Load Flair NER model
tagger = SequenceTagger.load("flair/ner-spanish-large")

# Read the tokens from the tokenized file written by the tokenization step.
# The path must match the directory that step writes to (spaCy_and_Flair).
# Blank lines are dropped so that no empty Sentence is created and no
# token-less " O" row is written.
with open('spaCy_and_Flair/eduardo_l_holmberg_horacio_kaliban_tokenized.txt', 'r', encoding='utf-8') as file:
    tokens = [line.strip() for line in file if line.strip()]

# Create one single-token sentence per token.
# NOTE(review): each token is tagged in isolation, so the model sees no
# surrounding context; consider tagging whole sentences instead.
sentences = [Sentence(token) for token in tokens]

# Predict NER tags for all sentences in one batched call instead of one
# model invocation per token.
if sentences:
    tagger.predict(sentences, mini_batch_size=32)

# Tag words and save in a file, one "token TAG" pair per line.
# Only locations are kept: a token recognised as LOC gets B-LOC when the
# previous token was O, and I-LOC when it directly follows another LOC token.
with open('spaCy_and_Flair/eduardo_l_holmberg_horacio_kaliban_annotated.txt', 'w', encoding='utf-8') as output_file:
    prev_tag = 'O'
    for token, sentence in zip(tokens, sentences):
        tag = 'O'
        if any(span.tag == 'LOC' for span in sentence.get_spans('ner')):
            tag = 'B-LOC' if prev_tag == 'O' else 'I-LOC'
        output_file.write(token + ' ' + tag + '\n')
        prev_tag = tag

print("The result has been saved in spaCy_and_Flair")

2024-07-31 09:49:24,165 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-LOC, S-ORG, B-PER, I-PER, E-PER, S-MISC, B-ORG, E-ORG, S-PER, I-ORG, B-LOC, E-LOC, B-MISC, E-MISC, I-MISC, I-LOC, <START>, <STOP>
The result has been saved in spaCy_and_Flair


# 4. Correction: insert blank lines between sentences

In [2]:
import re

def add_blank_lines_correctly(text):
    """Insert a blank line after every sentence-ending punctuation row.

    ``text`` is expected to hold one ``"<token> <tag>"`` pair per line. A
    blank line is added after each line whose token consists only of the
    characters ``. ! ? ; :`` and whose tag is ``O``.

    The match is anchored to the start of the line, so tokens that merely end
    in one of those characters (abbreviations such as ``Sr.`` or ``etc.``)
    no longer trigger a sentence break in the middle of a phrase.

    Args:
        text: Annotated text, one token/tag pair per line.

    Returns:
        The text with an extra newline after each punctuation-only ``O`` row.
    """
    # One anchored pass replaces five sequential substitutions; "+" also covers
    # multi-character punctuation tokens such as "..." or "?!".
    return re.sub(r'^([.!?;:]+ O)\n', r'\1\n\n', text, flags=re.MULTILINE)

# Load the annotated token/tag file produced by the NER step
with open(
    'spaCy_and_Flair/eduardo_l_holmberg_horacio_kaliban_annotated.txt',
    'r',
    encoding='utf-8',
) as annotated_file:
    text = annotated_file.read()

# Insert the sentence-separating blank lines
corrected_text = add_blank_lines_correctly(text)

# Store the result next to the original annotated file
with open(
    'spaCy_and_Flair/eduardo_l_holmberg_horacio_kaliban_annotated_corrected.txt',
    'w',
    encoding='utf-8',
) as corrected_file:
    corrected_file.write(corrected_text)

print("The result has been saved in spaCy_and_Flair")

The result has been saved in spaCy_and_Flair
