In [1]:
%%capture
!pip install PyPDF2
!pip install datasets
!pip install torch
!pip install -U accelerate
!pip install -U transformers
!pip install -U datasets
!pip install transformers[torch]

In [2]:
from utils_ner import extract_text_from_pdf
from utils_ner import PyPDF2
from utils_ner import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, tokenize_and_align_tags, TrainingArguments, Trainer, TrainingArguments, pipeline




In [3]:
# Source document for the NER experiment, given as a bare file name (no directory part).
pdf_file_path = "el-amante-japones-isabel-allende.pdf"
# extract_text_from_pdf is imported from utils_ner (not shown in this notebook);
# its result is lower-cased and cleaned in the preprocessing cell further down.
extracted_text = extract_text_from_pdf(pdf_file_path)

In [4]:
from datasets import load_dataset

# Load the Spanish ('es') configuration of the CoNLL-2002 dataset.
conll_dataset = load_dataset("conll2002", 'es')
# Direct handles on the three splits.
# NOTE(review): the later cells shown in this notebook index conll_dataset /
# tokenized_conll directly instead of using these three names.
train_data = conll_dataset["train"]
validation_data = conll_dataset["validation"]
test_data = conll_dataset["test"]

Downloading data: 100%|██████████| 1.21M/1.21M [00:01<00:00, 750kB/s]
Downloading data: 100%|██████████| 251k/251k [00:00<00:00, 1.41MB/s]
Downloading data: 100%|██████████| 237k/237k [00:00<00:00, 654kB/s]


Generating train split:   0%|          | 0/8324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1916 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1518 [00:00<?, ? examples/s]

In [5]:
# Apply tokenize_and_align_tags (imported from utils_ner, not shown) to every split;
# batched=True hands the function batches of examples instead of single rows.
tokenized_conll = conll_dataset.map(tokenize_and_align_tags, batched=True)

Map:   0%|          | 0/8324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1518 [00:00<?, ? examples/s]

In [6]:
# Tag vocabulary of the task, read from the feature schema of the test split,
# plus the two lookup tables (id -> tag name, tag name -> id) handed to the model.
tag_names = conll_dataset["test"].features["ner_tags"].feature.names
id2label = {tag_id: tag_name for tag_id, tag_name in enumerate(tag_names)}
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

In [7]:
# Spanish BERT checkpoint with a token-classification head sized to the tag set.
# The head weights are not part of the checkpoint and are newly initialised
# (see the warning printed below), so the model has to be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from dataclasses import fields

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`.
# The install cell pulls the latest release, so pick whichever spelling the
# installed TrainingArguments dataclass actually declares; this keeps the cell
# working on both older and newer versions.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in {field.name for field in fields(TrainingArguments)}
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    # Relative path: the original '/ner_model_4' pointed at the root of the
    # filesystem (or of the current drive on Windows).
    output_dir='ner_model_4',
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to be accepted.
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    **{_eval_strategy_arg: "epoch"},
)

In [10]:
from utils_ner import tokenizer, data_collator

# Evaluate on the *validation* split. The training arguments set
# load_best_model_at_end=True, so the evaluation set decides which checkpoint
# is kept; using the test split for that would leak test data into model
# selection and leave no untouched data for a final score.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_conll["train"],
    eval_dataset=tokenized_conll["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [11]:
# Reload a fine-tuned checkpoint and its tokenizer from disk. This rebinds `model`
# and `tokenizer`, replacing the objects used in the training cells above.
# NOTE(review): the checkpoint lives under 'ner_model_2', which is not the
# output_dir configured for the training run in this notebook (ner_model_4) —
# confirm this is the intended model.
model = AutoModelForTokenClassification.from_pretrained('ner_model_2/checkpoint-3123')
tokenizer = AutoTokenizer.from_pretrained('ner_model_2/checkpoint-3123')

# Token-classification pipeline used by the inference cells below.
ner = pipeline("ner", model=model, tokenizer=tokenizer)

In [12]:
from utils import unicodedata, re, nltk, snowball_stemmer


def _strip_accents(text):
    """Fold `text` to plain ASCII: decompose accented characters (NFKD) and drop
    everything that has no ASCII representation."""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')


# Lower-case, fold accents to ASCII, then replace every character that is not an
# ASCII letter, digit or whitespace with a space.
text_lower = extracted_text.lower()
text_normalized = _strip_accents(text_lower)
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
filtered_text = re.sub(r'[^A-Za-z0-9\s]', ' ', text_normalized)
tokens = nltk.word_tokenize(filtered_text)

# Build the stop-word set once (the original re-read the list for every token)
# and fold it exactly like the text: after ASCII folding, accented stop words
# such as 'más' or 'también' appear as 'mas' / 'tambien' and would otherwise
# never be filtered out.
spanish_stopwords = {_strip_accents(word) for word in nltk.corpus.stopwords.words('spanish')}
tokens_no_stopwords = [word for word in tokens if word not in spanish_stopwords]
stemmed_words = [snowball_stemmer.stem(word) for word in tokens_no_stopwords]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CarolinaSoria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CarolinaSoria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CarolinaSoria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\CarolinaSoria\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CarolinaSoria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CarolinaSoria\AppData\Roaming\nltk_data...
[nltk_data

In [7]:
# Tag only the first 2000 characters of the cleaned text.
# NOTE(review): the slice is character-based, so the chunk boundary can fall
# in the middle of a word or of a name.
ner_results_1 = ner(filtered_text[0:2000])

In [8]:
# Second 2000-character chunk of the cleaned text (characters 2000-3999).
ner_results_2 = ner(filtered_text[2000:4000])

In [9]:
# Third 2000-character chunk of the cleaned text (characters 4000-5999).
ner_results_3 = ner(filtered_text[4000:6000])

In [10]:
_PERSON_TAGS = ('B-PER', 'I-PER')
_WORD_START = '▁'  # marker the tokenizer puts in front of the first piece of a word


def _follows_directly(previous, result):
    """Return False only when both tokens carry an 'index' and those show a gap."""
    prev_index, index = previous.get('index'), result.get('index')
    if prev_index is None or index is None:
        return True
    return index == prev_index + 1


def _continues_word(previous, result):
    """Return True when `result` is a further sub-word piece of `previous`'s word."""
    prev_end, start = previous.get('end'), result.get('start')
    if prev_end is not None and start is not None:
        # Character offsets touch exactly when nothing separates the two pieces.
        return start == prev_end
    # No offsets available: fall back to the word-start marker.
    return not result['word'].startswith(_WORD_START)


def _store(entities, text):
    """Append a finished entity to `entities`, skipping None and empty text."""
    name = (text or '').strip()
    if name:
        entities.append(name)


def concatenate_entities(tag_results):
    """Merge token-level person tags into whole person-name strings.

    Args:
        tag_results: iterable of dicts as produced by the token-classification
            pipeline. Each dict must have 'entity' (tag name) and 'word'
            (token text, possibly prefixed with '▁'); 'index', 'start' and
            'end' are used when present.

    Returns:
        List of person names in order of appearance. Sub-word pieces of one
        word are joined without a separator, separate words of one name are
        joined with a single space.

    Compared with a plain B-/I- scan this handles two things visible in the
    pipeline output: every sub-word piece of a word can carry a 'B-PER' tag
    (e.g. '▁tol', 'k', 'ien'), and untagged tokens are absent from the list, so
    two person tokens that are neighbours in the list need not be neighbours in
    the text.
    """
    entities = []
    text = None       # text of the entity being built; None while none is open
    previous = None   # last token merged into the open entity

    for result in tag_results:
        tag = result['entity']
        if tag not in _PERSON_TAGS:
            # Any other tag ends the current person name.
            _store(entities, text)
            text, previous = None, None
            continue

        piece = result['word'].lstrip(_WORD_START)
        attached = text is not None and _follows_directly(previous, result)
        if attached and _continues_word(previous, result):
            # Same word as the previous piece, whatever its B-/I- prefix says.
            text += piece
        elif attached and tag == 'I-PER':
            # Next word of the same name.
            text += ' ' + piece
        else:
            # 'B-PER' on a new word, or a gap in the token sequence: new name.
            _store(entities, text)
            text = piece
        previous = result

    _store(entities, text)
    return entities

In [11]:
def get_entity_types(ner_results):
    """Return the set of distinct 'entity' tags found in `ner_results`.

    Args:
        ner_results: iterable of dicts, each with an 'entity' key.

    Returns:
        A set with one element per different tag; empty for empty input.
    """
    return {prediction['entity'] for prediction in ner_results}

# Distinct tags predicted for the first 2000-character chunk.
get_entity_types(ner_results_1)

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER'}

In [12]:
# Display the raw token-level predictions for the third chunk; note in the output
# that every sub-word piece of a word gets its own entry and its own tag.
ner_results_3

[{'entity': 'B-LOC',
  'score': 0.98319006,
  'index': 13,
  'word': '▁cuba',
  'start': 42,
  'end': 46},
 {'entity': 'B-LOC',
  'score': 0.97265595,
  'index': 22,
  'word': '▁ber',
  'start': 79,
  'end': 82},
 {'entity': 'B-LOC',
  'score': 0.945956,
  'index': 23,
  'word': 'ke',
  'start': 82,
  'end': 84},
 {'entity': 'B-LOC',
  'score': 0.96639454,
  'index': 24,
  'word': 'ley',
  'start': 84,
  'end': 87},
 {'entity': 'B-PER',
  'score': 0.97835314,
  'index': 61,
  'word': '▁sen',
  'start': 265,
  'end': 268},
 {'entity': 'B-PER',
  'score': 0.97493297,
  'index': 62,
  'word': 'or',
  'start': 268,
  'end': 270},
 {'entity': 'B-PER',
  'score': 0.92876166,
  'index': 94,
  'word': '▁tol',
  'start': 435,
  'end': 438},
 {'entity': 'B-PER',
  'score': 0.8814607,
  'index': 95,
  'word': 'k',
  'start': 438,
  'end': 439},
 {'entity': 'B-PER',
  'score': 0.71108973,
  'index': 96,
  'word': 'ien',
  'start': 439,
  'end': 442},
 {'entity': 'B-PER',
  'score': 0.8217824,
  'i

In [13]:
# Total number of characters in the cleaned text; the NER cells above tag only
# the first 6000 of them.
len(filtered_text)

499183