In [10]:
# 26/01/2024

# python -m spacy download en_core_web_sm --user

<span style="font-size: 20pt; font-weight: bold; color: #0098cd;">4.11 - Usar de NER</span>

# Parte 1: Carga y preprocesamiento del texto a analizar

In [11]:
import pathlib
import spacy
from spacy import displacy
import en_core_web_sm

In [12]:
### Importacion de otras bibliotecas que hemos necesitado.
import pandas as pd ### Para poder trabajar con dataframes que facilitan la visualizacion
import re ### Para realizar el preprocesamiento 

In [13]:
### Pre-procesamiento nltk: cada frase hasta un '.' en cada linea

import nltk
from nltk.tokenize import sent_tokenize

# Download the sentence tokenizer model (you only need to do this once)
nltk.download('punkt')

def process_text(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    with open(output_file, 'w', encoding='utf-8') as file:
        # Write each sentence to a new line in the output file
        file.write('\n'.join(sentences))

# Replace 'input.txt' with the path to your input file and 'output.txt' with your desired output file
process_text('barack-obama-speech.txt', 'nltk-output.txt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AlejandroDiezRedondo\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
### Pre-procesamiento vanilla: cada frase hasta un '.' en cada linea

# Hay problemas con cosas como: 
#   Sen. McCain --> utilizar nltk da mejores resultados (se queda aqui por verlo)

'''
def process_text(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split the text into sentences based on periods and other punctuation
    # Strip removes code before and after the full sentence
    sentences = [sentence.strip()+'.' for sentence in text.split('.')]

    with open(output_file, 'w', encoding='utf-8') as file:
        # Write each sentence to a new line in the output file
        file.write('\n'.join(sentences))

# Replace 'input.txt' with the path to your input file and 'output.txt' with your desired output file
process_text('barack-obama-speech.txt', 'vanilla-output.txt')
'''

"\ndef process_text(input_file, output_file):\n    with open(input_file, 'r', encoding='utf-8') as file:\n        text = file.read()\n\n    # Split the text into sentences based on periods and other punctuation\n    sentences = [sentence.strip()+'.' for sentence in text.split('.')]\n\n    with open(output_file, 'w', encoding='utf-8') as file:\n        # Write each sentence to a new line in the output file\n        file.write('\n'.join(sentences))\n\n# Replace 'input.txt' with the path to your input file and 'output.txt' with your desired output file\nprocess_text('barack-obama-speech.txt', 'vanilla-output.txt')\n"

In [15]:
nlp = en_core_web_sm.load()
file_name = "barack-obama-speech.txt"
doc = nlp(pathlib.Path(file_name).read_text(encoding="utf-8"))

In [16]:
# Visualizar la información leida.
doc

Hello, Chicago.
If there is anyone out there who still doubts that America is a place where all things are possible, who still wonders if the dream of our founders is alive in our time, who still questions the power of our democracy, tonight is your answer.
It’s the answer told by lines that stretched around schools and churches in numbers this nation has never seen, by people who waited three hours and four hours, many for the first time in their lives, because they believed that this time must be different, that their voices could be that difference.
It’s the answer spoken by young and old, rich and poor, Democrat and Republican, black, white, Hispanic, Asian, Native American, gay, straight, disabled and not disabled. Americans who sent a message to the world that we have never been just a collection of individuals or a collection of red states and blue states.
We are, and always will be, the United States of America.
It’s the answer that led those who’ve been told for so long by so 

In [17]:
# Visualizar una parte concreta de la información 
doc[3:29] 

.
If there is anyone out there who still doubts that America is a place where all things are possible, who still wonders if

In [18]:
# Visualizar una parte de la información tal y como está guardada. 
doc[3:29].text

'.\nIf there is anyone out there who still doubts that America is a place where all things are possible, who still wonders if'

In [19]:
display('Visualizar el resultado con display:',doc[5:19].text)

'Visualizar el resultado con display:'

'If there is anyone out there who still doubts that America is a place'

In [20]:
print('Visualizar el resultado con print:\n',doc[5:19].text)

Visualizar el resultado con print:
 If there is anyone out there who still doubts that America is a place


In [21]:
print('El total de elementos contenidos en este fragmento de texto es: ', len(doc[5:19]))

# Visualizar exáctamente cuáles son esos elementos:
print('\nVisualizar exáctamente cuáles son esos elementos:')

for i in doc[5:19]:
    display(i.text)

El total de elementos contenidos en este fragmento de texto es:  14

Visualizar exáctamente cuáles son esos elementos:


'If'

'there'

'is'

'anyone'

'out'

'there'

'who'

'still'

'doubts'

'that'

'America'

'is'

'a'

'place'

<span style="font-size: 14pt; font-weight: bold; color: #0098cd;">¿Qué atributo del token contiene la etiqueta NER?</span>

In [22]:
print('El listado de entidades presentes en el texto junto con su correspondiente etiqueta es el siguente:\n')

for token in doc[:]:
    if token.ent_type_:
        print(f"{token.text} ({token.ent_type_})")

El listado de entidades presentes en el texto junto con su correspondiente etiqueta es el siguente:

Chicago (GPE)
America (GPE)
tonight (TIME)
three (TIME)
hours (TIME)
four (TIME)
hours (TIME)
first (ORDINAL)
Democrat (NORP)
Republican (NORP)
Hispanic (NORP)
Asian (NORP)
Native (NORP)
American (NORP)
Americans (NORP)
the (GPE)
United (GPE)
States (GPE)
of (GPE)
America (GPE)
tonight (TIME)
America (GPE)
McCain (PERSON)
McCain (PERSON)
America (GPE)
Palin (PERSON)
the (DATE)
months (DATE)
ahead (DATE)
Scranton (GPE)
Delaware (GPE)
the (GPE)
United (GPE)
States (GPE)
Joe (PERSON)
Biden (PERSON)
tonight (TIME)
the (DATE)
last (DATE)
16 (DATE)
years (DATE)
next (DATE)
first (DATE)
Michelle (PERSON)
Obama (PERSON)
Sasha (GPE)
Malia (PERSON)
White (ORG)
House (ORG)
tonight (TIME)
Maya (PERSON)
Alma (PERSON)
David (PERSON)
Plouffe (PERSON)
the (GPE)
United (GPE)
States (GPE)
of (GPE)
America (GPE)
David (PERSON)
Axelrod (PERSON)
Washington (GPE)
Des (GPE)
Moines (GPE)
Concord (GPE)
Charlest

In [23]:
# Visualizar las entidades, cada una de un color.
spacy.displacy.render(doc, style='ent',jupyter=True)

<span style="font-size: 14pt; font-weight: bold; color: #0098cd;">¿Qué entidades soporta Spacy?, ¿Qué significa cada una?</span>

In [24]:
# Obtener las etiquetas de las entidades
labels = nlp.get_pipe("ner").labels

# Imprimir las etiquetas y su significado
for label in labels:
    print(f'Entidad ---------> {label}: {spacy.explain(label)}')
    print('')


Entidad ---------> CARDINAL: Numerals that do not fall under another type

Entidad ---------> DATE: Absolute or relative dates or periods

Entidad ---------> EVENT: Named hurricanes, battles, wars, sports events, etc.

Entidad ---------> FAC: Buildings, airports, highways, bridges, etc.

Entidad ---------> GPE: Countries, cities, states

Entidad ---------> LANGUAGE: Any named language

Entidad ---------> LAW: Named documents made into laws.

Entidad ---------> LOC: Non-GPE locations, mountain ranges, bodies of water

Entidad ---------> MONEY: Monetary values, including unit

Entidad ---------> NORP: Nationalities or religious or political groups

Entidad ---------> ORDINAL: "first", "second", etc.

Entidad ---------> ORG: Companies, agencies, institutions, etc.

Entidad ---------> PERCENT: Percentage, including "%"

Entidad ---------> PERSON: People, including fictional

Entidad ---------> PRODUCT: Objects, vehicles, foods, etc. (not services)

Entidad ---------> QUANTITY: Measurements

<span style="font-size: 14pt; font-weight: bold; color: #0098cd;">¿Qué entidades diferentes son reconocidas en el texto?, ¿cuántas hay de cada tipo?</span>

In [25]:
# Se crea un diccionario vacio donde posteriormente se incluiran las entidades de cada tipo y su cantidad de apariciones.
dict_entidades_conteo = {}

# Como en pasos anteriores se recorren las entidades del objeto doc a través de un bucle for. 
for entidad in doc.ents:
    
    label = entidad.label_
    # Se incrementa cada vez más el contador para cada etiqueta
    dict_entidades_conteo[label] = dict_entidades_conteo.get(label, 0) + 1

print('Las entidades reconocidas en el texto junto con su correspondiente numero de apareciones vienen descritas en el siguente diccionario: \n')
print(dict_entidades_conteo)

Las entidades reconocidas en el texto junto con su correspondiente numero de apareciones vienen descritas en el siguente diccionario: 

{'GPE': 31, 'TIME': 18, 'ORDINAL': 2, 'NORP': 12, 'PERSON': 13, 'DATE': 15, 'ORG': 6, 'MONEY': 1, 'CARDINAL': 8, 'LOC': 2, 'FAC': 1}


In [26]:
# Para visualizar el contenido textual de cada entidad:
for token in doc:
    if token.ent_type_ == 'MONEY' or token.ent_type_ == 'LOC': 
        print(f'El ejemplo de ',token.ent_type_,'  es: ' ,token.text)

El ejemplo de  MONEY   es:  $
El ejemplo de  MONEY   es:  5
El ejemplo de  MONEY   es:  and
El ejemplo de  MONEY   es:  $
El ejemplo de  MONEY   es:  10
El ejemplo de  MONEY   es:  and
El ejemplo de  MONEY   es:  $
El ejemplo de  MONEY   es:  20
El ejemplo de  LOC   es:  Earth
El ejemplo de  LOC   es:  the
El ejemplo de  LOC   es:  moon


In [27]:
print('Ejemplos de la entidad GPE: \n')
for token in doc:
    if token.ent_type_ == 'GPE': 
        print(token.text)

Ejemplos de la entidad GPE: 

Chicago
America
the
United
States
of
America
America
America
Scranton
Delaware
the
United
States
Sasha
the
United
States
of
America
Washington
Des
Moines
Concord
Charleston
Iraq
Afghanistan
America
America
America
America
America
Atlanta
America
Birmingham
Selma
Atlanta
Berlin
America
America
America
the
United
States
of
America
