In [1]:
# Daniel Bandala @ oct 2022
from math import log
import numpy as np
from nltk import download, NLTKWordTokenizer
from nltk.tokenize import word_tokenize,sent_tokenize,TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# Information extraction
Information extraction (IE) is the task of automatically extracting structured information from unstructured and/or semi-structured machine-readable documents and other electronically represented sources. In most cases this activity concerns processing human language texts by means of natural language processing (NLP). Recent activities in multimedia document processing, like automatic annotation and content extraction out of images/audio/video/documents, can also be seen as information extraction. Information extraction is part of a larger puzzle which deals with the problem of devising automatic methods for text management, beyond its transmission, storage and display. The discipline of information retrieval (IR) has developed automatic methods, typically of a statistical flavor, for indexing large document collections and classifying documents. Another complementary approach is that of natural language processing (NLP), which has solved the problem of modelling human language processing with considerable success when taking into account the magnitude of the task. In terms of both difficulty and emphasis, IE deals with tasks in between both IR and NLP. In terms of input, IE assumes the existence of a set of documents in which each document follows a template, i.e. describes one or more entities or events in a manner that is similar to those in other documents but differing in the details. As an example, consider a group of newswire articles on Latin American terrorism, with each article presumed to be based upon one or more terrorist acts. We also define for any given IE task a template, which is a set of case frames to hold the information contained in a single document.

In [2]:
# download the 'punkt' tokenizer data package through nltk.download
download('punkt')

[nltk_data] Downloading package punkt to /home/bandala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Toy corpus: three short Spanish sentences used as the documents.
doc1, doc2, doc3 = (
    "No sé con qué armas se peleará la tercera guerra mundial, pero la cuarta se peleará con palos y piedras",
    "El fin de la segunda guerra mundial llegó con las bombas atómicas lanzadas en Japón.",
    "La casa se está incendiando porque le cayeron bombas.",
)

In [4]:
# tokenization
# The documents are written in Spanish, so request the Spanish model
# explicitly; word_tokenize otherwise defaults to language='english'.
doc1_tok = word_tokenize(doc1, language='spanish')
doc2_tok = word_tokenize(doc2, language='spanish')
doc3_tok = word_tokenize(doc3, language='spanish')
print("Tokens document 1: ",len(doc1_tok))
print("Tokens document 2: ",len(doc2_tok))
print("Tokens document 3: ",len(doc3_tok))

Tokens document 1:  21
Tokens document 2:  16
Tokens document 3:  10


In [5]:
# stemming: reduce every token of each document with the Spanish Snowball stemmer
stemmer = SnowballStemmer('spanish')
doc1_stem = list(map(stemmer.stem, doc1_tok))
doc2_stem = list(map(stemmer.stem, doc2_tok))
doc3_stem = list(map(stemmer.stem, doc3_tok))
# one stem is produced per token, so these sizes mirror the token counts
for label, stems in (
    ("Stems document 1: ", doc1_stem),
    ("Stems document 2: ", doc2_stem),
    ("Stems document 3: ", doc3_stem),
):
    print(label, len(stems))

Stems document 1:  21
Stems document 2:  16
Stems document 3:  10


In [6]:
# conditioning: the corpus is built from the raw token lists (not the stems),
# and `words` holds the query terms scored in the cells below
docs = [doc1_tok, doc2_tok, doc3_tok]
words = ["guerra", "bombas", "casa"]
C, W = len(docs), len(words)  # number of documents, number of query words

In [7]:
# number of occurrences: one row per document, one column per query word
words_freq = []
for tokens in docs:
    words_freq.append([tokens.count(term) for term in words])
words_freq

[[1, 0, 0], [1, 1, 0], [0, 1, 1]]

In [8]:
# max occurrences: per-document normaliser for the term-frequency matrix.
# Each entry is the largest count of any query word in that document, floored
# at 1 so a document containing none of the words does not cause a division
# by zero when the counts are normalised. (The previous min(..., 1) capped the
# value at 1 and let it drop to 0 in exactly that case.)
max_words_freq = [max(max(words_freq[i], default=0), 1) for i in range(C)]
max_words_freq

[1, 1, 1]

In [9]:
# frequency matrix: one row per query word, one column per document; every
# raw count is divided by that document's entry in max_words_freq
tf = []
for term in words:
    tf.append([tokens.count(term) / norm for tokens, norm in zip(docs, max_words_freq)])
tf_header = "Frequency matrix\n doc1 \t doc2 \t doc3 \n"
tf_rows = '\n'.join('\t'.join(map(str, row)) for row in tf)
print(tf_header + tf_rows)

Frequency matrix
 doc1 	 doc2 	 doc3 
1.0	1.0	0.0
0.0	1.0	1.0
0.0	0.0	1.0


In [10]:
# inverse document frequency
# words_freq is rebuilt word-major here: words_freq[j][i] is the number of
# times words[j] occurs in docs[i].
words_freq = [[docs[i].count(words[j]) for i in range(C)] for j in range(W)]
# Document frequency: how many documents contain the word at least once.
# (Summing the raw counts would over-count a word repeated inside one document.)
docs_counter = [sum(1 for n in words_freq[j] if n > 0) for j in range(W)]
# A word that appears in no document gets idf 0.0 instead of raising
# ZeroDivisionError.
idf = [log(C/docs_counter[j]) if docs_counter[j] else 0.0 for j in range(W)]
idf

[0.4054651081081644, 0.4054651081081644, 1.0986122886681098]

In [11]:
# report the idf value next to its word, right-aligned in fixed-width columns
print("Inverse document frequency (IDF) of: ")
for term, weight in zip(words, idf):
    print(f'{term:>15}: {weight:>10}' )

Inverse document frequency (IDF) of: 
         guerra: 0.4054651081081644
         bombas: 0.4054651081081644
           casa: 1.0986122886681098


In [12]:
# calculate tf-idf matrix
# One row per query word (W rows) and one column per document (C columns),
# matching the layout of `tf`. Building it directly avoids preallocating a
# C x W grid that was then indexed as W x C, which only worked when W == C.
tf_idf = [[tf[i][j] * idf[i] for j in range(C)] for i in range(W)]
# print matrix
print("TF-IDF Matrix\n doc1 \t\t doc2 \t\t doc3 \n"+'\n'.join(['\t\t'.join([str(round(cell,5)) for cell in row]) for row in tf_idf]))

TF-IDF Matrix
 doc1 		 doc2 		 doc3 
0.40547		0.40547		0.0
0.0		0.40547		0.40547
0.0		0.0		1.09861


In [13]:
# occurrences matrix for a new word set
# NOTE: `words` and `W` are rebound here, so `tf`, `idf` and `tf_idf` computed
# in earlier cells still refer to the previous word list.
words = ["guerra","bombas","tercera"]
W = len(words)
occ_counts = []
for term in words:
    occ_counts.append([tokens.count(term) for tokens in docs])
# prepend the word as a label column; the combined array holds strings
occ_matrix = np.c_[words, np.array(occ_counts)]
occ_header = "Occurrences matrix\n\t\t doc1 \t\t doc2 \t\t doc3 \n"
print(occ_header + '\n'.join('\t\t'.join(map(str, row)) for row in occ_matrix))

Occurrences matrix
		 doc1 		 doc2 		 doc3 
guerra		1		1		0
bombas		0		1		1
tercera		1		0		0


In [14]:
# boolean matrix for a words set
# bool_matrix[j][i] is True when words[j] occurs in docs[i].
bool_matrix = np.array([[words[j] in docs[i] for i in range(C)] for j in range(W)], dtype=bool)
# Keep bool_matrix purely boolean. Prepending the word labels with np.c_ turns
# every cell into a string and shifts the document columns by one, so the
# labelled version is stored under a separate name used only for display.
bool_table = np.c_[words,bool_matrix]
print("Boolean matrix\n\t\t doc1 \t\t doc2 \t\t doc3 \n"+'\n'.join(['\t\t'.join([str(cell) for cell in row]) for row in bool_table]))

Boolean matrix
		 doc1 		 doc2 		 doc3 
guerra		True		True		False
bombas		False		True		True
tercera		True		False		False


In [15]:
print("Documents with all three words in set:\n")
for i in range(C):
    # A document qualifies only if every word of the set occurs in it.
    # Membership is tested on the token lists directly, so the check does not
    # depend on how bool_matrix was formatted for display. (The earlier
    # accumulator started at False, so the `and` chain could never become True.)
    contains_words = all(docs[i].count(words[j]) > 0 for j in range(W))
    if contains_words:
        print("Document ",i," contains all words")

Documents with all three words in set:

