# Contextualización Ciencia de datos e inteligencia artificial.

Autor              : Gilbert L. Bothia
#### Material Referencia: Ver bibliografía de presentación.

## Procesamiento en lenguage natural (NPL)


In [3]:
import nltk
import re

In [4]:
# Expresiones regulares

# Link consulta https://docs.python.org/3/library/re.html
# https://www.nltk.org/api/nltk.corpus.reader.html


corpus = nltk.corpus.cess_esp.sents() 
print(corpus)
print(len(corpus))  # no te que es una lista anidada

[['El', 'grupo', 'estatal', 'Electricité_de_France', '-Fpa-', 'EDF', '-Fpt-', 'anunció', 'hoy', ',', 'jueves', ',', 'la', 'compra', 'del', '51_por_ciento', 'de', 'la', 'empresa', 'mexicana', 'Electricidad_Águila_de_Altamira', '-Fpa-', 'EAA', '-Fpt-', ',', 'creada', 'por', 'el', 'japonés', 'Mitsubishi_Corporation', 'para', 'poner_en_marcha', 'una', 'central', 'de', 'gas', 'de', '495', 'megavatios', '.'], ['Una', 'portavoz', 'de', 'EDF', 'explicó', 'a', 'EFE', 'que', 'el', 'proyecto', 'para', 'la', 'construcción', 'de', 'Altamira_2', ',', 'al', 'norte', 'de', 'Tampico', ',', 'prevé', 'la', 'utilización', 'de', 'gas', 'natural', 'como', 'combustible', 'principal', 'en', 'una', 'central', 'de', 'ciclo', 'combinado', 'que', 'debe', 'empezar', 'a', 'funcionar', 'en', 'mayo_del_2002', '.'], ...]
6030


In [5]:
flatten = [k for l in corpus for k in l]
print(flatten[:20])
print(len(flatten))

['El', 'grupo', 'estatal', 'Electricité_de_France', '-Fpa-', 'EDF', '-Fpt-', 'anunció', 'hoy', ',', 'jueves', ',', 'la', 'compra', 'del', '51_por_ciento', 'de', 'la', 'empresa', 'mexicana']
192686


In [6]:
# función re.search(), determina conincidencia de patrón en cadena de strings
arr = [k for k in flatten if re.search('ru', k)]
arr


['grupo',
 'construcción',
 'construcción',
 'construir',
 'grupo',
 'grupo',
 'grupo',
 'Grupo_Mundial',
 'escrutado',
 'La_Hiruela',
 'pruebas',
 'pruebas',
 'rusa',
 'rueda',
 'ruso',
 'rusas',
 'ruso',
 'noruegos',
 'rusa',
 'rusa',
 'noruega',
 'rusas',
 'noruega',
 'rusa',
 'Grupo_de_los_Tres',
 'corrupción',
 'estructuras',
 'corrupción',
 'estructurales',
 'destruir',
 'ruta',
 'destruir',
 'rutas',
 'anticorrupción',
 'prueba',
 'grupos',
 'La_Coruña',
 'Anna_Birulés',
 'fruto',
 'reestructuración',
 'reestructuración',
 'rumanos',
 'rumanos',
 'pruebas',
 'grupos',
 'rumores',
 'rumano',
 'rublo-dólar',
 'Anna_Birulés',
 'Anna_Birulés',
 'Birulés',
 'Bruselas',
 'grupo',
 'crudo',
 'crudo',
 'crudo',
 'prueba',
 'prueba',
 'grupo',
 'Grupo',
 'Grupo',
 'abrumadora',
 'infraestructuras',
 'uruguayo',
 'Uruguay',
 'infraestructura',
 'uruguayo',
 'rueda',
 'rueda',
 'ruedas',
 'abrupta',
 'infraestructuras',
 'construidas',
 'Cruz_Roja',
 'Federación_Internacional_de_la_Cruz_Ro

In [7]:
# Rangos []
arr = [k for k in flatten if re.search('^[ghi][mno][jlk][def]$', k)]
arr

['golf', 'golf']

In [8]:
# Tokenización
txtShort = "Si he logrado   ver más lejos, ha sido porque he subido a hombros de gigantes, Newton (price : $15.50 )"
print(re.split(r' ', txtShort))
print(re.split(r'[ \t\n]+', txtShort))
print(re.split(r'[\w]+', txtShort))




['Si', 'he', 'logrado', '', '', 'ver', 'más', 'lejos,', 'ha', 'sido', 'porque', 'he', 'subido', 'a', 'hombros', 'de', 'gigantes,', 'Newton', '(price', ':', '$15.50', ')']
['Si', 'he', 'logrado', 'ver', 'más', 'lejos,', 'ha', 'sido', 'porque', 'he', 'subido', 'a', 'hombros', 'de', 'gigantes,', 'Newton', '(price', ':', '$15.50', ')']
['', ' ', ' ', '   ', ' ', ' ', ', ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ', ', ' (', ' : $', '.', ' )']


In [9]:
pattern = r'''(?x)                 # set flag to allow verbose regexps
              (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
              | \w+(?:-\w+)*       # words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
              | \.\.\.             # ellipsis
              | [][.,;"'?():-_`]   # these are separate tokens; includes ], [
'''
nltk.regexp_tokenize(txtShort, pattern)

['Si',
 'he',
 'logrado',
 'ver',
 'más',
 'lejos',
 ',',
 'ha',
 'sido',
 'porque',
 'he',
 'subido',
 'a',
 'hombros',
 'de',
 'gigantes',
 ',',
 'Newton',
 '(',
 'price',
 ':',
 '$15.50',
 ')']

In [10]:
# Lematización (raices)
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer #SnowballStemmer.languages


In [11]:
stem = SnowballStemmer('english')
print(stem.stem('worked'))
stem = SnowballStemmer('spanish')
print(stem.stem('Trabajando'))

work
trabaj


In [12]:
# Vocabulario
vocabulario = sorted(set(flatten))
print(vocabulario)

# metrica de texto: riqueza léxica:
rl = len(set(flatten))/len(flatten)
print(rl)

0.13215282895487995


In [13]:
# metrica de texto: frecuencia por palabra 
fdist = nltk.FreqDist(flatten)
fdist.most_common(20)

[(',', 11420),
 ('de', 10234),
 ('la', 6412),
 ('.', 5866),
 ('que', 5552),
 ('el', 5199),
 ('en', 4340),
 ('y', 4235),
 ('*0*', 3883),
 ('"', 3038),
 ('los', 2963),
 ('a', 2953),
 ('del', 2257),
 ('se', 1884),
 ('las', 1832),
 ('un', 1815),
 ('con', 1494),
 ('por', 1456),
 ('una', 1396),
 ('su', 1291)]

## Redes para NPL


In [35]:
import tensorflow as tf
import numpy as np
import pandas as pd    
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

In [42]:
# Valoración de mensajes "English Twitter", clasificados en 6 emociones: anger, fear, joy, love, sadness, and surprise
# Fuente : https://huggingface.co/datasets/emotion.

dataset = pd.read_json(path_or_buf='C:/Users/glbot/Downloads/data/data.jsonl', lines=True)
train = pd.read_json(path_or_buf='C:/Users/glbot/Downloads/train/train.jsonl', lines=True)
val = pd.read_json(path_or_buf='C:/Users/glbot/Downloads/validation/validation.jsonl', lines=True)
test = pd.read_json(path_or_buf='C:/Users/glbot/Downloads/test/test.jsonl', lines=True)


In [43]:
val

Unnamed: 0,text,label
0,im feeling quite sad and sorry for myself but ...,0
1,i feel like i am still looking at a blank canv...,0
2,i feel like a faithful servant,2
3,i am just feeling cranky and blue,3
4,i can have for a treat or if i am feeling festive,1
...,...,...
1995,im having ssa examination tomorrow in the morn...,0
1996,i constantly worry about their fight against n...,1
1997,i feel its important to share this info for th...,1
1998,i truly feel that if you are passionate enough...,1


In [44]:
def get_tweet(data):
    tweets =data['text']
    labels = data['label'] 
    return tweets, labels
tweets, labels = get_tweet(train)


In [45]:
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')   # palabras fuera de vocabulario se reemplazan por oov_token https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
tokenizer.fit_on_texts(tweets)
tokenizer.word_index

{'<UNK>': 1,
 'i': 2,
 'feel': 3,
 'and': 4,
 'to': 5,
 'the': 6,
 'a': 7,
 'feeling': 8,
 'that': 9,
 'of': 10,
 'my': 11,
 'in': 12,
 'it': 13,
 'like': 14,
 'so': 15,
 'for': 16,
 'im': 17,
 'me': 18,
 'but': 19,
 'was': 20,
 'have': 21,
 'is': 22,
 'this': 23,
 'am': 24,
 'with': 25,
 'not': 26,
 'about': 27,
 'be': 28,
 'as': 29,
 'on': 30,
 'you': 31,
 'just': 32,
 'at': 33,
 'when': 34,
 'or': 35,
 'all': 36,
 'because': 37,
 'more': 38,
 'do': 39,
 'can': 40,
 'really': 41,
 'up': 42,
 't': 43,
 'are': 44,
 'by': 45,
 'very': 46,
 'know': 47,
 'been': 48,
 'if': 49,
 'out': 50,
 'myself': 51,
 'time': 52,
 'how': 53,
 'what': 54,
 'get': 55,
 'little': 56,
 'had': 57,
 'now': 58,
 'will': 59,
 'from': 60,
 'being': 61,
 'they': 62,
 'people': 63,
 'them': 64,
 'would': 65,
 'he': 66,
 'want': 67,
 'her': 68,
 'some': 69,
 'think': 70,
 'one': 71,
 'still': 72,
 'ive': 73,
 'him': 74,
 'even': 75,
 'who': 76,
 'an': 77,
 'life': 78,
 'its': 79,
 'make': 80,
 'there': 81,
 'we': 

In [49]:
print(tweets[2] )
print(tokenizer.texts_to_sequences(tweets[2]))

im grabbing a minute to post i feel greedy wrong
[[2], [93], [], [2952], [1794], [7], [1166], [1166], [2], [1726], [2952], [], [7], [], [93], [2], [1726], [966], [43], [1550], [], [43], [2229], [], [1871], [2229], [90], [43], [], [2], [], [1997], [1550], [1550], [8884], [], [2952], [1794], [1550], [1550], [669], [2607], [], [2008], [1794], [2229], [1726], [2952]]


In [50]:
maxlen=50
def get_sequences(tokenizer, tweets):
    sequences = tokenizer.texts_to_sequences(tweets)  #  toma el texto y lo presenta como una secuencia (construye el vocabulario)
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=maxlen)   # matrix 2D de enteros con la secuencia indicada (#muestras vocabulario x maxlen columnas https://www.tensorflow.org/api_docs/python/tf/keras/utils/pad_sequences
    return padded

In [53]:
padded_train_seq = get_sequences(tokenizer, tweets)
padded_train_seq.shape



(16000, 50)

In [58]:
classes = set(labels)
class_to_index = dict((c,i) for i, c in enumerate(classes))
index_to_class = dict((v,k) for k, v in class_to_index.items())
names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])
train_labels = names_to_ids(labels)
print(classes)

{0, 1, 2, 3, 4, 5}


In [62]:
index_to_class


{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}

In [22]:
model = tf.keras.models.Sequential([
tf.keras.layers.Embedding(10000,16,input_length=maxlen),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
tf.keras.layers.Dense(6, activation='softmax')
])
model.compile(
     loss='sparse_categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)





In [23]:
val_tweets, val_labels = get_tweet(val)
val_seq = get_sequences(tokenizer, val_tweets)
val_labels= names_to_ids(val_labels)
h = model.fit(
     padded_train_seq, train_labels,
     validation_data=(val_seq, val_labels),
     epochs=2,
     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
)

Epoch 1/2


Epoch 2/2


In [24]:
test_tweets, test_labels=get_tweet(test)
test_seq = get_sequences(tokenizer, test_tweets)
test_labels=names_to_ids(test_labels)
model.evaluate(test_seq, test_labels)   # https://www.tensorflow.org/api_docs/python/tf/keras/Model#evaluate



[0.5595746040344238, 0.796999990940094]

In [29]:
i = random.randint(0,len(test_labels)-1)
print('Sentence:', test_tweets[i])
print('Emotion:', index_to_class[test_labels[i]])
p = model.predict(np.expand_dims(test_seq[i], axis=0))[0]
print(test_seq[i])
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Predicted Emotion: ', pred_class)

Sentence: ive had a few rough days since then and in the midst of crying and dealing and feeling just so defeated and emotional i put my coat on and curled up and created this safety nest inside my coat
Emotion: 0
[  73   57    7  192 3355  149  226  125    4   12    6 3054   10  944
    4 1509    4    8   32   15  694    4  385    2  248   11 2221   30
    4 3748   42    4 1301   23 1762 6160  318   11 2221    0    0    0
    0    0    0    0    0    0    0    0]
Predicted Emotion:  0


In [30]:
sentence = 'i am not sure what to do'
sequence = tokenizer.texts_to_sequences([sentence])
paddedSequence = pad_sequences(sequence, truncating = 'post', padding='post', maxlen=maxlen)
p = model.predict(np.expand_dims(paddedSequence[0], axis=0))[0]
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Sentence:', sentence)
print('Predicted Emotion: ', pred_class)

Sentence: i am not sure what to do
Predicted Emotion:  1
