### 3. Generación de Lenguaje Natural
Implementación de un modelo de generación de lenguaje utilizando alguno de los datasets de Twitter propuestos.

El objetivo es simular la creación de un bot que escriba tweets de manera similar a la del autor utilizado para el entrenamiento.

Se valorará principalmente la argumentación de los pasos realizados y las conclusiones al analizar los resultados y limitaciones encontradas.

In [55]:
# Import everything we are going to need

import numpy as np
import pandas as pd
import re
from copy import deepcopy

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

### Cargamos los datos

In [2]:
# Load the tweet dataset from its CSV file into a DataFrame
data = pd.read_csv('./datasets/KimKardashianTweets.csv')

In [3]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0.1,Unnamed: 0,date,id,link,retweet,text,author
0,0,Oct 2,782694393216110592,/KimKardashian/status/782694393216110592,False,Hey guys! Meet my #KimKardashianGame character...,KimKardashian
1,1,Oct 2,782632831780618240,/KimKardashian/status/782632831780618240,False,This guy is always in my shot!pic.twitter.com/...,KimKardashian
2,2,Oct 2,782559825205071872,/KimKardashian/status/782559825205071872,False,Balenciaga. No make up today.pic.twitter.com/l...,KimKardashian
3,3,Oct 1,782260907900567552,/KimKardashian/status/782260907900567552,False,Kourtney and Kim Take Parispic.twitter.com/VXw...,KimKardashian
4,4,Oct 1,782259209459101696,/KimKardashian/status/782259209459101697,False,GIVENCHY COUTUREpic.twitter.com/jHxTIuHzzo,KimKardashian


In [4]:
# Keep only the columns that are relevant for text generation
data = data.loc[:, ['retweet', 'text', 'author']]

In [5]:
# Preview the data again after the column selection
data.head()

Unnamed: 0,retweet,text,author
0,False,Hey guys! Meet my #KimKardashianGame character...,KimKardashian
1,False,This guy is always in my shot!pic.twitter.com/...,KimKardashian
2,False,Balenciaga. No make up today.pic.twitter.com/l...,KimKardashian
3,False,Kourtney and Kim Take Parispic.twitter.com/VXw...,KimKardashian
4,False,GIVENCHY COUTUREpic.twitter.com/jHxTIuHzzo,KimKardashian


### Preprocesado

In [7]:
# Regular expressions used to strip the following tokens from each tweet:
#  - Users / mentions
#  - Hashtags
#  - URLs
#  - Images (pic.twitter.com links)
#
# The patterns are raw strings so that `\w`, `\(` and `\)` reach the regex
# engine untouched instead of being parsed as invalid string escape sequences.

user_regex = re.compile(r'@\w+| @\w+ ')
hashtag_regex = re.compile(r'#\w+| #\w+ ')
url_regex = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# The dots are escaped and the "c" is mandatory: the previous pattern
# ('pic?.' followed by wildcards) also matched ordinary text such as "pics w/o".
pics_regex = re.compile(r'pic\.twitter\.com/\w+')

In [45]:
# Everything that gets stripped out of the tweets, collected per category so
# the removed tokens can be inspected afterwards.
users = []
hashtags = []
urls = []
pics = []


# Cleaned tweet bodies, in the same order as data['text'].
texts = []
for tweet in data['text']:

    # Record the tokens before removing them (previously these matches were
    # computed and then thrown away, leaving the four lists empty).
    users.extend(match.strip() for match in user_regex.findall(tweet))
    hashtags.extend(match.strip() for match in hashtag_regex.findall(tweet))
    urls.extend(url_regex.findall(tweet))
    pics.extend(pics_regex.findall(tweet))

    # Image links and URLs are deleted outright; mentions and hashtags are
    # replaced by a single space so the neighbouring words stay separated.
    cleaned = pics_regex.sub('', tweet)
    cleaned = url_regex.sub('', cleaned)
    cleaned = user_regex.sub(' ', cleaned)
    cleaned = hashtag_regex.sub(' ', cleaned)

    texts.append(cleaned.strip())

In [46]:
# Check the result of the preprocessing: number of cleaned tweets
len(texts)

10688

In [47]:
# Merge all cleaned tweets into one single space-separated string (the corpus)
text = ' '.join(texts)

In [48]:
# Build the character vocabulary of the corpus and the two lookup tables
# (index -> character and character -> index)
characters = sorted(set(text))
n_to_char = dict(enumerate(characters))
char_to_n = {char: n for n, char in n_to_char.items()}

print('Unique chars: {}'.format(len(characters)))

Unique chars: 123


In [49]:
# Show the full vocabulary to spot unexpected characters
print(characters)

['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', '\xad', '´', '¹', 'É', 'á', 'ã', 'é', 'ô', '\u200b', '\u200e', '\u200f', '–', '—', '‘', '’', '“', '”', '•', '…', '™', '♡', '♫', '\ue411', '\ue6b7', '\ue6b9', '\ue6e2', '\ue6e3', '\ue6eb']


In [52]:
def text_conversion(text, ctn_dict, window=5, step=1):
    """Turn a string into (input window, next character) training pairs.

    A window of ``window`` characters slides over ``text`` in increments of
    ``step``; each window is paired with the character that immediately
    follows it. Characters are encoded through ``ctn_dict``.

    Args:
        text: The corpus to slice.
        ctn_dict: Mapping from character to integer code.
        window: Number of characters in each input sequence.
        step: Distance between the starts of consecutive windows.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is a list of encoded windows (lists of
        ints) and ``Y`` is the list of encoded next characters. Both are empty
        when ``text`` is not longer than ``window``.

    Raises:
        KeyError: If a character of ``text`` is missing from ``ctn_dict``.
    """
    # Encode window and label together so lookups happen in reading order.
    pairs = [
        (
            [ctn_dict[ch] for ch in text[start:start + window]],
            ctn_dict[text[start + window]],
        )
        for start in range(0, len(text) - window, step)
    ]

    X = [encoded_window for encoded_window, _ in pairs]
    Y = [encoded_label for _, encoded_label in pairs]
    return X, Y

In [54]:
# Encode the whole corpus as 5-character windows (stride 1) with their next-character labels
X_train, y_train = text_conversion(text, char_to_n, window=5, step=1)

In [56]:
# Hold out 10% of the windows for testing. A fixed random_state makes the
# shuffle, and therefore every downstream result, reproducible.
# NOTE(review): this cell overwrites X_train / y_train, so re-running it without
# re-running the previous cell splits the already reduced training set again.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [66]:
# Inspect the first encoded training window
X_train[0]

[64, 77, 2, 86, 71]

In [62]:
# Add a trailing axis of size 1 so the array is 3-D: (samples, window, 1).
# The window length is taken from the data instead of being hard-coded to 5,
# so changing the window used to build X_train no longer breaks this cell.
X_modified = np.expand_dims(np.asarray(X_train), axis=-1)


In [63]:
# Show the first sample after the reshape
print(X_modified[0])

[[64]
 [77]
 [ 2]
 [86]
 [71]]


In [74]:
# NOTE(review): exploratory inspection only; the result is displayed, not stored.
# It regroups the values into rows of len(X_modified) elements and shows the first row.
X_modified.reshape(-1,len(X_modified))[0]

array([0.5203252 , 0.62601626, 0.01626016, ..., 0.01626016, 0.56097561,
       0.64227642])

In [64]:
# Scale the integer character codes into [0, 1) by dividing by the vocabulary size.
# The scaled array is rebuilt from X_train rather than from X_modified itself,
# so re-running this cell cannot divide the data a second time.
X_modified = np.expand_dims(np.asarray(X_train), axis=-1) / float(len(characters))


In [65]:
# Show the first sample after normalisation
print(X_modified[0])

[[0.5203252 ]
 [0.62601626]
 [0.01626016]
 [0.69918699]
 [0.57723577]]


In [None]:
# One-hot encode the next-character labels. num_classes is pinned to the
# vocabulary size: when it is inferred from the data, a character that never
# appears as a label in this split would shrink the output dimension.
Y_modified = np_utils.to_categorical(y_train, num_classes=len(characters))

In [75]:
# Inspect the one-hot vector of the first label
Y_modified[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.])

In [None]:
# Check the final shape of the model input.
# NOTE(review): the original cell held the truncated name `X_modi`, which raises
# NameError on a full run; a shape check is assumed to be the intent — confirm.
X_modified.shape

In [36]:
# Model 4: two stacked 700-unit LSTM layers, each followed by 20% dropout,
# and a softmax layer with one output per one-hot label column.
model_4 = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(700, input_shape=X_modified.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(700),
    Dropout(0.2),
    Dense(Y_modified.shape[1], activation='softmax'),
])
model_4.compile(optimizer='adam', loss='categorical_crossentropy')

model_4.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 700)          1965600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 700)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 700)               3922800   
_________________________________________________________________
dropout_2 (Dropout)          (None, 700)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 82)                57482     
Total params: 5,945,882
Trainable params: 5,945,882
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 20 epochs in batches of 128, with 5% of the samples used for
# validation; only the history dict of the run is kept.
history = model_4.fit(
    X_modified,
    Y_modified,
    epochs=20,
    batch_size=128,
    shuffle=True,
    validation_split=0.05,
).history