# Fase 1: Importar las dependencias

In [2]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
try:
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds
    

# Fase 2: Pre Procesado de Datos

In [8]:
cols = ["sentiment", "id", "date", "query", "user", "text"]

train_data = pd.read_csv(
    'data/train.csv'
    , header = None
    , names = cols
    , engine = "python"
    , encoding = "latin1"
)

test_data = pd.read_csv(
    'data/test.csv'
    , header = None
    , names = cols
    , engine = "python"
    , encoding = "latin1"
)


In [9]:
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
data = train_data

In [12]:
data.drop(["id", "date", "query", "user"]
         , axis = 1
         , inplace = True)

In [15]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Eliminamos la @ y su mención
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Eliminamos los links de las URL
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Nos quedamos solamente con los caracteres
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Eliminamos espacios en blanco adicionales
    tweet = re.sub(r" +", ' ', tweet)
    
    return tweet

In [16]:
data_clean = [clean_tweet(tweet) for tweet in data.text]



In [19]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [25]:
data.text = data_clean

In [26]:
data.head()

Unnamed: 0,sentiment,text
0,0,Awww that's a bummer. You shoulda got David C...
1,0,is upset that he can't update his Facebook by ...
2,0,I dived many times for the ball. Managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,no it's not behaving at all. i'm mad. why am ...


In [30]:
# Va a crear el arreglo de tokens (palabras) y les va a asignar un valor
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)

# Se iteran las sentencias (tweets) y se tokenizan a arreglos para que la NN lo entienda
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [31]:
# Se obtiene el tweet con la mayor longitud de palabras
MAX_LEN = max([len(sentence) for sentence in data_inputs])
# Se hace el padding rellenando todas las frases con la máxima longitud
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs
                                                           , value = 0
                                                           , padding = "post"
                                                           , maxlen = MAX_LEN)

In [34]:
# Los primeros 800k son negativos y los segundos son negativos
# Se obtienen de forma aleatoria primero los 800k y después
# Se le suman 800k para tener los de la segunda parte
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000))

In [36]:
# Se separan los conjuntos de datos, en labels no se ocupa axis para eliminar porque es un arreglo :)
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis = 0)
train_labels = np.delete(data_labels, test_idx)

# Fase 3: Contrucción del modelo

In [57]:
class DCNN(tf.keras.Model):
    
    def __init__(self
                , vocab_size
                , emb_dim = 128
                , nb_filters = 50
                , FFN_units = 512
                , nb_classes = 2
                , dropout_rate = 0.1
                , training = False
                , name = "dcnn"):
        
        super(DCNN, self).__init__(name = name)
        
        self.embedding = layers.Embedding(vocab_size
                                         , emb_dim)
        
        self.bigram = layers.Conv1D(filters = nb_filters
                                   , kernel_size = 2
                                   , padding = "valid"
                                   , activation = "relu")
        self.trigram = layers.Conv1D(filters = nb_filters
                                    , kernel_size = 3
                                    , padding = "valid"
                                    , activation = "relu")
        self.fourgram = layers.Conv1D(filters = nb_filters
                                    , kernel_size = 4
                                    , padding = "valid"
                                    , activation = "relu")
        
        self.pool = layers.GlobalMaxPooling1D() #No tenemos variable de entrenamiento
                                                #así que podemos usar la misma capa
                                                #para cada paso de pooling
        self.dense_1 = layers.Dense(units = FFN_units, activation = "relu")
        
        self.dropout = layers.Dropout(rate = dropout_rate)
        
        if nb_classes == 2:
            self.last_dense = layers.Dense(units = 1
                                          , activation = "sigmoid")
        else:
            self.last_dense = layers.Dense(units = nb_classes
                                          , activation = "softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis = -1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output
        
        

In [62]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 5
FFN_UNITS = 256
NB_CLASSES = 2 #len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 1

In [58]:
Dcnn = DCNN(vocab_size = VOCAB_SIZE
           , emb_dim = EMB_DIM
           , nb_filters = NB_FILTERS
           , FFN_units = FFN_UNITS
           , nb_classes = NB_CLASSES
           , dropout_rate = DROPOUT_RATE)

In [60]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy"
                , optimizer = "adam"
                , metrics = ["accuracy"])
else:
    Dcnn.compile(loss = "sparse_categorical_crossentropy"
                , optimizer = "adam"
                , metrics = ["sparse_categorical_accuracy"])



In [46]:
checkpoint_path = "ckpt/"

ckpt = tf.train.Checkpoint(Dcnn = Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Ultimo checkpoint restaurado!!!")

In [65]:
Dcnn.fit(train_inputs[:1000]
        , train_labels[:1000]
        , batch_size = BATCH_SIZE
        , epochs = NB_EPOCHS)
ckpt_manager.save()



'ckpt/ckpt-1'

# Fase 4: Aplicación

In [69]:
Dcnn(np.array([tokenizer.encode(" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D'")]), training = False).numpy()

array([[2.2146038e-13]], dtype=float32)

In [68]:
data_clean[:1]

[" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D"]