In [2]:
import re
import time
import pandas as pd
import numpy as np
import nltk
from nltk import *
import json

import random
random.seed(10)

from joblib import dump, load

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score

import tensorflow as tf


# Ejemplo con Stack de RNNs
from tensorflow.keras.layers import Embedding, SimpleRNN
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.losses import SparseCategoricalCrossentropy

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score #son metricas - area debajo de la curva roc

from sklearn.model_selection import train_test_split


### Lectura del archivo de tweets ingestados y etiquetados

In [3]:
corpus_esA = pd.read_csv('tweets_search_etiquetas.csv',delimiter='\t',encoding='latin-1', dtype = {'id_str': str})
corpus_esA["id_str"] = corpus_esA["id_str"].astype(str).replace(to_replace="[.]", value = "", regex=True)
corpus_esA["id_str"] = corpus_esA["id_str"].astype(str).replace(to_replace="E\+\d{3}", value = "", regex=True)

corpus_esA.head(10)
print(corpus_esA.count())

id_str              361
created_at          361
screen_name         361
text                361
user_followers      361
sentiment           361
funcionality        361
client_attention    361
dtype: int64


### Limpieza en los datos
* Cambiar todas las palabras de mayúsculas a minúsculas
* Se han eliminado las '@' de @USUARIO con el fin de facilitar el etiquetado morfológico
* Quitar los links 
* Quitar los emojis
* Eliminar las stopwords
* Se han reemplazado todos los números por el símbolo '0'
* Quitar los signos de puntuación y quitar espacios (tabuladores, etc)


In [4]:
pattern_URL="(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"

def procesar(file, namefile):    
    file[file.columns[3]] = [clean_text(i) for i in file[file.columns[3]]]    

    sentiment_code_dict = {"000":0, "010":1, "001":2, "110":3, "101":4, "011": 5} 
    file["sentiment_code"] = file["sentiment"].map(str)  + file["funcionality"].map(str)  + file["client_attention"].map(str) 
    file["sentiment_code"] = file["sentiment_code"].apply(lambda x: sentiment_code_dict[x])

    file.to_csv(namefile, sep=';', encoding='latin-1', index=False)
    return file
    
def clean_text(text):
    text = text.lower()   
    #text=re.sub("@([A-Za-z0-9_]{1,15})", "@USUARIO", text)
    text=re.sub("@([A-Za-z0-9_]{1,15})", "", text)
    text=re.sub(pattern_URL, "", text)
    
    text= remove_emoji(text)
    text = re.sub("(\d+)|(rt)|(RT)", "", text)
    text = re.sub("#\w+", "", text)
    text= remove_stopwords(text)
    
    # text=re.sub("\d+", " ", text)
    
    text=re.sub(r" +", " ", re.sub(r"\t", " ", re.sub(r"\n+", "\n", re.sub('(?:[.,\/!$%?¿?!¡\^&\*;:{}=><\-_`~()”“"\'\|])', " ",text))))
    text = text.strip()
    return text

def remove_stopwords(text):    
    nltk.download('stopwords')
    stopwords=set(nltk.corpus.stopwords.words("spanish"))
    for i in stopwords:
        text = re.sub(r"\b%s\b" % i, " ", text)
    return text

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # emoticons
                               "\U0001F300-\U0001F5FF"  # symbols & pictographs                               
                               "\U0001F680-\U0001F6FF"  # transport & map symbols
                               "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "\U00002702-\U000027B0"
                               "\U000024C2-\U0001F251"
                               "\U0001f926-\U0001f937"
                               "\u200d"
                               "\u2640-\u2642"
                               "\U0001F1F2-\U0001F1F4"  # Macau flag
                               "\U0001F1E6-\U0001F1FF"  # flags
                               "\U0001F600-\U0001F64F"
                               "\U0001F1F2"
                               "\U0001F1F4"
                               "\U0001F620"
                               "]+", flags=re.UNICODE)   
    text = emoji_pattern.sub(r'', text) # no emoji

    return text

In [5]:
corpus_esA = procesar(corpus_esA, "tweets_search_etiquetas_clean.csv")
corpus_esA.tail()
print(corpus_esA.count())
#corpus_esA = pd.read_csv("tweets_search_etiquetas_clean.csv", sep = ";", encoding = "latin-1")
#corpus_esA.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopword

id_str              361
created_at          361
screen_name         361
text                361
user_followers      361
sentiment           361
funcionality        361
client_attention    361
sentiment_code      361
dtype: int64


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\histe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopword

In [6]:
train_idA = corpus_esA[corpus_esA.columns[0]]
X_textA = corpus_esA[corpus_esA.columns[3]].fillna(' ')
#y_hsA = corpus_esA[[corpus_esA.columns[5], corpus_esA.columns[6], corpus_esA.columns[7]]]
y_hsA = corpus_esA[corpus_esA.columns[5]]
y_hsA = y_hsA.values



In [7]:
corpus_train_esA

NameError: name 'corpus_train_esA' is not defined

### Particionamiento de los datos en conjunto de entrenamiento y conjunto de pruebas

In [9]:
X_train_textA, X_test_textA, y_train_hsA, y_test_hsA = train_test_split(X_textA, y_hsA, test_size=0.3, shuffle = True, stratify= y_hsA)


In [10]:
X_train_textA
X_test_textA
y_train_hsA
y_test_hsA

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1],
      dtype=int64)

In [42]:
print(y_train_hsA.shape)

(252,)


In [11]:
 # CountVectorizer creates a vector with lenght equal to the n of unique words in corupus. 1 indicates
# that the word is present
cvectorizer = CountVectorizer(
    # lowercase=True,
    #stop_words=[word.decode('utf-8') for word in nltk.corpus.stopwords.words('spanish')],
    #token_pattern=r'\b\w+\b', #selects tokens of 2 or more alphanumeric characters 
    ngram_range = (3,5),#n-grams de palabras n = 1 a n = 3 (unigramas, bigramas y trigramas)
    min_df = 3,#ignorando los términos que tienen una frecuencia de documento estrictamente inferior a 5
).fit(X_train_textA) # Identifica las palabras unicas y las coloca en un vector

X_train_cvectorized = cvectorizer.transform(X_train_textA).toarray() # para cada palabra unica del corpus, coloca 1 si la palabra aparece en el texto
print(X_train_cvectorized.shape)

X_test_cvectorized = cvectorizer.transform(X_test_textA).toarray()
print(X_test_cvectorized.shape)

dump(cvectorizer, 'count_vectorizer.joblib') # Guarda el vectorizer


(252, 32)
(109, 32)


['count_vectorizer.joblib']

In [12]:
print(len(X_train_cvectorized[0]))
print(X_test_cvectorized.shape)

32
(109, 32)


In [13]:
cvectorizer.vocabulary_

{'huawei mate pro': 15,
 'clientes chiste huawei': 3,
 'chiste huawei primero': 0,
 'huawei primero ofrecen': 17,
 'primero ofrecen cosa': 26,
 'ofrecen cosa después': 23,
 'cosa después echan': 7,
 'después echan atrás': 10,
 'echan atrás nota': 12,
 'clientes chiste huawei primero': 4,
 'chiste huawei primero ofrecen': 1,
 'huawei primero ofrecen cosa': 18,
 'primero ofrecen cosa después': 27,
 'ofrecen cosa después echan': 24,
 'cosa después echan atrás': 8,
 'después echan atrás nota': 11,
 'clientes chiste huawei primero ofrecen': 5,
 'chiste huawei primero ofrecen cosa': 2,
 'huawei primero ofrecen cosa después': 19,
 'primero ofrecen cosa después echan': 28,
 'ofrecen cosa después echan atrás': 25,
 'cosa después echan atrás nota': 9,
 'huawei matepad pro': 16,
 'huawei utn firmaron': 20,
 'utn firmaron convenio': 29,
 'firmaron convenio marco': 13,
 'convenio marco colaboración': 6,
 'huawei utn firmaron convenio': 21,
 'utn firmaron convenio marco': 30,
 'firmaron convenio mar

In [22]:
print(X_test_cvectorized)
print(X_test_cvectorized.shape)
print(len(cvectorizer.vocabulary_))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(109, 59)
59


In [14]:
# Save the vocabulary in json format
with open("cvectorizer_vocabulary_.json", mode = "w", encoding = "latin-1", ) as f:
    f.write(json.dumps(cvectorizer.vocabulary_, indent = 4, ensure_ascii = False))



### Network Model

In [15]:
max_features =  len(cvectorizer.vocabulary_) # or X_train_cvectorized.shape[1] #10000  # tamaño del diccionario de palabras comunes
                      # (número de palabras a utilizar)
maxlen = X_test_cvectorized.shape[1]  # 67         # longitud máxima de cada secuencia de entrenamiento
batch_size = 32

In [16]:
# categorical_crossentropy (cce) produces a one-hot array in the output containing the probable match for each category
# when you have only one class per record. This uses softmax, due to this is a multi class problem [[0, 1, 0], ...].
# sparse_categorical_crossentropy (scce) The vector isn't a one hot, because per record you have one element. [1,1,2 ...], 
# pred [[.2,.3,.4], [.2,.3,.4]... ] and then this produce a category index of the most likely matching category.
# binary_crossentropy: For each label we habe a binary estimation. There you have more than one class per record and more than one class. 
# [[1,1,0]] -> [[.2,.7, .2]] or simple binary this loss function uses sigmod, because is a bniary problem

model = Sequential()
# Capa embedding. Convierte los tokens en vectores densos de dimension fija
# input_dim : size of vocavulary (Cada palabra o token se convirtirá en un vector)
# output_dim: dimensión del vector al que se mapea
# input_length. is the length of tthe tokens by record 
# output_dim is the size of the embedding vector for each word in the vocabulary.
#model.add(Embedding(input_dim = max_features,  output_dim=32, input_length = 15)) # output_dim = 32
model.add(Dense(10, activation = "relu", input_shape = (len(cvectorizer.vocabulary_),) ))
#model.add(SimpleRNN(32, return_sequences=True)) # van despues de embeding
#model.add(SimpleRNN(32))

model.add(Dense(1, activation='sigmoid')) # softmax para multicalse. sigmoid to binary classification

model.summary()

#model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

model.compile(optimizer='adam', loss = "categorical_crossentropy", metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:

tic = time.time()


history_stackRNN = model.fit(
    X_train_cvectorized, y_train_hsA,
    epochs=15,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=2
)
print('Tiempo de entrenamiento:', time.time()-tic)

# evaluate the model
scores = model.evaluate(X_test_cvectorized, y_test_hsA, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[0], scores[0]*100))

loss, accuracy = model.evaluate(X_train_cvectorized, y_train_hsA, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy)) # accuracy. Exactitud. total de Positives y Falses que son True sobre el total de todo
loss, accuracy = model.evaluate(X_test_cvectorized, y_test_hsA, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
#plot_history(history)

# make predictions
testPredict_stackRNN = model.predict(X_test_cvectorized)
print('\t', 'Accuracy', accuracy_score(y_test_hsA, testPredict_stackRNN.round()))

Epoch 1/15


  return self.fn(y_true, y_pred, **self._fn_kwargs)


7/7 - 1s - 118ms/step - accuracy: 0.5274 - loss: 0.0000e+00 - val_accuracy: 0.6863 - val_loss: 0.0000e+00
Epoch 2/15
7/7 - 0s - 14ms/step - accuracy: 0.5274 - loss: 0.0000e+00 - val_accuracy: 0.6863 - val_loss: 0.0000e+00
Epoch 3/15
7/7 - 0s - 11ms/step - accuracy: 0.5274 - loss: 0.0000e+00 - val_accuracy: 0.6863 - val_loss: 0.0000e+00
Epoch 4/15
7/7 - 0s - 10ms/step - accuracy: 0.5274 - loss: 0.0000e+00 - val_accuracy: 0.6863 - val_loss: 0.0000e+00
Epoch 5/15
7/7 - 0s - 10ms/step - accuracy: 0.5274 - loss: 0.0000e+00 - val_accuracy: 0.6863 - val_loss: 0.0000e+00
Epoch 6/15
7/7 - 0s - 10ms/step - accuracy: 0.5274 - loss: 0.0000e+00 - val_accuracy: 0.6863 - val_loss: 0.0000e+00
Epoch 7/15
7/7 - 0s - 10ms/step - accuracy: 0.5274 - loss: 0.0000e+00 - val_accuracy: 0.6863 - val_loss: 0.0000e+00
Epoch 8/15
7/7 - 0s - 10ms/step - accuracy: 0.5373 - loss: 0.0000e+00 - val_accuracy: 0.7059 - val_loss: 0.0000e+00
Epoch 9/15
7/7 - 0s - 10ms/step - accuracy: 0.5473 - loss: 0.0000e+00 - val_accura

In [18]:
testPredict_stackRNN

array([[0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.27792835],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.45076093],
       [0.450

In [19]:
model.predict(cvectorizer.transform(["me encanta huawei, es el mejor"]).toarray())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


array([[0.45076093]], dtype=float32)

### Embeding
Espera un vector de enteros donde cada palabra es un numero. Por lo que no puede usarse Countvectorizer que devuelve un arreglo de one hot encoding.

input_dim: Tamaño del vocabulario (número de palabras únicas).
output_dim: Dimensión de los vectores densos que representan las palabras.
input_length: Longitud de las secuencias de entrada.


La salida de la capa Embedding tendrá una forma de (batch_size, maxlen, output_dim), donde:
batch_size: Número de muestras en el lote.
maxlen: Longitud de las secuencias (definida por pad_sequences).
output_dim: Dimensión de los vectores densos (definida por Embedding).

In [25]:
num_words = len(cvectorizer.vocabulary_) # using in tokenizer
input_dim =  len(cvectorizer.vocabulary_) # using in embeding

output_dim = 32
maxlen = 32 # max length of input seqs
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .7

In [26]:
len(cvectorizer.vocabulary_)

32

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=num_words, oov_token=oov_tok)
tokenizer.fit_on_texts(X_textA)

word_index = tokenizer.word_index

def text_to_seq_token_and_padd(x_text_vector: np.ndarray) -> np.ndarray:
    train_sequences = tokenizer.texts_to_sequences(x_text_vector)
    train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding=padding_type, truncating=trunc_type)
    return train_padded

X_train_cvectorized: np.ndarray = text_to_seq_token_and_padd(X_train_textA)
X_test_cvectorized: np.ndarray = text_to_seq_token_and_padd(X_test_textA)

X_train_cvectorized

array([[ 1,  1, 18, ...,  0,  0,  0],
       [ 1,  1,  2, ...,  0,  0,  0],
       [ 1,  2, 30, ...,  0,  0,  0],
       ...,
       [ 1,  1,  2, ...,  0,  0,  0],
       [ 1,  1, 18, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0]])

In [28]:
embedding_dim = batch_size # 50 #batch_size
#maxlen = 15

model = Sequential()
model.add(Embedding(input_dim = input_dim, 
                           output_dim = output_dim, 
                           input_length = maxlen))
model.add(Flatten())
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train_cvectorized, y_train_hsA,
                    epochs=15,
                    verbose=2,
                    validation_data=(X_test_cvectorized, y_test_hsA),
                    batch_size=batch_size)
                    
loss, accuracy = model.evaluate(X_train_cvectorized, y_train_hsA, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_cvectorized, y_test_hsA, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
#plot_history(history)

# make predictions
testPredict_stackRNN = model.predict(X_test_cvectorized)
print('\t', 'Accuracy', accuracy_score(y_test_hsA, testPredict_stackRNN.round()))





Epoch 1/15


  return self.fn(y_true, y_pred, **self._fn_kwargs)


8/8 - 1s - 93ms/step - accuracy: 0.6111 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 2/15
8/8 - 0s - 10ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 3/15
8/8 - 0s - 9ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 4/15
8/8 - 0s - 9ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 5/15
8/8 - 0s - 10ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 6/15
8/8 - 0s - 9ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 7/15
8/8 - 0s - 9ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 8/15
8/8 - 0s - 9ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5872 - val_loss: 0.0000e+00
Epoch 9/15
8/8 - 0s - 9ms/step - accuracy: 0.5913 - loss: 0.0000e+00 - val_accuracy: 0.5

In [29]:
testPredict_stackRNN

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],

In [30]:
model.predict(cvectorizer.transform(["me encanta huawei, es el mejor"]).toarray())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step


array([[0.]], dtype=float32)

In [38]:

lstm_out = 98

model = Sequential()
model.add(Embedding(input_dim = input_dim, 
                           output_dim = output_dim, 
                           input_length = maxlen))
#model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

batch_size = 32
model.fit(X_train_cvectorized, y_train_hsA, epochs = 20, batch_size=batch_size, verbose = 2)

loss, accuracy = model.evaluate(X_train_cvectorized, y_train_hsA, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_cvectorized, y_test_hsA, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
#plot_history(history)

# make predictions
testPredict_stackRNN = model.predict(X_test_cvectorized)
print('\t', 'Accuracy', accuracy_score(y_test_hsA, testPredict_stackRNN.round()))



None
Epoch 1/20
8/8 - 2s - 266ms/step - accuracy: 0.5595 - loss: 0.6857
Epoch 2/20
8/8 - 0s - 13ms/step - accuracy: 0.5913 - loss: 0.6822
Epoch 3/20
8/8 - 0s - 11ms/step - accuracy: 0.5913 - loss: 0.6757
Epoch 4/20
8/8 - 0s - 12ms/step - accuracy: 0.5913 - loss: 0.6763
Epoch 5/20
8/8 - 0s - 13ms/step - accuracy: 0.5913 - loss: 0.6696
Epoch 6/20
8/8 - 0s - 12ms/step - accuracy: 0.5913 - loss: 0.6675
Epoch 7/20
8/8 - 0s - 12ms/step - accuracy: 0.5754 - loss: 0.6702
Epoch 8/20
8/8 - 0s - 13ms/step - accuracy: 0.5952 - loss: 0.6697
Epoch 9/20
8/8 - 0s - 13ms/step - accuracy: 0.5913 - loss: 0.6578
Epoch 10/20
8/8 - 0s - 13ms/step - accuracy: 0.5992 - loss: 0.6608
Epoch 11/20
8/8 - 0s - 13ms/step - accuracy: 0.5754 - loss: 0.6463
Epoch 12/20
8/8 - 0s - 12ms/step - accuracy: 0.6032 - loss: 0.6502
Epoch 13/20
8/8 - 0s - 12ms/step - accuracy: 0.5873 - loss: 0.6411
Epoch 14/20
8/8 - 0s - 12ms/step - accuracy: 0.6151 - loss: 0.6506
Epoch 15/20
8/8 - 0s - 13ms/step - accuracy: 0.6190 - loss: 0.634

In [52]:
for i,v in enumerate(X_test_textA):
    if i == 4:
        break
    else:
        print(v)

compatibilidad huawei apps usar dos apps ver tarjeta checar saldo lentitud extrema ir banco hacer muchas cosas podrían hacerse múltiples apps
hola cualquier huawei lanzado mayo servicios google trae tienda apps google play sugería publicar app huawei appgallery requiere adaptación mínima ht
nuevo huawei mate xs móvil plegable avanzado huawei ligeros
pensé tener huawei estaria limitado verdad lograddo tener mismas funcionalidades android


In [56]:
y_test_hsA

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1],
      dtype=int64)

In [39]:
testPredict_stackRNN

array([[0.09967058],
       [0.11590619],
       [0.48478466],
       [0.4991713 ],
       [0.1126158 ],
       [0.25048202],
       [0.5568909 ],
       [0.5643294 ],
       [0.26827866],
       [0.556471  ],
       [0.56264555],
       [0.54503393],
       [0.10545922],
       [0.54041064],
       [0.5586862 ],
       [0.5295797 ],
       [0.27479953],
       [0.5527955 ],
       [0.55645126],
       [0.4116408 ],
       [0.22384775],
       [0.5356884 ],
       [0.52343124],
       [0.23929194],
       [0.39537987],
       [0.56519043],
       [0.10260776],
       [0.51025045],
       [0.10773961],
       [0.13730226],
       [0.56167483],
       [0.35378549],
       [0.51726866],
       [0.5518445 ],
       [0.10062805],
       [0.49601847],
       [0.27479953],
       [0.12045305],
       [0.52398205],
       [0.12525266],
       [0.13820021],
       [0.45257148],
       [0.555383  ],
       [0.10585988],
       [0.43011633],
       [0.4226626 ],
       [0.5549711 ],
       [0.560

In [54]:
model.predict(cvectorizer.transform(["pensé tener huawei estaria limitado verdad lograddo tener mismas funcionalidades android"]).toarray())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


array([[0.56542754]], dtype=float32)

In [13]:
model.save('best_keras_network_model.h5')

In [11]:
input_list = list(map(lambda x: str(x), y_train_hsA.round()))
c = Counter( input_list )

print( c.items() )

dict_items([('[0 0 0]', 105), ('[0 1 0]', 41), ('[1 1 0]', 45), ('[0 0 1]', 14), ('[1 0 1]', 1)])


In [20]:
corpus_dev_esA["predict_test"] = list(map(lambda x: str(x), testPredict_stackRNN.round()))

corpus_dev_esA.to_csv("tweets_search_etiquetas_clean_test_predict.csv", sep="\t", encoding = "latin-1")

In [21]:
list(map(lambda x: str(x), testPredict_stackRNN.round()))

['[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 0. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0.]',
 '[0. 1. 0

In [7]:
from dash import Dash, html, dcc
import plotly.express as px
from dash.dependencies import Input, Output
import pandas as pd

app = Dash(__name__)

# assume you have a "long-form" data frame
# see https://plotly.com/python/px-arguments/ for more options
"""
df = pd.DataFrame({
    "Ambito": ["funcionamiento", "atencion_cliente", "atencion_cliente", "funcionamiento", "funcionamiento", "funcionamiento"],
    "Conteo": [4, 1, 2, 2, 4, 5],
    "Sentimiento": ["Positivo", "Negativo", "Negativo", "Negativo", "Negativo", "Positivo"]
})
"""


app.layout = html.Div(children=[
    html.H1(children='Hello Dash'),

    html.Div(children='''
        Analisis de sentimiento
    '''),

    dcc.Graph(
        id='live-update-graph'#,
        #figure=fig
    ),
    dcc.Interval(
            id='interval-component',
            interval=8*1000, # in milliseconds
            n_intervals=0
    )
])


# Multiple components can update everytime interval gets fired.
@app.callback(Output('live-update-graph', 'figure'),
              Input('interval-component', 'n_intervals'))
def update_graph_live(n):

    df = pd.read_csv("stream_tweet_groupby.csv", sep = ",", encoding= "latin-1", dtype = {'sentiment_code': str})

    df["sentiment_bin"] = df["sentiment_code"].apply(lambda x: str(x[0]))
    sentiment_dict = {"0": "negativo", "1": "positivo"}
    df["sentiment_bin"] = df["sentiment_bin"].apply(lambda x: sentiment_dict[x])

    fig = px.bar(df, x="sentiment_code", y="count", color ="sentiment_bin", barmode="group")
    #fig = px.bar(df, x="Fruit", y="Amount", color="City", barmode="group")

    return fig


if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


In [1]:

from search_or_simulate import SearchOrSimulate
from api_twitter import StreammerTwitter


streammer = SearchOrSimulate()
#streammer.simulate_search_streaming("tweets_search_2.txt", True, model_file = "best_keras_network_model.h5", cvectorizer_file = "count_vectorizer.joblib")
#streammer.search_streaming("streaming_tweets_search.txt", False, model_file = "best_keras_network_model.h5", cvectorizer_file = "count_vectorizer.joblib")
streammer.search_30_days_ago()
#transformed_tweet



ModuleNotFoundError: No module named 'search_or_simulate'