# <center>Organizaci&oacute;n de Datos</center>
#### <center>C&aacute;tedra Ing. Rodriguez, Juan Manuel </center>
## <center>Trabajo Práctico 2 : Críticas Cinematográficas </center>
#### Grupo 29:
* Alen Davies Leccese - 107084
* Luca Lazcano - 107044

## Imports y carga de datos

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import sklearn as sk
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, classification_report, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB

import xgboost as xgb
from xgboost import XGBClassifier
import pickle
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow import keras
from keras.utils.vis_utils import plot_model
from keras.callbacks import EarlyStopping

from collections import Counter

RANDOM_STATE = 42

In [3]:
train = pd.read_csv('Datasets/train.csv')
test = pd.read_csv('Datasets/test.csv')

In [4]:
shape = train.shape
print("Filas: %d Columnas: %d" % (shape[0], shape[1]))

Filas: 50000 Columnas: 3


In [None]:
train.head()

## Preprocesamiento

Comenzamos por convertir a tipo str y a minúscula la columna ``'review_es'``

In [None]:
test['review_es'] = test['review_es'].astype(str).str.lower()
train['review_es'] = train['review_es'].astype(str).str.lower()
train.head()

### Tokenización

Usamos RegexpTokenizer de nltk para splitear las reviews en palabras. Usamos la expresión regular ``\w+`` que representa una expresión regular que coincide con una o más letras, números o guiones bajos.

In [None]:
regexp = RegexpTokenizer('\w+')

test['review_token'] = test['review_es'].apply(regexp.tokenize)
train['review_token'] = train['review_es'].apply(regexp.tokenize)
train.head()

### Stopwords

Las stopwords (palabras vacías), son palabras muy comunes en un idioma que generalmente no aportan un significado importante al análisis de texto.

Las stopwords incluyen palabras como artículos ("el", "la", "los", "las"), pronombres ("yo", "tú", "él", "ella"), preposiciones ("a", "de", "en", "con").

Las stopwords se eliminan para reducir el ruido y el tamaño del vocabulario en el texto analizado, centrándose en las palabras clave. Al eliminar estas palabras vacías, se puede mejorar el rendimiento de algoritmos de procesamiento de texto.

In [None]:
nltk.download('stopwords')

In [None]:
stopwords_es = nltk.corpus.stopwords.words("spanish")
print(stopwords_es)

In [None]:
test['review_token'] = test['review_token'].apply(lambda x: [item for item in x if item not in stopwords_es])
train['review_token'] = train['review_token'].apply(lambda x: [item for item in x if item not in stopwords_es])
train.head()

In [None]:
stopwords_en = nltk.corpus.stopwords.words("english")
print(stopwords_en)

In [None]:
test['review_token'] = test['review_token'].apply(lambda x: [item for item in x if item not in stopwords_en])
train['review_token'] = train['review_token'].apply(lambda x: [item for item in x if item not in stopwords_en])
train.head()

### Filtrado por largo del palabras

Ahora vamos a eliminar palabras poco frecuentes manteniendo solo las palabras que tienen más de 3 letras.

In [None]:
test['review_token'] = test['review_token'].apply(lambda x: [item for item in x if len(item) > 3])
train['review_token'] = train['review_token'].apply(lambda x: [item for item in x if len(item) > 3])
train.head()

### Igualar palabras con o sin acento

In [None]:
# Sacar todas las tildes
def remove_tildes(text):
    tildes = {
        'á': 'a',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u'
    }
    for key, value in tildes.items():
        text = text.replace(key, value)
    return text

In [None]:
test['review_token'] = test['review_token'].apply(lambda x: [remove_tildes(item) for item in x])
train['review_token'] = train['review_token'].apply(lambda x: [remove_tildes(item) for item in x])
train.head()

### Creación de vocabulario

TODO: explicar esto

In [None]:
vocabulario = Counter()

for review in train['review_token']:
    for palabra in review:
        vocabulario[palabra] += 1


In [None]:
len(vocabulario)

In [None]:
vocab_size = 10000
vocabulario_truncado = [ word for word, count in vocabulario.most_common()[:vocab_size] ]

In [None]:
len(vocabulario_truncado)

### Grafico las 20 palabras más frecuentes

In [None]:
# Graficar las palabras más frecuentes con vocabulario_truncado.most_common(20)

top20 = pd.Series(dict(vocabulario.most_common(20)))
sns.barplot(y=top20.index, x=top20.values).set(title='Top 20 palabras más frecuentes', xlabel='Frecuencia', ylabel='Palabra')

plt.show()

### Filtrado de las que no están en el vocabulario truncado

In [None]:
test['review_filtrado'] = test['review_token'].apply(lambda x: [item for item in x if item in vocabulario_truncado])
train['review_filtrado'] = train['review_token'].apply(lambda x: [item for item in x if item in vocabulario_truncado])
train.head()

### Truncado de las reviews

Acorto las reviews a las 300 primeras palabras.

In [None]:
# guardo las 300 primeras palabras de cada review
test['review_filtrado_300'] = test['review_filtrado'].apply(lambda x: x[:300])
train['review_filtrado_300'] = train['review_filtrado'].apply(lambda x: x[:300])
train.head()

### Unión de review filtrado en string

In [None]:
# join all words of each review into a string
test['review_filtrado_string'] = test['review_filtrado_300'].apply(lambda x: ' '.join(x))
train['review_filtrado_string'] = train['review_filtrado_300'].apply(lambda x: ' '.join(x))
train.head()

### Bag of Words

La técnica Bag of Words es una forma de representar datos de texto en aprendizaje automático. El concepto básico es tratar cada documento de texto como una "bolsa" de palabras, donde se ignora el orden y la estructura gramatical de las palabras. En lugar de eso, se enfoca únicamente en la presencia y frecuencia de las palabras en el texto.

Vamos a utilizar ``TfidfVectorizer`` de scikit-learn que implementa la técnica de ponderación TF-IDF (Term Frequency-Inverse Document Frequency) para convertir datos de texto en características numéricas.

In [None]:
vectorizer = TfidfVectorizer()
train_fid = vectorizer.fit_transform(train.review_filtrado_string)
test_fid = vectorizer.transform(test.review_filtrado_string)

In [None]:
print(train_fid)

In [None]:
print(test_fid)

## Split train/test

Dvidimos el dataset en train y test.

In [None]:
x_train, x_test, y_train, y_test = train_test_split(train_fid,
                                                    train.sentimiento, 
                                                    test_size=0.3,
                                                    random_state=RANDOM_STATE, shuffle=True)

## Bayes Naive

Realizamos optimización de hiperparámetros con Random Search.

In [None]:
#Cantidad de combinaciones que quiero probar
n=15

#Grilla de Parámetros a probar
params_grid={ 'alpha': [0.5, 1.0, 2.0, 5.0],
               'class_prior': [None, [0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5], [0.6, 0.4]],
               'fit_prior': [True, False],
               'force_alpha': [True, False]
             }
                
#Cantidad de splits para el Cross Validation
folds=5

#Kfold estratificado
kfoldcv = StratifiedKFold(n_splits=folds, shuffle=True)

#Clasificador
NB_model = MultinomialNB()

# Metrica que quiero optimizar F1 Score
scorer_fn = make_scorer(sk.metrics.f1_score, pos_label='positivo')

#Random Search Cross Validation
rand_NB = RandomizedSearchCV(estimator = NB_model,
                              param_distributions = params_grid,
                              scoring = scorer_fn,
                              cv=kfoldcv,
                              n_iter = n,
                              random_state = RANDOM_STATE) 

rand_NB.fit(x_train, y_train)

Vemos cuales fueron los mejores hiperparámetros y la mejor métrica obtenida.

In [None]:
print('Mejores parámetros: {}'.format(rand_NB.best_params_))
print('Mejor métrica: {}'.format(rand_NB.best_score_))

Entrenamos el modelo de Bayes Naive con los mejores hiperparámetros y realizamos las predicciones.

In [None]:
NB_model = MultinomialNB().set_params(**rand_NB.best_params_)
NB_model.fit(x_train, y_train)

y_pred = NB_model.predict(x_test)

In [None]:
print(f'Best Training Accuracy: {NB_model.score(x_train, y_train)}')
print(f'Best Testing Accuracy: {NB_model.score(x_test, y_test)}')

### Métricas

Calculamos las métricas de accuracy, precision, recall y f1-score.

In [None]:
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, pos_label='positivo')
f1 = f1_score(y_test, y_pred, pos_label='positivo')
precision = precision_score(y_test, y_pred, pos_label='positivo')

print("Accuracy: "+str(accuracy))
print("Recall: "+str(recall))
print("Precision: "+str(precision))
print("f1 score: "+str(f1))

In [None]:
#Reporte de Clasificación
print(classification_report(y_test, y_pred))

In [None]:
#Matriz de Confusión
cm = confusion_matrix(y_test,y_pred)
sns.heatmap(cm, cmap='Blues',annot=True,fmt='g').set(title='Matriz de Confusión', xlabel='Predicted', ylabel='True', xticklabels=['negativo', 'positivo'], yticklabels=['negativo', 'positivo'])
plt.show()

### Submission

In [None]:
pickle.dump(NB_model, open('NB_grupo29.pickle', 'wb'))

In [None]:
X_submission_NB = test_fid
y_pred_test_NB = NB_model.predict(X_submission_NB)
df_submission_NB = pd.DataFrame({'ID': test['ID'], 'sentimiento': y_pred_test_NB})
df_submission_NB.to_csv('NB03.csv', index=False)

In [None]:
label_map = {'negativo': 0, 'positivo': 1}
y_train = y_train.map(label_map)
y_test  = y_test.map(label_map)

In [None]:
y_train.head()

In [None]:
y_test.head()

## Random Forest

In [None]:
#Cantidad de combinaciones que quiero probar
n=5

#Grilla de Parámetros
params_grid={ 'n_estimators': [100, 200, 300, 400, 500],
                'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 35, 40],
                'max_depth': [None],
                'max_samples': [None, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0],
                'ccp_alpha': [0.0, 0.001, 0.01, 0.1, 1.0],
             }

#Clasificador
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)

# Metrica que quiero optimizar F1 Score
scorer_fn = make_scorer(sk.metrics.f1_score)

#Random Search Cross Validation
rand_rf = RandomizedSearchCV(estimator = rf_model,
                              param_distributions = params_grid,
                              scoring = scorer_fn,
                              n_iter = n,
                              random_state=RANDOM_STATE) 

rand_rf.fit(x_train, y_train)
rand_rf.cv_results_['mean_test_score'] 

In [None]:
print('Mejores parámetros: {}'.format(rand_rf.best_params_))
print('Mejor métrica: {}'.format(rand_rf.best_score_))

In [None]:
rf_model = RandomForestClassifier(random_state=RANDOM_STATE).set_params(**rand_rf.best_params_)
rf_model.fit(x_train, y_train)

y_pred = rf_model.predict(x_test)

In [None]:
print(f'Best Training Accuracy: {rf_model.score(x_train, y_train)}')
print(f'Best Testing Accuracy: {rf_model.score(x_test, y_test)}')

### Métricas

In [None]:
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, pos_label='positivo')
f1 = f1_score(y_test, y_pred, pos_label='positivo')
precision = precision_score(y_test, y_pred, pos_label='positivo')

print("Accuracy: "+str(accuracy))
print("Recall: "+str(recall))
print("Precision: "+str(precision))
print("f1 score: "+str(f1))

In [None]:
#Reporte de Clasificación
print(classification_report(y_test, y_pred))

In [None]:
#Matriz de Confusión
cm = confusion_matrix(y_test,y_pred)
sns.heatmap(cm, cmap='Blues',annot=True,fmt='g').set(title='Matriz de Confusión', xlabel='Predicted', ylabel='True', xticklabels=['negativo', 'positivo'], yticklabels=['negativo', 'positivo'])
plt.show()

### Submission

In [None]:
pickle.dump(rf_model, open('rf_grupo29.pickle', 'wb'))

In [None]:
X_submission_rf = test_fid
y_pred_test_rf = rf_model.predict(X_submission_rf)
y_pred_test_rf
df_submission_rf = pd.DataFrame({'ID': test['ID'], 'sentimiento': y_pred_test_rf})
df_submission_rf.to_csv('rf01.csv', index=False)

## XGBoost

In [None]:
label_map = {'negativo': 0, 'positivo': 1}
y_train_mapped = y_train.map(label_map)
y_test_mapped = y_test.map(label_map)

In [None]:
#Cantidad de combinaciones que quiero probar
n=5

#Grilla de Parámetros
params_grid= {'learning_rate': np.linspace(0.05, 0.5, 50),
                'gamma': [0,1,2],
                'max_depth': list(range(2,10)),
                'subsample': np.linspace(0, 1, 20),
                'lambda': [0,1,2],
                'alpha' : [1],
                'n_estimators': list(range(10,161,10))
              }

#Clasificador
xgb_model = XGBClassifier(random_state=RANDOM_STATE)

# Metrica que quiero optimizar F1 Score
scorer_fn = make_scorer(sk.metrics.f1_score)

#Random Search Cross Validation
rand_xgb = RandomizedSearchCV(estimator = xgb_model,
                              param_distributions = params_grid,
                              scoring = scorer_fn,
                              n_iter = n,
                              random_state=RANDOM_STATE) 

rand_xgb.fit(x_train, y_train_mapped)
rand_xgb.cv_results_['mean_test_score'] 

In [None]:
print('Mejores parámetros: {}'.format(rand_xgb.best_params_))
print('Mejor métrica: {}'.format(rand_xgb.best_score_))

In [None]:
xgb_model = XGBClassifier(random_state=RANDOM_STATE).set_params(**rand_xgb.best_params_)
xgb_model.fit(x_train, y_train)

y_pred = xgb_model.predict(x_test)

In [None]:
print(f'Best Training Accuracy: {xgb_model.score(x_train, y_train)}')
print(f'Best Testing Accuracy: {xgb_model.score(x_test, y_test)}')

### Métricas

In [None]:
accuracy = accuracy_score(y_test_mapped, y_pred)
recall = recall_score(y_test_mapped, y_pred)
f1 = f1_score(y_test_mapped, y_pred)
precision = precision_score(y_test_mapped, y_pred)

print("Accuracy: "+str(accuracy))
print("Recall: "+str(recall))
print("Precision: "+str(precision))
print("f1 score: "+str(f1))

In [None]:
#Reporte de Clasificación
print(classification_report(y_test_mapped, y_pred))

In [None]:
#Matriz de Confusión
cm = confusion_matrix(y_test_mapped, y_pred)
sns.heatmap(cm, cmap='Blues',annot=True,fmt='g').set(title='Matriz de Confusión', xlabel='Predicted', ylabel='True', xticklabels=['negativo', 'positivo'], yticklabels=['negativo', 'positivo'])
plt.show()

### Submission

In [None]:
pickle.dump(xgb_model, open('xgb_grupo29.pickle', 'wb'))

In [None]:
X_submission_xgb = test_fid
y_pred_test_xgb = xgb_model.predict(X_submission_xgb)

label_map = {0: 'negativo', 1: 'positivo'}
y_pred_test_xgb = y_pred_test_xgb.map(label_map)

df_submission_xgb = pd.DataFrame({'ID': test['ID'], 'sentimiento': y_pred_test_xgb})
df_submission_xgb.to_csv('xgb00.csv', index=False)

## Red Neuronal

### Embedding

El tokenizer hace todo el preproeso que ya hicimos, entones lo entrenamos con la review cruda, seteando acá el tamaño del vocabulario, etc.

In [105]:
VOCAB_SIZE = 1000
REVIEW_MAX_LENGTH = 20

In [106]:
# Tokenize the words in the training set
tokenizer = keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train['review_es'])

Cantidad de palabras únicas.

Vemos algunas palabras

In [42]:
tokenizer.sequences_to_texts([[200, 5000]])

['punto humorístico']

Y los indices de un string:

In [43]:
tokenizer.texts_to_sequences(['la película es malarda'])

[[2, 12, 9]]

In [107]:
max_id = len(tokenizer.word_index)
print(max_id)

192669


In [108]:
# convert numpy array to regular array
reviews_lista_train = pd.Series(train['review_es']).to_numpy()

In [109]:
print(reviews_lista_train[:3])

['Uno de los otros críticos ha mencionado que después de ver solo 1 Oz Episodio, estará enganchado. Tienen razón, ya que esto es exactamente lo que sucedió conmigo. La primera cosa que me golpeó sobre Oz fue su brutalidad y sus escenas de violencia inconfiadas, que se encuentran a la derecha de la palabra. Confía en mí, este no es un espectáculo para los débiles de corazón o tímido. Este espectáculo no extrae punzones con respecto a las drogas, el sexo o la violencia. Es Hardcore, en el uso clásico de la palabra. Se llama OZ, ya que es el apodo dado al Penitenciario del Estado de Seguridad Máximo de Oswald. Se centra principalmente en la ciudad de Emeralda, una sección experimental de la prisión donde todas las células tienen frentes de vidrio y se enfrentan hacia adentro, por lo que la privacidad no es alta en la agenda. Em City es el hogar de muchos ... Fariarios, musulmanes, gangstas, latinos, cristianos, italianos, irlandeses y más ... así que las esposas, las miradas de muerte, la

In [110]:
encoded_train = tokenizer.texts_to_sequences(reviews_lista_train)

In [123]:
encoded_train_truncado = tf.keras.preprocessing.sequence.pad_sequences(encoded_train, maxlen=REVIEW_MAX_LENGTH, truncating='post', padding='post', value=0)

In [124]:
print(encoded_train_truncado[:5])

[[ 80   1  11 182  70   3  77   1  46  35 276 395 201 249  74   3  39   9
  605  16]
 [ 10 379 379 349   2   1   9  40  40   2   1   2   4  58 258  10 648   1
    4   7]
 [292   3  21  48  10 118 825   1 621  73   5   8 659   1 931   1 131   5
    6 617]
 [724  51  10 267 125   8 226 417   3  51   8   5  19   4  38 684  99  52
    6  73]
 [  6 211   5   6  73   1   9  10  12 723  15  46   6 489 212 852   8  54
   17 870]]


In [37]:
# encoded_train_ragged_tensor = tf.ragged.stack([tf.convert_to_tensor(arr) for arr in encoded_train], axis=0)

In [38]:
# encoded_train_dense_tensor = encoded_train_ragged_tensor.to_tensor()
# encoded_train_dense_tensor

<tf.Tensor: shape=(50000, 1798), dtype=int32, numpy=
array([[  80,    1,   11, ...,    0,    0,    0],
       [  10,  379,  379, ...,    0,    0,    0],
       [ 292,    3,   21, ...,    0,    0,    0],
       ...,
       [ 320, 6286,    5, ...,    0,    0,    0],
       [ 973,    7,  190, ...,    0,    0,    0],
       [ 315,  998,    3, ...,    0,    0,    0]])>

In [36]:
# encoded_train_tensor = tf.convert_to_tensor(encoded_train, dtype=tf.float32)

ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [80]:
# np.array(encoded_train).flatten()

  np.array(encoded_train).flatten()


array([list([80, 1, 11, 182, 1238, 70, 3180, 3, 77, 1, 46, 35, 276, 3713, 395, 2459, 6048, 201, 249, 74, 3, 39, 9, 605, 16, 3, 1009, 2279, 2, 145, 217, 3, 29, 2361, 54, 3713, 31, 19, 6605, 4, 38, 116, 1, 568, 3, 14, 703, 7, 2, 2153, 1, 2, 708, 3974, 5, 235, 33, 13, 9, 8, 228, 15, 11, 2404, 1, 447, 32, 6049, 33, 228, 13, 18, 791, 7, 17, 895, 6, 484, 32, 2, 568, 9, 4535, 5, 6, 677, 579, 1, 2, 708, 14, 1157, 3713, 74, 3, 9, 6, 607, 27, 25, 242, 1, 1454, 2663, 1, 14, 2904, 789, 5, 2, 288, 1, 10, 2908, 6144, 1, 2, 1620, 125, 169, 17, 7389, 201, 1, 5695, 4, 14, 3631, 341, 20, 16, 3, 2, 13, 9, 652, 5, 2, 5495, 2571, 9, 6, 1300, 1, 185, 8152, 4399, 4333, 9429, 4, 24, 92, 3, 17, 4973, 17, 5326, 1, 310, 17, 870, 9876, 4, 11, 94, 99, 563, 293, 1614, 6, 250, 1395, 1, 6, 228, 14, 170, 27, 81, 1, 3, 298, 125, 182, 1433, 13, 14, 5982, 1, 17, 380, 3992, 15, 17, 1309, 8209, 6, 1477, 6, 932, 3713, 13, 14, 6, 316, 395, 3, 59, 89, 29, 885, 44, 1178, 3, 31, 2934, 13, 469, 115, 3, 72, 2691, 15, 809, 23, 41,

### Split train / test

In [125]:
x_train_rnn, x_test_rnn, y_train_rnn, y_test_rnn = train_test_split(encoded_train_truncado,
                                                                    train.sentimiento,
                                                                    test_size=0.3,
                                                                    random_state=RANDOM_STATE, shuffle=True)

In [126]:
label_map = {'negativo': 0, 'positivo': 1}
y_train_rnn = y_train_rnn.map(label_map)
y_test_rnn  = y_test_rnn.map(label_map)

In [127]:
x_train_rnn_ragged = tf.ragged.stack([tf.convert_to_tensor(lista) for lista in x_train_rnn], axis=0)
x_train_rnn_dense = x_train_rnn_ragged.to_tensor()
x_train_rnn_dense

<tf.Tensor: shape=(35000, 20), dtype=int32, numpy=
array([[177,  22,  29, ...,   7,  37,   7],
       [ 33,  31,   8, ...,   7, 621,  20],
       [ 13, 329,  17, ...,   4,  13, 135],
       ...,
       [ 39,   9,  10, ..., 210, 831,  52],
       [ 21, 349,  31, ...,  72, 149,  11],
       [ 21,   9,  10, ..., 157,   1,  73]])>

In [31]:
# x_train_rnn_np = np.asarray(x_train_rnn, dtype=object)
# x_train_rnn_np

array([list([177, 22, 29, 3216, 11, 5813, 13, 913, 21, 12, 2, 887, 1, 3, 14, 97, 2204, 10, 4, 3471, 1, 7, 3896, 37, 2285, 7, 164, 1303, 7, 16, 339, 25, 343, 45, 3, 221, 7, 2, 2016, 1, 17, 398, 1, 264, 8049, 90, 5891, 1, 2, 3, 6858, 6, 3, 7, 11, 1, 8, 337, 7, 164, 4, 7, 330, 761, 4, 7542, 2, 251, 147, 3, 588, 52, 65, 9, 8, 194, 624, 28, 63, 5, 19, 1, 4, 444, 8, 194, 643, 22, 19, 611, 1740, 4, 60, 1966, 9, 3106, 22, 6, 392, 4, 6, 417, 392, 390, 7, 43, 918, 164, 1048]),
       list([33, 31, 8, 40, 194, 23, 22, 3421, 409, 2798, 78, 77, 2, 6325, 1230, 6810, 100, 1, 11, 6810, 255, 3991, 13, 973, 7, 621, 20, 124, 1608, 5, 2, 5480, 79, 8067, 15, 156, 37, 105, 11, 535, 1025, 24, 1618, 6, 1291, 2044, 15, 2, 3, 2145, 11, 25, 611, 1, 1, 4, 6193, 26, 1796, 6, 316, 1608, 1, 18, 4, 2066, 1957, 4, 6, 5033, 4, 6, 316, 3275, 130, 6, 561, 5762, 4, 6874, 6302, 100, 6810, 234, 10, 649, 897, 1, 73, 27, 445, 3, 11, 1957, 2, 4860, 72, 7, 35, 409, 1406, 1574, 483, 1957, 4006, 48, 4, 1165, 1957, 48, 131, 1041, 

In [25]:
# x_train_rnn_np = tf.convert_to_tensor(x_train_rnn_np)
# print(x_train_rnn_np)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [18]:
x_train_rnn_tensor

array([list([177, 22, 29, 3216, 11, 5813, 13, 913, 21, 12, 2, 887, 1, 3, 14, 97, 2204, 10, 4, 3471, 1, 7, 3896, 37, 2285, 7, 164, 1303, 7, 16, 339, 25, 343, 45, 3, 221, 7, 2, 2016, 1, 17, 398, 1, 264, 8049, 90, 5891, 1, 2, 3, 6858, 6, 3, 7, 11, 1, 8, 337, 7, 164, 4, 7, 330, 761, 4, 7542, 2, 251, 147, 3, 588, 52, 65, 9, 8, 194, 624, 28, 63, 5, 19, 1, 4, 444, 8, 194, 643, 22, 19, 611, 1740, 4, 60, 1966, 9, 3106, 22, 6, 392, 4, 6, 417, 392, 390, 7, 43, 918, 164, 1048]),
       list([33, 31, 8, 40, 194, 23, 22, 3421, 409, 2798, 78, 77, 2, 6325, 1230, 6810, 100, 1, 11, 6810, 255, 3991, 13, 973, 7, 621, 20, 124, 1608, 5, 2, 5480, 79, 8067, 15, 156, 37, 105, 11, 535, 1025, 24, 1618, 6, 1291, 2044, 15, 2, 3, 2145, 11, 25, 611, 1, 1, 4, 6193, 26, 1796, 6, 316, 1608, 1, 18, 4, 2066, 1957, 4, 6, 5033, 4, 6, 316, 3275, 130, 6, 561, 5762, 4, 6874, 6302, 100, 6810, 234, 10, 649, 897, 1, 73, 27, 445, 3, 11, 1957, 2, 4860, 72, 7, 35, 409, 1406, 1574, 483, 1957, 4006, 48, 4, 1165, 1957, 48, 131, 1041, 

In [128]:
OOV_BUCKETS = 5
EMBED_SIZE  = 8

In [129]:
rnn_model = keras.models.Sequential([
    keras.layers.Embedding(VOCAB_SIZE + OOV_BUCKETS, EMBED_SIZE, input_shape=[None]),
    keras.layers.GRU(4, return_sequences=True),
    keras.layers.GRU(4),
    keras.layers.Dense(1, activation="sigmoid")
])

In [53]:
# rnn_model = keras.models.Sequential([
#     keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
#     dropout=0.2, recurrent_dropout=0.2),
#     keras.layers.GRU(128, return_sequences=True,
#     dropout=0.2, recurrent_dropout=0.2),
#     keras.layers.TimeDistributed(keras.layers.Dense(max_id,
#     activation="softmax"))
# ])

In [130]:
rnn_model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, None, 8)           8040      
                                                                 
 gru_24 (GRU)                (None, None, 4)           168       
                                                                 
 gru_25 (GRU)                (None, 4)                 120       
                                                                 
 dense_12 (Dense)            (None, 1)                 5         
                                                                 
Total params: 8,333
Trainable params: 8,333
Non-trainable params: 0
_________________________________________________________________


In [131]:
rnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [132]:
es = EarlyStopping(monitor='val_loss', patience=5) 

In [133]:
history = rnn_model.fit(x_train_rnn_dense, y_train_rnn, batch_size=64, epochs=5, callbacks=[es])

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [84]:
# history = rnn_model.fit(x_train_rnn_dense, y_train_rnn, epochs=20)

Epoch 1/20


 139/1094 [==>...........................] - ETA: 1:44:01 - loss: 0.6936 - accuracy: 0.5067

KeyboardInterrupt: 

In [None]:
# order x_train which is a matrix, order by on the first column and then on the second column
x_train_sorted = x_train[np.lexsort((x_train[:,1], x_train[:,0]))]
print(x_train_sorted[-30:])

In [None]:
rnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = rnn_model.fit(x_train, epochs=5)

In [135]:
train_score = rnn_model.evaluate(x_train_rnn,
                       y_train_rnn,
                       verbose=1)
test_score = rnn_model.evaluate(x_test_rnn,
                       y_test_rnn,
                       verbose=1)
labels = rnn_model.metrics_names

print('')
print(f'Training Accuracy: {train_score[1]}')
print(f'Testing Accuracy: {test_score[1]}')


Training Accuracy: 0.7138000130653381
Testing Accuracy: 0.6872666478157043


In [136]:
y_pred = rnn_model.predict(x_test_rnn)



In [138]:
y_pred_binario = np.where(y_pred > 0.5, 1, 0)

accuracy = accuracy_score(y_test_rnn, y_pred_binario)
recall = recall_score(y_test_rnn, y_pred_binario)
f1 = f1_score(y_test_rnn, y_pred_binario,)
precision = precision_score(y_test_rnn, y_pred_binario)

print("Accuracy: "  + str(accuracy))
print("Recall: "    + str(recall))
print("Precision: " + str(precision))
print("f1 score: "  + str(f1))

Accuracy: 0.6872666666666667
Recall: 0.7048359467650547
Precision: 0.6857692307692308
f1 score: 0.6951718760153357


## Ensamble