In [2]:
import warnings
# Suppress every warning category from this point on.
# NOTE(review): this also hides pandas chained-assignment and Keras deprecation
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Dataset

In [1]:
import pandas as pd

# Load the Amazon sports reviews (Spanish) straight from the public repository.
df = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/amazon_sports.csv")

# Normalise the review text.
# Lowercase FIRST: the whitelist below only lists lowercase accented vowels, so
# filtering before lowercasing silently deleted "Á", "É", "Í", "Ó", "Ú" instead
# of turning them into "á", "é", ... . "ü" is kept too (e.g. "vergüenza").
df['review_body'] = (
    df['review_body']
    .str.lower()
    .str.replace("[^a-zñáéíóúü .,]", "", regex=True)
)
df.head()

Unnamed: 0,stars,review_body,review_title,product_category
0,1,nunca llego el pedido y el vendedor pasa de to...,No llego nunca,sports
1,1,"no sé como es, porque debería haber llegado ay...",Todavía no ha llegado,sports
2,1,"guantes cómodos, no lo niego, pero de mala cal...",Guantes de baja calidad,sports
3,1,hasta hoy no he visto el producto. el pedido h...,Muy Mala experiencia,sports
4,1,"no puedo valorarla porque, después de casi una...",Paquete perdido?,sports


In [3]:
# Drop neutral 3-star reviews. The explicit .copy() makes the result an
# independent frame, so adding a column below does not write into a slice of
# the original frame (chained assignment).
df = df[df.stars != 3].copy()

# Binary target: 1 for 4-5 stars, 0 for 1-2 stars.
df['good_product'] = (df.stars > 3).astype(int)

df.groupby('good_product').size()

good_product
0    4989
1    5372
dtype: int64

# Text Classification

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [6]:
# Features are the cleaned review texts; the target is the binary label column.
X, y = df.review_body.values, df.good_product

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=33
)

# Tokenizer / padding hyper-parameters.
vocab_size, max_length = 5000, 50

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer id sequences -> fixed-length arrays.
tokenized_X_train = tokenizer.texts_to_sequences(X_train)
padded_X_train = pad_sequences(
    tokenized_X_train, truncating='post', maxlen=max_length
)

# Test split goes through the same fitted tokenizer and padding settings.
tokenized_X_test = tokenizer.texts_to_sequences(X_test)
padded_X_test = pad_sequences(
    tokenized_X_test, truncating='post', maxlen=max_length
)

# Models

In [12]:
import tensorflow.keras as keras
import numpy as np

from keras import Sequential
from keras.layers import *

In [16]:
from time import time

def fit_model(model, epochs=20, batch_size=100):
    """Train ``model`` on the current padded training split and print the wall time.

    The data is read from the notebook-level variables ``padded_X_train``,
    ``y_train``, ``padded_X_test`` and ``y_test``, so the most recently
    executed split/tokenize cell decides what the model is trained on.

    Args:
        model: object exposing a Keras-style ``fit`` method (already compiled).
        epochs: number of passes over the training data (default 20).
        batch_size: samples per gradient update (default 100).

    Returns:
        None. Progress is shown by ``fit`` itself; the elapsed time is printed.
    """
    start = time()
    model.fit(
        padded_X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        # The held-out split is used as validation data for per-epoch monitoring.
        validation_data=(padded_X_test, y_test),
        verbose=True
    )
    print(f">>>> Elapsed time: {time()-start}s")
    
def compile_model(model):
    """Compile ``model`` for binary classification and print its layer summary.

    Uses binary cross-entropy loss, the Adam optimizer and accuracy as the
    reported metric.

    Args:
        model: object exposing Keras-style ``compile`` and ``summary`` methods.

    Returns:
        None.
    """
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        # Keras documents `metrics` as a list; a bare string is rejected by Keras 3.
        metrics=['accuracy']
    )
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

## Model 1: Dense

In [17]:
# Seed the run so repeated executions of this cell are comparable.
keras.utils.set_random_seed(812)

# Baseline: the padded token ids are fed directly into a small fully connected
# stack ending in a single sigmoid unit.
model = Sequential()
model.add(Flatten(input_shape=(max_length,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [18]:
# Compile the dense baseline (this also prints its layer summary), then train it
# and report the elapsed time.
compile_model(model)
fit_model(model)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 128)               6528      
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 14849 (58.00 KB)
Trainable params: 14849 (58.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/2

## Model 2: Dense + Embeddings

In [19]:
# Seed the run so repeated executions of this cell are comparable.
keras.utils.set_random_seed(812)

# Size of the vector learned for each token id.
embed_dim = 20

# Embedding lookup -> flatten the (max_length, embed_dim) grid -> one sigmoid unit.
model = Sequential()
model.add(
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length)
)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [22]:
# Compile the embedding + dense model (prints its summary) and train it.
compile_model(model)
fit_model(model)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 20)            100000    
                                                                 
 flatten_2 (Flatten)         (None, 1000)              0         
                                                                 
 dense_6 (Dense)             (None, 1)                 1001      
                                                                 
Total params: 101001 (394.54 KB)
Trainable params: 101001 (394.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
>>>> Elapsed time: 5.5202317237854s


## Model 3: CNN + Embeddings

In [30]:
# Seed the run so repeated executions of this cell are comparable.
keras.utils.set_random_seed(812)

# Convolution hyper-parameters: number of feature maps and window width in tokens.
filters, kernel_size = 64, 5

# Embedding -> 1D convolution -> average over the sequence axis -> small dense head.
# `embed_dim` is reused from the Model 2 cell.
model = Sequential()
model.add(
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length)
)
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [31]:
# Compile the CNN model (prints its summary) and train it.
compile_model(model)
fit_model(model)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 20)            100000    
                                                                 
 conv1d_3 (Conv1D)           (None, 46, 64)            6464      
                                                                 
 global_average_pooling1d_1  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_13 (Dense)            (None, 6)                 390       
                                                                 
 dense_14 (Dense)            (None, 1)                 7         
                                                                 
Total params: 106861 (417.43 KB)
Trainable params: 106861 (417.43 KB)
Non-trainable params: 0 (0.00 Byte)
______________

## Model 4: Bi-LSTM + Embeddings

In [36]:
# Seed the run so repeated executions of this cell are comparable.
keras.utils.set_random_seed(812)

# Units per LSTM direction.
lstm_dim = 32

# Embedding -> bidirectional LSTM -> small dense head with a sigmoid output.
# `embed_dim` is reused from the Model 2 cell.
model = Sequential()
model.add(
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length)
)
model.add(Bidirectional(LSTM(lstm_dim)))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [37]:
# Compile the Bi-LSTM model (prints its summary) and train it.
compile_model(model)
fit_model(model)

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 50, 20)            100000    
                                                                 
 bidirectional (Bidirection  (None, 64)                13568     
 al)                                                             
                                                                 
 dense_17 (Dense)            (None, 6)                 390       
                                                                 
 dense_18 (Dense)            (None, 1)                 7         
                                                                 
Total params: 113965 (445.18 KB)
Trainable params: 113965 (445.18 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/2

# Reducción de overfitting

In [41]:
# Seed the run so repeated executions of this cell are comparable.
keras.utils.set_random_seed(812)

# Same convolution settings as the plain CNN model.
filters, kernel_size = 64, 5

# CNN variant with regularisation: dropout (rate 0.8) after the embedding, after
# the pooling step and before the output unit, plus batch normalisation ahead of
# the dense head. `embed_dim` is reused from the Model 2 cell.
model = Sequential()
model.add(
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length)
)
model.add(Dropout(0.8))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.8))
model.add(BatchNormalization())
model.add(Dense(6, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))

In [42]:
# Compile the regularised CNN (prints its summary) and train it.
compile_model(model)
fit_model(model)

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 50, 20)            100000    
                                                                 
 dropout_3 (Dropout)         (None, 50, 20)            0         
                                                                 
 conv1d_5 (Conv1D)           (None, 46, 64)            6464      
                                                                 
 global_average_pooling1d_3  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 batch_normalization_1 (Bat  (None, 64)                256       
 chNormalization)                                    

# Tamaño del dataset

In [46]:
# Load three product categories and stack them into one larger dataset.
df1 = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/amazon_sports.csv")
df2 = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/amazon_electronics.csv")
df3 = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/amazon_home.csv")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each file's row labels are kept and the index contains duplicates.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Normalise the review text.
# Lowercase FIRST: the whitelist below only lists lowercase accented vowels, so
# filtering before lowercasing silently deleted "Á", "É", "Í", "Ó", "Ú" instead
# of turning them into "á", "é", ... . "ü" is kept too (e.g. "vergüenza").
df['review_body'] = (
    df['review_body']
    .str.lower()
    .str.replace("[^a-zñáéíóúü .,]", "", regex=True)
)
df.shape

(50536, 4)

In [47]:
# Drop neutral 3-star reviews. The explicit .copy() makes the result an
# independent frame, so adding a column below does not write into a slice of
# the original frame (chained assignment).
df = df[df.stars != 3].copy()

# Binary target: 1 for 4-5 stars, 0 for 1-2 stars.
df['good_product'] = (df.stars > 3).astype(int)

df.groupby('good_product').size()

good_product
0    20512
1    19817
dtype: int64

In [48]:
# Features are the cleaned review texts; the target is the binary label column.
X, y = df.review_body.values, df.good_product

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=33
)

# Report the size of each split, one number per line.
print(len(X_train), len(X_test), sep="\n")

# Tokenizer / padding hyper-parameters.
vocab_size, max_length = 5000, 50

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer id sequences -> fixed-length arrays.
tokenized_X_train = tokenizer.texts_to_sequences(X_train)
padded_X_train = pad_sequences(
    tokenized_X_train, truncating='post', maxlen=max_length
)

# Test split goes through the same fitted tokenizer and padding settings.
tokenized_X_test = tokenizer.texts_to_sequences(X_test)
padded_X_test = pad_sequences(
    tokenized_X_test, truncating='post', maxlen=max_length
)

32263
8066


In [49]:
# Seed the run so repeated executions of this cell are comparable.
keras.utils.set_random_seed(812)

# Units per LSTM direction.
lstm_dim = 32

# Same Bi-LSTM architecture as Model 4, rebuilt so it can be trained on the
# larger three-category dataset. `embed_dim` is reused from the Model 2 cell.
model = Sequential()
model.add(
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length)
)
model.add(Bidirectional(LSTM(lstm_dim)))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [50]:
# Compile the Bi-LSTM built for the larger dataset; this also prints its summary.
compile_model(model)

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 50, 20)            100000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                13568     
 onal)                                                           
                                                                 
 dense_23 (Dense)            (None, 6)                 390       
                                                                 
 dense_24 (Dense)            (None, 1)                 7         
                                                                 
Total params: 113965 (445.18 KB)
Trainable params: 113965 (445.18 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [51]:
# Train on the larger dataset and print the elapsed wall time.
fit_model(model)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
>>>> Elapsed time: 121.56985187530518s
