# Exercise 6

## Predict rating using LSTM


In [2]:
import pandas as pd

In [3]:
# Training set: a zipped CSV hosted on GitHub; its first column is used as the index.
DATA_URL = 'https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip'
dataTraining = pd.read_csv(DATA_URL, index_col=0, encoding='UTF-8')

In [4]:
# Model input: the plot text of each movie.
plots = dataTraining['plot']

# Binary target: 1 when a rating is at or above the mean rating of the dataset, else 0.
ratings = dataTraining['rating']
y = ratings.ge(ratings.mean()).astype(int)

In [5]:
# Inspect the plot texts (pandas abbreviates the display of a long Series).
plots

3107    most is the story of a single father who takes...
900     a serial killer decides to teach the secrets o...
6724    in sweden ,  a female blackmailer with a disfi...
4704    in a friday afternoon in new york ,  the presi...
2582    in los angeles ,  the editor of a publishing h...
                              ...                        
8417    " our marriage ,  their wedding .  "  it ' s l...
1592    the wandering barbarian ,  conan ,  alongside ...
1723    like a tale spun by scheherazade ,  kismet fol...
7605    mrs .  brisby ,  a widowed mouse ,  lives in a...
215     tinker bell journey far north of never land to...
Name: plot, Length: 7895, dtype: object

In [6]:
# Inspect the binary target derived from the ratings.
y

3107    1
900     0
6724    1
4704    1
2582    1
       ..
8417    0
1592    0
1723    0
7605    1
215     1
Name: rating, Length: 7895, dtype: int64

# Exercise 6.1

- Remove stopwords
- Lowercase
- Split the text into words
- Pad the sequences to a common length (`pad_sequences`)

In [7]:
pip install nltk keras



In [8]:
import numpy as np
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus and keep the English list as a set,
# which is what the membership checks in the preprocessing step use.
nltk.download('stopwords')
stop_words = {term for term in stopwords.words('english')}

# Text preprocessing
def preprocess_text(text):
    """Lowercase a plot, split it on whitespace and drop English stop words.

    Args:
        text: Plot text as a single string.

    Returns:
        The remaining lowercase words joined back together with single spaces.
    """
    # Lowercase each word once, then keep only the ones that are not stop words
    lowered_tokens = (token.lower() for token in text.split())
    return ' '.join(token for token in lowered_tokens if token not in stop_words)

# Clean every plot (lowercase, stop words removed) before building the vocabulary
plots = plots.apply(preprocess_text)

# Tokenization: map each word to an integer index and each plot to a sequence of indices
vocabulary_cap = 1000000  # upper bound on the number of words the tokenizer keeps
tokenizer = Tokenizer(num_words=vocabulary_cap)
tokenizer.fit_on_texts(plots)
sequences = tokenizer.texts_to_sequences(plots)

# max_words is passed to the Embedding layers as input_dim, which must be the largest
# word index + 1. Word indices start at 1 (0 is the padding value), so size it from the
# fitted vocabulary rather than from the cap: using the cap itself would allocate a
# one-million-row embedding table no matter how small the real vocabulary is.
max_words = min(vocabulary_cap, len(tokenizer.word_index) + 1)

# Padding: bring every sequence to the length of the longest plot
max_sequence_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

print("Shape of X:", X.shape)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Shape of X: (7895, 850)


# Exercise 6.2

Create a SimpleRNN neural network to predict the rating of a movie

Calculate the testing set accuracy

In [9]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the samples as the test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:

# SimpleRNN classifier: embedding -> two stacked recurrent layers -> sigmoid output
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),
    # return_sequences=True hands the full output sequence to the next recurrent layer
    SimpleRNN(128, return_sequences=True),
    SimpleRNN(64),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train; 20% of the training samples are set aside for validation
model.fit(X_train, y_train, validation_split=0.2, batch_size=64, epochs=4)

# Sigmoid outputs on the test set, thresholded at 0.5 to obtain class labels
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities >= 0.5).astype(int)

# Accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print("Precisión en el conjunto de prueba:", test_accuracy)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Precisión en el conjunto de prueba: 0.5142495250158328


# Exercise 6.3

Create an LSTM neural network to predict the rating of a movie

Calculate the testing set accuracy

In [15]:
from keras.layers import Embedding, LSTM, Dense

# LSTM classifier: embedding -> single LSTM layer -> sigmoid output
model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_sequence_length),
    LSTM(64),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train; 20% of the training samples are set aside for validation
model.fit(X_train, y_train, validation_split=0.2, batch_size=128, epochs=5)

# Sigmoid outputs on the test set, thresholded at 0.5 to obtain class labels
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities >= 0.5).astype(int)

# Accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print("Precisión en el conjunto de prueba:", test_accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Precisión en el conjunto de prueba: 0.5959468017732742


# Exercise 6.4

Create a GRU neural network to predict the rating of a movie

Calculate the testing set accuracy

In [None]:
from keras.layers import Embedding, GRU, Dense

# GRU classifier: embedding -> single GRU layer -> sigmoid output
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),
    GRU(128),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train; 20% of the training samples are set aside for validation
model.fit(X_train, y_train, validation_split=0.2, batch_size=64, epochs=5)

# Sigmoid outputs on the test set, thresholded at 0.5 to obtain class labels
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities >= 0.5).astype(int)

# Accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print("Precisión en el conjunto de prueba:", test_accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Precisión en el conjunto de prueba: 0.5503483217226093


# **Conclusiones**

En las tres redes neuronales se probaron diferentes combinaciones de hiperparámetros: número de neuronas, capas adicionales y batch_size.


*   Únicamente en la red SimpleRNN funcionó agregar una segunda capa de 64 neuronas después de la capa inicial de 128; en las otras redes, agregar capas disminuía las métricas.
*   Los batch_size que mejor funcionaron fueron 128 y 64.
*   Agregar más épocas no cambiaba de forma apreciable el resultado obtenido.



Los resultados de las tres redes en el conjunto de prueba no superan el 0.6, a pesar de haber probado varios cambios y combinaciones de parámetros.

La red que mejores resultados tuvo fue la LSTM (0.596), seguida por la GRU (0.550) y, por último, la SimpleRNN (0.514).
