## Implementação CNN para Reviews

#### 1. Configuração de Ambiente

#### 1.1 Importação de bibliotecas necessárias

In [None]:
%pip install -r neural_network_req.txt

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import re
import time
import string
import numpy as np
import pandas as pd
import nltk

import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from keras.layers import TextVectorization
from nltk.tokenize import word_tokenize
from nltk import FreqDist, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from keras.utils import pad_sequences

#### 1.2. Variáveis de Ambiente

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

dir_paths = [
    '../dist/reviews/negativos',
    '../dist/reviews/positivos'
]

In [None]:
# df = pd.read_csv('/content/reviews.csv')
# df.shape

total_files = 0

for path in dir_paths: total_files += len(os.listdir(path))

df = pd.DataFrame(data=[],columns=['review', 'sentiment'], index=list(range(0, total_files)))

for path in dir_paths:

    for filepath in os.listdir(path):
        index = int(filepath.split('-')[0])

        if ('negative' in filepath): sentiment = 'negative'
        else: sentiment = 'positive'

        with open(os.path.join(path, filepath), 'r', encoding='utf-8') as file:
            content = file.read().rstrip('\n')
            df.iloc[index] = [content, sentiment]


In [None]:
df.head()

In [None]:
emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

def remove_emoji(text):
  return emoji_clean.sub(r'',text)

df['review'] = df['review'].apply(remove_emoji)

In [None]:
pattern = r'(<br>|<br />|http)'
df['review'] = df['review'].str.replace(pattern, "", regex=True)

In [None]:
def remove_symbols_and_numbers(text):
    cleaned_text = re.sub(r'[^\w\s]|[\d]', '', text)
    return cleaned_text.lower()

df['review'] = df['review'].apply(remove_symbols_and_numbers)

In [None]:
df['review'] = df['review'].apply(word_tokenize)

In [None]:
stop_words = stopwords.words('english')

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token.lower() not in stop_words]

df['review'] = df['review'].apply(remove_stopwords)

In [None]:
def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

df['review'] = df['review'].apply(lemmatize_text)

In [None]:
def join_tokens(tokens):
  return ' '.join(tokens);

df['review'] = df['review'].apply(join_tokens)

In [None]:
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df.sentiment)

In [None]:
vocab_size = 6000
sequence_length = 100
maxlen = 130

x = df['review']
y = df['sentiment_encoded']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

token = Tokenizer()
token.fit_on_texts(X_train)

train_sequences = token.texts_to_sequences(X_train)
valid_sequences = token.texts_to_sequences(X_test)

X_train = pad_sequences(train_sequences, maxlen=maxlen, padding = 'post')
X_test = pad_sequences(valid_sequences, maxlen=maxlen, padding = 'post')

In [None]:
X_train.shape

In [122]:
from keras.models import Model, Sequential
from keras.layers import Bidirectional, GlobalMaxPool1D, AveragePooling1D, Dense, LSTM, Embedding, Dropout
from keras.losses import BinaryCrossentropy

base_model = Sequential()
base_model.add(Embedding(vocab_size, 128))
base_model.add(Bidirectional(LSTM(32, return_sequences = True)))
base_model.add(GlobalMaxPool1D())
base_model.add(Dense(20, activation="relu"))
base_model.add(Dropout(0.05))
base_model.add(Dense(1, activation="sigmoid"))

base_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
base_model.summary()


Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_31 (Embedding)    (None, None, 128)         768000    
                                                                 
 bidirectional_31 (Bidirect  (None, None, 64)          41216     
 ional)                                                          
                                                                 
 global_max_pooling1d_19 (G  (None, 64)                0         
 lobalMaxPooling1D)                                              
                                                                 
 dense_84 (Dense)            (None, 20)                1300      
                                                                 
 dropout_27 (Dropout)        (None, 20)                0         
                                                                 
 dense_85 (Dense)            (None, 1)               

In [None]:
history = base_model.fit(
    X_train, 
    y_train,
    epochs=3,
    validation_data=(X_test, y_test)
).history

In [125]:
relu_model = Sequential()
relu_model.add(Embedding(vocab_size, 128))
relu_model.add(Bidirectional(LSTM(32, return_sequences = True)))
relu_model.add(GlobalMaxPool1D())
relu_model.add(Dense(5, activation='relu'))
relu_model.add(Dropout(0.05))

relu_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
relu_model.summary()

Model: "sequential_36"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_34 (Embedding)    (None, None, 128)         768000    
                                                                 
 bidirectional_34 (Bidirect  (None, None, 64)          41216     
 ional)                                                          
                                                                 
 global_max_pooling1d_20 (G  (None, 64)                0         
 lobalMaxPooling1D)                                              
                                                                 
 dense_88 (Dense)            (None, 5)                 325       
                                                                 
Total params: 809541 (3.09 MB)
Trainable params: 809541 (3.09 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
history = relu_model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_test, y_test)
).history

In [124]:
softmax_model = Sequential()
softmax_model.add(Embedding(vocab_size, 128))
softmax_model.add(Bidirectional(LSTM(32, return_sequences = True)))
softmax_model.add(Dense(5, activation='softmax'))

softmax_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
softmax_model.summary()

Model: "sequential_35"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_33 (Embedding)    (None, None, 128)         768000    
                                                                 
 bidirectional_33 (Bidirect  (None, None, 64)          41216     
 ional)                                                          
                                                                 
 dense_87 (Dense)            (None, None, 5)           325       
                                                                 
Total params: 809541 (3.09 MB)
Trainable params: 809541 (3.09 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
history = softmax_model.fit(
    X_train, 
    y_train,
    epochs=3,
    validation_data=(X_test, y_test)
).history

In [None]:
def predict_text(input_text, tokenizer, model, maxlen_seq=128, padding = 'post', truncating = 'post'):
    text = str(input_text)
    sequence = tokenizer.texts_to_sequences([text])
    sequence = pad_sequences(sequence, maxlen = maxlen_seq,
                                                          padding = padding, truncating = truncating)
    predict = model.predict(sequence)
    return predict

In [None]:
predict_text('hands down the best movie i ever watched.', token, base_model)
predict_text('hands down the best movie i ever watched.', token, relu_model)
predict_text('hands down the best movie i ever watched.', token, softmax_model)