In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
import re
import time
import string
import numpy as np
import pandas as pd
import nltk

import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from keras.layers import TextVectorization
from nltk.tokenize import word_tokenize
from nltk import FreqDist, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from keras.utils import pad_sequences

In [3]:
# Fetch the NLTK data packages for the stop-word list, tokenizer and
# lemmatizer that later cells import from nltk.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the review dataset into a DataFrame and show its (rows, columns) shape.
# The path is absolute, so this cell only works where /content/reviews.csv
# exists (NOTE(review): looks like a Colab upload path — confirm).
df = pd.read_csv('/content/reviews.csv')
df.shape

(50000, 2)

In [None]:
# Preview the first rows to check the 'review' and 'sentiment' columns.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Character class matching emoji and pictographic symbols.
#
# Each entry is a deliberately narrow Unicode range. A single range running from
# U+24C2 to U+1F251 would cover almost the whole Basic Multilingual Plane above
# U+24C2 (CJK ideographs, Kana, Hangul, full-width forms, ...) and delete real
# text, so its two endpoints are listed as separate entries instead.
emoji_clean = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U00002702-\U000027B0"  # dingbats
    "\u2600-\u26FF"          # miscellaneous symbols (sun, hearts, ...)
    "\U000024C2"             # circled latin capital letter M
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of emoji characters removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without the characters matched by ``emoji_clean``. Letters of
        any script, digits, punctuation and whitespace are left untouched.
    """
    return emoji_clean.sub(r'', text)

# Strip emoji from every review before the rest of the cleaning steps.
df['review'] = df['review'].apply(remove_emoji)

In [None]:
# Replace HTML line breaks and URLs with a single space.
# - Substituting the empty string would glue the words on either side of a tag
#   together ("end.<br /><br />Next" -> "end.Next"), which later becomes the
#   bogus token "endnext" once punctuation is stripped.
# - Matching only the literal "http" would leave the rest of the address behind,
#   so the whole URL (up to the next whitespace) is matched instead.
pattern = r'<br\s*/?>|https?://\S+'
df['review'] = df['review'].str.replace(pattern, " ", regex=True)

In [None]:
def remove_symbols_and_numbers(text):
    """Return ``text`` lower-cased with punctuation, symbols, digits and underscores removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The lower-cased string containing only letters and whitespace.
        Whitespace is not collapsed, so removed characters can leave
        consecutive or trailing spaces behind.
    """
    # ``\w`` matches "_" as well as letters and digits, so ``[^\w\s]`` alone lets
    # underscores (e.g. _emphasis_ markup) through; list "_" next to the digits.
    cleaned_text = re.sub(r'[^\w\s]|[\d_]', '', text)
    return cleaned_text.lower()

# Drop punctuation and digits from every review and lower-case the text.
df['review'] = df['review'].apply(remove_symbols_and_numbers)

In [None]:
# Turn each cleaned review string into a list of tokens; from here on the
# 'review' column holds lists rather than strings, so this cell is not
# safe to re-run on its own output.
df['review'] = df['review'].apply(word_tokenize)

In [None]:
# Read the English stop-word list once for the whole notebook. ``stop_words``
# stays a list for any other cell that uses it; the frozenset copy gives
# constant-time membership tests inside remove_stopwords.
stop_words = stopwords.words('english')
_STOP_WORD_SET = frozenset(stop_words)

def remove_stopwords(tokens):
    """Return the tokens of ``tokens`` that are not English stop words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        The tokens whose lower-cased form is not in the stop-word set, in their
        original order and original casing.
    """
    # The stop-word set is built once at module level: rebuilding it inside the
    # function would re-read the corpus for every row of the DataFrame.
    return [token for token in tokens if token.lower() not in _STOP_WORD_SET]

df['review'] = df['review'].apply(remove_stopwords)

In [None]:
# A single lemmatizer instance is shared by every call instead of constructing
# a new one for each review.
_LEMMATIZER = WordNetLemmatizer()

def lemmatize_text(tokens):
    """Return the lemmatized form of every token in ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        One lemma per input token, in the same order. ``lemmatize`` is called
        with its default part-of-speech setting.
    """
    return [_LEMMATIZER.lemmatize(token) for token in tokens]

df['review'] = df['review'].apply(lemmatize_text)

In [None]:
def join_tokens(tokens):
    """Concatenate ``tokens`` into one string, separated by single spaces.

    An empty iterable yields the empty string.
    """
    separator = ' '
    return separator.join(tokens)

# Convert each token list back into a single space-separated string, the
# form the Keras tokenizer is fitted on below.
df['review'] = df['review'].apply(join_tokens)

In [None]:
# Encode the string sentiment labels as integers in a new column.
# NOTE(review): LabelEncoder orders classes alphabetically, so this presumably
# yields negative -> 0 and positive -> 1 — confirm with le.classes_.
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df.sentiment)

In [None]:
# Hyper-parameters shared by the tokenisation and model cells.
vocab_size = 6000  # input_dim passed to the Embedding layer
sequence_length = 100  # NOTE(review): not referenced by any cell visible here — confirm it is still needed
maxlen = 130  # length every review is padded/truncated to by pad_sequences

In [None]:
x = df['review']
y = df['sentiment_encoded']

# Hold out 20% of the reviews; the fixed random_state keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Cap the tokenizer at vocab_size so every index it emits is < vocab_size, the
# input_dim of the Embedding layer. An uncapped Tokenizer() numbers every
# distinct word in the corpus, producing indices far beyond the embedding
# table (an error on CPU, silently wrong lookups on GPU).
# The tokenizer is fitted on the training split only, so no vocabulary
# statistics leak from the held-out reviews.
token = Tokenizer(num_words=vocab_size)
token.fit_on_texts(X_train)

train_sequences = token.texts_to_sequences(X_train)
valid_sequences = token.texts_to_sequences(X_test)

# Pad (at the end) or truncate every sequence to exactly maxlen tokens.
X_train = pad_sequences(train_sequences, maxlen=maxlen, padding = 'post')
X_test = pad_sequences(valid_sequences, maxlen=maxlen, padding = 'post')

In [None]:
# Sanity check: expect (number of training reviews, maxlen).
X_train.shape

(40000, 130)

In [None]:
from keras.models import Model, Sequential
from keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Embedding, Dropout

# Binary sentiment classifier:
#   token ids -> 128-d embeddings -> bidirectional LSTM (32 units per direction,
#   full sequence output) -> max over the time axis -> small dense head ->
#   single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 128),
    Bidirectional(LSTM(32, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(20, activation="relu"),
    Dropout(0.05),
    Dense(1, activation="sigmoid"),
])

# Single sigmoid output, hence binary cross-entropy as the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 128)         768000    
                                                                 
 bidirectional_3 (Bidirectio  (None, None, 64)         41216     
 nal)                                                            
                                                                 
 global_max_pooling1d_3 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 20)                1300      
                                                                 
 dropout_3 (Dropout)         (None, 20)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                

In [None]:
# Train for three epochs and keep only the `.history` attribute of the object
# returned by fit().
# NOTE(review): the held-out test split is passed as validation data, so no
# untouched set remains for a final evaluation — confirm this is intended.
history = model.fit(
    X_train, y_train,
    epochs=3,
    validation_data=(X_test, y_test)
).history

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
def predict_text(input_text, tokenizer, model, maxlen_seq=128, padding = 'post', truncating = 'post'):
    """Score a single piece of text with a trained model.

    Parameters
    ----------
    input_text : object
        The text to score; it is converted with ``str()`` first.
    tokenizer : object
        Fitted tokenizer exposing ``texts_to_sequences``.
    model : object
        Trained model exposing ``predict``.
    maxlen_seq : int, default 128
        Length the encoded sequence is padded/truncated to.
    padding, truncating : str, default 'post'
        Forwarded unchanged to ``pad_sequences``.

    Returns
    -------
    The value returned by ``model.predict`` for the one padded sequence.

    Notes
    -----
    NOTE(review): the training cell pads to ``maxlen`` (130) without passing
    ``truncating``, whereas the defaults here are 128 and 'post'. The text is
    also not run through the cleaning functions applied to the training data.
    Confirm whether callers should pass ``maxlen_seq=maxlen`` and pre-clean
    the input.
    """
    encoded = tokenizer.texts_to_sequences([str(input_text)])
    padded = pad_sequences(
        encoded,
        maxlen=maxlen_seq,
        padding=padding,
        truncating=truncating,
    )
    return model.predict(padded)

In [None]:
# Smoke-test the trained model on one hand-written review; the call returns
# the model's sigmoid output for that text.
predict_text('hands down the best movie i ever watched.', token, model)



array([[0.6649942]], dtype=float32)