## Using Traditional ML Methods

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import nltk
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from wordcloud import WordCloud

# English stop-word set and Snowball stemmer shared by clean_reviews() below.
# NOTE(review): stopwords.words() presumably needs the NLTK 'stopwords' corpus to be
# downloaded beforehand — confirm on a fresh environment.
sw = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the raw reviews, then preview the first row together with the frame's shape.
df = pd.read_csv(r'movies_sentiment_data.csv')
display(df.head(1), df.shape)

In [None]:
# Binary-encode the target: 'positive' -> 1, every other value -> 0.
# Guarded so that re-running the cell on the already-encoded column is a no-op;
# without the guard a second run compares integers to 'positive' and silently
# turns every label into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)

In [None]:
# Class balance of the encoded target column.
df['sentiment'].value_counts()

In [None]:
def clean_reviews(text):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML markup, drop URLs and @mentions, drop every
    character that is not an ASCII letter, digit or whitespace, lowercase,
    drop digits, remove English stop words (``sw``) and stem the remaining
    words (``stemmer``).

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned review; may be an empty string.
    """
    # separator=' ' keeps the words on either side of a tag apart. With the
    # default '' separator "great.<br /><br />The" becomes "great.The", and the
    # punctuation strip below then fuses it into the single token "greatThe".
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Stop-word filtering and stemming in a single pass over the tokens.
    text = ' '.join(stemmer.stem(word) for word in text.split() if word not in sw)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Clean every review element-wise and keep the result in its own column.
df['clean_reviews'] = df['review'].map(clean_reviews)
df.head(1)

In [None]:
# Word cloud over the whole cleaned corpus, drawn on a 15x15-inch figure.
word_cloud_text = ' '.join(df['clean_reviews'])
wc = WordCloud(background_color='white').generate(word_cloud_text)
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wc);

In [None]:
# Features are the cleaned review strings; the target is the 0/1 sentiment label.
y = df['sentiment']
X = df['clean_reviews']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
def train_model(vectorizer, classifier, X_train, y_train, X_eval=None, y_eval=None):
    """Fit a vectorizer + classifier pipeline and report its hold-out performance.

    Parameters
    ----------
    vectorizer : text vectorizer placed first in the pipeline (step name ``'vec'``).
    classifier : estimator placed second in the pipeline (step name ``'clf'``).
    X_train, y_train : training texts and labels the pipeline is fitted on.
    X_eval, y_eval : optional evaluation texts and labels. When either is
        omitted, the notebook-level ``X_test`` / ``y_test`` split is used.

    Returns
    -------
    Pipeline
        The pipeline fitted on ``X_train`` / ``y_train``.
    """
    if X_eval is None or y_eval is None:
        # Fall back to the notebook-level hold-out split.
        X_eval, y_eval = X_test, y_test

    model = Pipeline([
        ('vec', vectorizer),
        ('clf', classifier)
    ])

    display(model.fit(X_train, y_train))

    # Score the pipeline that was just fitted. cross_val_predict would instead
    # refit fresh clones on folds of the evaluation data, so the printed report
    # would not describe the returned model at all.
    y_pred = model.predict(X_eval)

    # scikit-learn metrics take (y_true, y_pred); swapping them exchanges
    # precision with recall and transposes the confusion matrix.
    print(classification_report(y_eval, y_pred))
    print(confusion_matrix(y_eval, y_pred))
    return model

In [None]:
# Bag-of-words counts + random forest. The forest is seeded (same seed as the
# train/test split) so the reported metrics are reproducible across runs.
model1 = train_model(CountVectorizer(), RandomForestClassifier(random_state=42), X_train, y_train)

In [None]:
# TF-IDF features + random forest. The forest is seeded (same seed as the
# train/test split) so the reported metrics are reproducible across runs.
model2 = train_model(TfidfVectorizer(), RandomForestClassifier(random_state=42), X_train, y_train)

In [None]:
# Bag-of-words counts + multinomial naive Bayes.
model3 = train_model(
    vectorizer=CountVectorizer(),
    classifier=MultinomialNB(),
    X_train=X_train,
    y_train=y_train,
)

In [None]:
# TF-IDF features + multinomial naive Bayes.
model4 = train_model(
    vectorizer=TfidfVectorizer(),
    classifier=MultinomialNB(),
    X_train=X_train,
    y_train=y_train,
)

## Using ANN

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Reload the raw reviews for this section; preview the first row and the frame's shape.
df = pd.read_csv(r'movies_sentiment_data.csv')
display(df.head(1), df.shape)

In [None]:
# Binary-encode the target: 'positive' -> 1, every other value -> 0.
# Guarded so that re-running the cell on the already-encoded column is a no-op;
# without the guard a second run silently turns every label into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)
df['sentiment'].value_counts()

In [None]:
def clean_reviews(text):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned review; may be an empty string.
    """
    # Strip HTML markup. separator=' ' keeps the words on either side of a tag
    # apart; with the default '' separator "great.<br /><br />The" becomes
    # "great.The" and the punctuation strip below fuses it into one token.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # Remove URLs (anything starting with http, https or www)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove @mentions: an '@' followed by word characters. For an e-mail
    # address this only removes the part from '@' up to the first non-word character.
    text = re.sub(r'@\w+', '', text)

    # Remove everything except ASCII letters, digits and whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Convert the text to lowercase
    text = text.lower()

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Drop stop words ('sw' is the notebook-level English stop-word set)
    text = ' '.join([word for word in text.split() if word not in sw])

    # Stem each remaining word ('stemmer' is the notebook-level SnowballStemmer)
    text = ' '.join([stemmer.stem(word) for word in text.split()])

    # Collapse runs of whitespace and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Clean every review element-wise and keep the result in its own column.
df['cleaned_reviews'] = df['review'].map(clean_reviews)

In [None]:
# Word-index tokenizer capped at num_words=10000; the same 10000 is the vocabulary
# size passed to the Embedding layers of the ANN and LSTM models below.
tokenizer = Tokenizer(num_words=10000)

In [None]:
# Fit on this section's freshly cleaned column rather than on `X`, a variable left
# over from the traditional-ML section that is never defined in this section.
tokenizer.fit_on_texts(df['cleaned_reviews'])

In [None]:
# Encode this section's cleaned reviews (not the leftover `X` from the traditional-ML
# section) as integer sequences, then pad/truncate every sequence to 100 tokens.
X_seq = tokenizer.texts_to_sequences(df['cleaned_reviews'])
X_pad = pad_sequences(X_seq, maxlen=100)

In [None]:
# Sanity check: expected to be (number of reviews, 100) given the padding length of 100.
X_pad.shape

In [None]:
# Labels come from this section's `df` rather than from the `y` left over from the
# traditional-ML section, so the cell no longer depends on state from another section.
y = df['sentiment']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Feed-forward sentiment classifier on top of a learned word embedding.
ann_model = keras.Sequential([
    keras.layers.Embedding(10000, 100),
    # Average the per-token embeddings into one vector per review. Without a
    # reduction over the sequence axis the Dense layers are applied to every
    # token position separately and the network emits one prediction per token
    # instead of one per review, which does not match the single 0/1 label
    # each review has.
    keras.layers.GlobalAveragePooling1D(),

    keras.layers.Dense(512, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.5),

    keras.layers.Dense(256, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.5),

    keras.layers.Dense(128, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.5),

    # Single sigmoid unit: probability that the review is positive.
    keras.layers.Dense(1, activation='sigmoid')
])

ann_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and roll back to the best weights.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',patience=5, restore_best_weights=True)

# 20% of the training split is held out for validation during fitting.
history = ann_model.fit(X_train, y_train,
                        epochs=20, batch_size = 32,
                        callbacks=[early_stop],
                        validation_split=0.2)

In [None]:
# Training vs. validation loss per epoch for the ANN run. Title and axis labels are
# set so this figure can be told apart from the identically shaped LSTM plot.
fig, ax = plt.subplots()
ax.plot(history.history['val_loss'], label='val_loss')
ax.plot(history.history['loss'], label='loss')
ax.set(title='ANN: loss per epoch', xlabel='epoch', ylabel='binary cross-entropy')
ax.legend();

In [None]:
# Training vs. validation accuracy per epoch for the ANN run. Title and axis labels
# are set so this figure can be told apart from the identically shaped LSTM plot.
fig, ax = plt.subplots()
ax.plot(history.history['val_accuracy'], label='val_accuracy')
ax.plot(history.history['accuracy'], label='accuracy')
ax.set(title='ANN: accuracy per epoch', xlabel='epoch', ylabel='accuracy')
ax.legend();

## LSTM Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Stacked-LSTM classifier: a 128-dimensional embedding of the 10000-word vocabulary,
# then three LSTM layers (512 -> 256 -> 128 units), each followed by batch
# normalisation and 50% dropout, and a single sigmoid output unit.
lstm_model = Sequential()
lstm_model.add(keras.layers.Embedding(input_dim=10000, output_dim=128))
for units, keep_sequence in ((512, True), (256, True), (128, False)):
    # Only the last LSTM collapses the sequence to its final state.
    lstm_model.add(LSTM(units, return_sequences=keep_sequence))
    lstm_model.add(BatchNormalization())
    lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(1, activation='sigmoid'))

# Adam optimiser, binary cross-entropy loss, accuracy as the tracked metric.
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop after 5 epochs without validation-loss improvement and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Up to 30 epochs, with 20% of the training split held out for validation.
history = lstm_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    callbacks=[early_stop],
    batch_size=32,
    epochs=30,
)

In [None]:
# Training vs. validation loss per epoch for the LSTM run. Title and axis labels are
# set so this figure can be told apart from the identically shaped ANN plot.
fig, ax = plt.subplots()
ax.plot(history.history['val_loss'], label='val_loss')
ax.plot(history.history['loss'], label='loss')
ax.set(title='LSTM: loss per epoch', xlabel='epoch', ylabel='binary cross-entropy')
ax.legend();

In [None]:
# Training vs. validation accuracy per epoch for the LSTM run. Title and axis labels
# are set so this figure can be told apart from the identically shaped ANN plot.
fig, ax = plt.subplots()
ax.plot(history.history['val_accuracy'], label='val_accuracy')
ax.plot(history.history['accuracy'], label='accuracy')
ax.set(title='LSTM: accuracy per epoch', xlabel='epoch', ylabel='accuracy')
ax.legend();

## Using BERT

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import regex as re
from tensorflow import keras
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, TFAutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')
import spacy

from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

In [None]:
# Load the CSV by relative path, as the other sections of this notebook do. The
# previous absolute path pointed into one user's Desktop folder, so the cell failed
# on any other machine.
df = pd.read_csv(r'movies_sentiment_data.csv')
display(df.head(1))
display(df.shape)

In [None]:
# Binary-encode the target: 'positive' -> 1, every other value -> 0.
# Guarded so that re-running the cell on the already-encoded column is a no-op;
# without the guard a second run silently turns every label into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)
df['sentiment'].value_counts()

In [None]:
# Load the spaCy pipeline once, outside clean_reviews, so it is not re-created for
# every review. Only token lemmas are read from the result, so the dependency
# parser and the named-entity recogniser are switched off to save time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
doc = nlp  # original name of the pipeline object, kept so existing references still work


def clean_reviews(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned review; may be an empty string.
    """
    # Strip HTML markup. separator=' ' keeps the words on either side of a tag
    # apart; with the default '' separator "great.<br /><br />The" becomes
    # "great.The" and the punctuation strip below fuses it into one token.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # Remove URLs (anything starting with http, https or www)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove @mentions: an '@' followed by word characters
    text = re.sub(r'@\w+', '', text)

    # Remove everything except ASCII letters, digits and whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Convert the text to lowercase
    text = text.lower()

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Drop stop words ('sw' is the notebook-level English stop-word set)
    text = ' '.join([word for word in text.split() if word not in sw])

    # Replace every token by its spaCy lemma
    text = " ".join([token.lemma_ for token in nlp(text)])

    # Collapse runs of whitespace and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Clean every review element-wise and keep the result in its own column.
df['cleaned_reviews'] = df['review'].map(clean_reviews)

In [None]:
# Pretrained 'bert-base-uncased' model; it is wrapped by BERTForClassification further down.
model = TFAutoModel.from_pretrained('bert-base-uncased')

In [None]:
# Tokenizer loaded from the same 'bert-base-uncased' checkpoint name as the model above.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize(text, max_length = 128):
    """Encode ``text`` with the notebook-level ``tokenizer``.

    The text is padded or truncated to ``max_length`` and returned as TensorFlow
    tensors.

    Returns
    -------
    tuple
        ``(input_ids, token_type_ids, attention_mask)``
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    wanted_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    return tuple(encoded[key] for key in wanted_keys)

In [None]:
input_ids, token_type_ids, attention_masks = [], [], []

for text in df['cleaned_reviews']:
    # Tokenize each review once and unpack all three tensors from that single
    # call; calling tokenize() separately for each list did the same work three times.
    ids, type_ids, mask = tokenize(text)
    input_ids.append(ids)
    token_type_ids.append(type_ids)
    attention_masks.append(mask)

In [None]:
# Turn each Python list of per-review tensors into a single tensor, then drop the
# axis at position 1 from the stacked result.
input_ids, token_type_ids, attention_masks = (
    tf.squeeze(tf.convert_to_tensor(per_review), axis=1)
    for per_review in (input_ids, token_type_ids, attention_masks)
)

In [None]:
# Target labels (the 0/1 encoded sentiment column) that the next cell pairs with the encoded inputs.
labels = df['sentiment']

In [None]:
# Slice the three encoded input tensors (keyed by name) and the labels into
# per-review (features, label) pairs.
bert_inputs = {
    'input_ids': input_ids,
    'token_type_ids': token_type_ids,
    'attention_mask': attention_masks,
}
train_dataset = tf.data.Dataset.from_tensor_slices((bert_inputs, labels))

In [None]:
# Shuffle and batch the dataset.
# The shuffle buffer covers the whole dataset: a buffer smaller than the dataset only
# shuffles within a sliding window, which leaves batches skewed when the source file
# is ordered (for example, grouped by label).
train_dataset = train_dataset.shuffle(buffer_size=len(labels)).batch(16)  # Adjust batch size as necessary

In [None]:
# Binary classifier: the supplied BERT model followed by a single sigmoid unit.
class BERTForClassification(tf.keras.Model):
    """Keras model that puts a one-unit sigmoid ``Dense`` head on top of ``bert_model``."""

    def __init__(self, bert_model):
        """Keep a reference to ``bert_model`` and create the classification head."""
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """Pass ``inputs`` through the BERT model and classify its second output."""
        # Index 1 of the model output is presumably BERT's pooled representation —
        # TODO confirm against the transformers documentation.
        return self.fc(self.bert(inputs)[1])


In [None]:
# Wrap the pretrained model loaded above in the classification model.
classifier = BERTForClassification(bert_model=model)

In [None]:
# Training setup: Adam with a learning rate of 1e-5, binary cross-entropy loss and
# accuracy as the tracked metric.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
bce = tf.keras.losses.BinaryCrossentropy()
classifier.compile(optimizer=adam, loss=bce, metrics=['accuracy'])

# Three passes over the shuffled, batched dataset; the fit result is kept in `history`.
history = classifier.fit(train_dataset, epochs=3)