In [1]:

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [5]:
df = pd.read_csv("E:\\nlp_project\\sentiment_analysis.csv", encoding="ISO-8859-1", )

In [3]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove user @ references
    text = re.sub(r'\@\w+','', text)
    
    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

In [6]:
df['target'] = df['target'].apply(lambda x: 1 if x == 4 else 0)
df['cleaned_text'] = df['text'].apply(clean_text)

In [7]:
max_features = 5000
max_len = 50

In [8]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['cleaned_text'])
X = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(X, maxlen=max_len)

y = df['target'].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=1,
    min_lr=0.00001
)
callbacks = [early_stopping, reduce_lr]

In [11]:
lstm_model = Sequential([
    Embedding(max_features, 64, input_length=max_len),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [13]:
lstm_history = lstm_model.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=64, 
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [14]:
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test, y_test, verbose=0)
print(f"\nLSTM Test Accuracy: {lstm_accuracy:.4f}")


LSTM Test Accuracy: 0.7842


In [15]:
cnn_model = Sequential([
    Embedding(max_features, 64, input_length=max_len),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [16]:
cnn_history = cnn_model.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=64, 
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [17]:
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test, y_test, verbose=0)
print(f"\nCNN Test Accuracy: {cnn_accuracy:.4f}")


CNN Test Accuracy: 0.7800


In [None]:
hybrid_model = Sequential([
    Embedding(max_features, 64, input_length=max_len),
    Conv1D(32, 5, activation='relu'),
    Reshape((max_len-4, 32)),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
hybrid_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [19]:
hybrid_history = hybrid_model.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=64, 
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [20]:
hybrid_loss, hybrid_accuracy = hybrid_model.evaluate(X_test, y_test, verbose=0)
print(f"\nHybrid LSTM-CNN Test Accuracy: {hybrid_accuracy:.4f}")


Hybrid LSTM-CNN Test Accuracy: 0.7814
