In [2]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from deap import base, creator, tools, algorithms
import random
import multiprocessing

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(nltk.corpus.stopwords.words('english'))
from nltk.corpus import wordnet

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def load_balanced_data():
    column_names = ["sentiment", "id", "date", "query", "username", "text"]
    dataset = pd.read_csv("C:/Users/OMEN/Documents/sem6/otaiml/training.1600000.processed.noemoticon.csv",
                          encoding='latin-1', names=column_names, header=None)
    dataset = dataset[["sentiment", "text"]]
    dataset["sentiment"] = dataset["sentiment"].replace({0: 0, 4: 1})  # Convert 4 (positive) to 1
    dataset.dropna(inplace=True)
    dataset.drop_duplicates(inplace=True)

    # ✅ Balance dataset (equal number of positive & negative samples)
    pos_samples = dataset[dataset["sentiment"] == 1]
    neg_samples = dataset[dataset["sentiment"] == 0]
    min_size = min(len(pos_samples), len(neg_samples))
    pos_samples = resample(pos_samples, replace=False, n_samples=min_size, random_state=42)
    neg_samples = resample(neg_samples, replace=False, n_samples=min_size, random_state=42)

    # Merge balanced dataset & shuffle
    dataset = pd.concat([pos_samples, neg_samples]).sample(frac=1, random_state=42)
    return dataset

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Keep only letters
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [6]:
def preprocess_data(dataset):
    dataset['text'] = dataset['text'].apply(clean_text)
    return dataset

In [7]:
def train_word2vec(sentences):
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
    return model

In [8]:
def moaco_feature_selection(X, y):
    def evaluate(individual):
        selected_features = [index for index, value in enumerate(individual) if value == 1]
        if len(selected_features) == 0:
            return (0, 1e6)  # Penalize empty selections
        
        X_selected = X[:, selected_features]
        X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, stratify=y, random_state=42)
        
        model = tf.keras.Sequential([
            Dense(16, activation='relu', input_shape=(X_selected.shape[1],)), 
            Dense(8, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_train, y_train, epochs=5, batch_size=128, verbose=0)
        
        loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
        return accuracy, len(selected_features)
    
    creator.create("FitnessMulti", base.Fitness, weights=(2.0, -0.5))  # More focus on accuracy
    creator.create("Individual", list, fitness=creator.FitnessMulti)

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selNSGA2)
    toolbox.register("evaluate", evaluate)

    population = toolbox.population(n=10)  
    algorithms.eaMuPlusLambda(population, toolbox, mu=10, lambda_=20, cxpb=0.5, mutpb=0.2, ngen=10, verbose=True)

    best_individual = tools.selBest(population, 1)[0]
    return [index for index, value in enumerate(best_individual) if value == 1]

In [9]:
def train_lstm(X_train, X_test, y_train, y_test, vocab_size, max_length):
    model = Sequential([
        Embedding(vocab_size, 100, input_length=max_length),
        SpatialDropout1D(0.3),
        LSTM(64, return_sequences=True, dropout=0.4, recurrent_dropout=0.4),
        LSTM(32, dropout=0.4, recurrent_dropout=0.4),
        Dense(16, activation='relu'),
        Dropout(0.3),  # Prevent overfitting
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=1e-5)
    
    model.fit(X_train, y_train, epochs=20, batch_size=128, validation_data=(X_test, y_test), verbose=2, callbacks=[lr_scheduler])
    return model

In [10]:
def evaluate_moaco_model(model, X_test, y_test):
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("\n=== Model Evaluation Metrics ===")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    
    return accuracy, precision, recall, f1

In [11]:
print("Loading dataset...")
dataset = load_balanced_data()
dataset = preprocess_data(dataset)

print("Tokenizing text for Word2Vec...")
tokenized_texts = [nltk.word_tokenize(text) for text in dataset['text']]

print("Training Word2Vec model...")
word2vec_model = train_word2vec(tokenized_texts)

Loading dataset...
Tokenizing text for Word2Vec...
Training Word2Vec model...


In [12]:
print("Converting words to vector representations...")
X = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in tokenized_texts])
y = np.array(dataset['sentiment'])

Converting words to vector representations...


In [13]:
print("Performing MOACO-based feature selection...")
selected_features = moaco_feature_selection(X, y)
X_selected = X[:, selected_features] if selected_features else X  

Performing MOACO-based feature selection...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


gen	nevals
0  	10    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


1  	12    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


2  	13    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

3  	17    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


4  	11    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

5  	15    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


6  	11    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

7  	14    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

8  	16    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

9  	16    


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


10 	12    


In [14]:
print("Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, stratify=y, random_state=42)

Splitting dataset...


In [15]:
print("Training LSTM model...")
model = train_lstm(X_train, X_test, y_train, y_test, vocab_size=len(word2vec_model.wv), max_length=X_selected.shape[1])

Training LSTM model...




Epoch 1/20
9878/9878 - 1369s - 139ms/step - accuracy: 0.5743 - loss: 0.6726 - val_accuracy: 0.6018 - val_loss: 0.6573 - learning_rate: 0.0010
Epoch 2/20
9878/9878 - 1289s - 130ms/step - accuracy: 0.5985 - loss: 0.6602 - val_accuracy: 0.6149 - val_loss: 0.6481 - learning_rate: 0.0010
Epoch 3/20
9878/9878 - 1289s - 130ms/step - accuracy: 0.6070 - loss: 0.6543 - val_accuracy: 0.6227 - val_loss: 0.6406 - learning_rate: 0.0010
Epoch 4/20
9878/9878 - 1289s - 130ms/step - accuracy: 0.6145 - loss: 0.6485 - val_accuracy: 0.6271 - val_loss: 0.6355 - learning_rate: 0.0010
Epoch 5/20
9878/9878 - 1288s - 130ms/step - accuracy: 0.6195 - loss: 0.6449 - val_accuracy: 0.6301 - val_loss: 0.6330 - learning_rate: 0.0010
Epoch 6/20
9878/9878 - 1288s - 130ms/step - accuracy: 0.6219 - loss: 0.6432 - val_accuracy: 0.6314 - val_loss: 0.6315 - learning_rate: 0.0010
Epoch 7/20
9878/9878 - 1288s - 130ms/step - accuracy: 0.6237 - loss: 0.6414 - val_accuracy: 0.6328 - val_loss: 0.6306 - learning_rate: 0.0010
Epoch 

In [16]:
print("Evaluating model performance...")
evaluate_moaco_model(model, X_test, y_test)

Evaluating model performance...
[1m9878/9878[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 10ms/step

=== Model Evaluation Metrics ===
Accuracy:  0.6479
Precision: 0.6316
Recall:    0.7098
F1 Score:  0.6684


(0.6479242202775299,
 0.6316308087320311,
 0.7098147902073565,
 0.6684443861802667)