In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from deap import base, creator, tools, algorithms
import random

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OMEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def load_data():
    column_names = ["sentiment", "id", "date", "query", "username", "text"]
    dataset = pd.read_csv("C:/Users/OMEN/Documents/sem6/otaiml/training.1600000.processed.noemoticon.csv",
                          encoding='latin-1', names=column_names, header=None)
    dataset = dataset[["sentiment", "text"]]
    dataset["sentiment"] = dataset["sentiment"].replace({0: 0, 4: 1})  # Convert 4 (positive) to 1
    dataset.dropna(inplace=True)
    dataset.drop_duplicates(inplace=True)
    dataset = dataset[dataset["text"].str.split().str.len() >= 3]  # Remove very short tweets
    return dataset

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Keep only letters
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text


In [6]:
def preprocess_data(dataset):
    dataset['text'] = dataset['text'].apply(clean_text)
    return dataset

In [7]:
def train_word2vec(sentences):
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
    return model

In [8]:
def aco_feature_selection(X, y):
    def evaluate(individual):
        selected_features = [index for index, value in enumerate(individual) if value == 1]
        if len(selected_features) == 0:
            return (0,)  # Avoid division by zero
        X_selected = X[:, selected_features]
        X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

        model = tf.keras.Sequential([
            Dense(10, activation='relu', input_shape=(X_selected.shape[1],)),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=0)
        
        _, accuracy = model.evaluate(X_test, y_test, verbose=0)
        return (accuracy,)
    if "FitnessMax" not in creator.__dict__:
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    if "Individual" not in creator.__dict__:
        creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)
    toolbox.register("evaluate", evaluate)

    population = toolbox.population(n=20)
    algorithms.eaMuPlusLambda(population, toolbox, mu=20, lambda_=40, cxpb=0.5, mutpb=0.2, ngen=10, verbose=True)
    
    best_individual = tools.selBest(population, 1)[0]
    return [index for index, value in enumerate(best_individual) if value == 1]

In [9]:
def train_lstm(X_train, X_test, y_train, y_test, vocab_size, max_length):
    model = Sequential([
        Embedding(vocab_size, 100, input_length=max_length),
        SpatialDropout1D(0.2),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

    # Predict and evaluate performance
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    return model

In [10]:
dataset = load_data()
dataset = preprocess_data(dataset)

# Tokenize text for Word2Vec
tokenized_texts = [nltk.word_tokenize(text) for text in dataset['text']]

# Train Word2Vec model
word2vec_model = train_word2vec(tokenized_texts)

In [11]:
X = np.array([
    np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) 
    for words in tokenized_texts
])

y = np.array(dataset['sentiment'])

In [12]:
selected_features = aco_feature_selection(X, y)
X_selected = X[:, selected_features]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


gen	nevals
0  	20    
1  	30    
2  	26    
3  	31    
4  	26    
5  	24    
6  	29    
7  	26    
8  	26    
9  	29    
10 	27    


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [14]:
train_lstm(X_train, X_test, y_train, y_test, vocab_size=len(word2vec_model.wv), max_length=X_selected.shape[1])




Epoch 1/5
19462/19462 - 2238s - 115ms/step - accuracy: 0.5792 - loss: 0.6704 - val_accuracy: 0.6049 - val_loss: 0.6563
Epoch 2/5
19462/19462 - 2214s - 114ms/step - accuracy: 0.6064 - loss: 0.6539 - val_accuracy: 0.6144 - val_loss: 0.6460
Epoch 3/5
19462/19462 - 2256s - 116ms/step - accuracy: 0.6135 - loss: 0.6468 - val_accuracy: 0.6202 - val_loss: 0.6400
Epoch 4/5
19462/19462 - 2259s - 116ms/step - accuracy: 0.6181 - loss: 0.6427 - val_accuracy: 0.6267 - val_loss: 0.6358
Epoch 5/5
19462/19462 - 2250s - 116ms/step - accuracy: 0.6218 - loss: 0.6393 - val_accuracy: 0.6297 - val_loss: 0.6325
[1m9731/9731[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 13ms/step
Accuracy: 0.6297
Precision: 0.6242
Recall: 0.6544
F1 Score: 0.6389


<Sequential name=sequential_294, built=True>