In [21]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.layers import LSTM, Bidirectional, SpatialDropout1D, GlobalAveragePooling1D
from keras.layers import Conv2D, MaxPooling2D, Flatten, MaxPooling1D
from keras.layers import Reshape


In [22]:
# Load the Sentiment140 dataset
columns = ["polarity", "id", "date", "query", "username", "text"]
df = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1", names=columns)

# Drop unnecessary columns
df = df.drop(["id", "date", "query", "username"], axis=1)
df

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [23]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# Clean the text data
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove mentions
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # Remove special characters
    return text.lower().strip()

df['clean_text'] = df['text'].apply(clean_text)

# Tokenization and removing stopwords
stop_words = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

df['tokenized_text'] = df['clean_text'].apply(tokenize_text)
df

Unnamed: 0,polarity,text,clean_text,tokenized_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that s a bummer you shoulda got david car...,"[awww, bummer, shoulda, got, david, carr, thir..."
1,0,is upset that he can't update his Facebook by ...,is upset that he can t update his facebook by ...,"[upset, update, facebook, texting, might, cry,..."
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...,"[dived, many, times, ball, managed, save, 50, ..."
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[whole, body, feels, itchy, like, fire]"
4,0,"@nationwideclass no, it's not behaving at all....",no it s not behaving at all i m mad why am i h...,"[behaving, mad, see]"
...,...,...,...,...
1599995,4,Just woke up. Having no school is the best fee...,just woke up having no school is the best feel...,"[woke, school, best, feeling, ever]"
1599996,4,TheWDB.com - Very cool to hear old Walt interv...,thewdb com very cool to hear old walt interviews,"[thewdb, com, cool, hear, old, walt, interviews]"
1599997,4,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover ask me fo...,"[ready, mojo, makeover, ask, details]"
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time tup...,"[happy, 38th, birthday, boo, alll, time, tupac..."


In [25]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_text'], df['polarity'], test_size=0.2, random_state=42)


In [26]:
max_words = 10000
max_len = 100
embedding_dim = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Encode polarity labels
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

In [27]:
# Define multiple CNN architectures
def create_model_1():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    return model



def create_model_2():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    return model


def create_model_3():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(MaxPooling1D(3))
    model.add(Conv1D(128, 4, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    return model



def create_model_4():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(Conv1D(64, 2, activation='relu'))
    model.add(Conv1D(64, 2, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 2, activation='relu'))
    model.add(Conv1D(128, 2, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    return model

def model5Conv2D():
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    # Reshape the input to have a 2D structure suitable for Conv2D
    model.add(Reshape((max_len, embedding_dim, 1)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [28]:
# Train and evaluate models
models = [create_model_1, create_model_2, create_model_3, create_model_4, model5Conv2D]

for i, create_model_func in enumerate(models):
    print(f"Training Model {i+1}")
    model = create_model_func()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train_pad, y_train_enc, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test_enc))

    loss, accuracy = model.evaluate(X_test_pad, y_test_enc)
    print(f"Test Accuracy for Model {i+1}: {accuracy}")

    y_pred_prob = model.predict(X_test_pad)
    y_pred = (y_pred_prob > 0.5).astype(int)
    y_pred = label_encoder.inverse_transform(y_pred.flatten())

    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Precision for Model {i+1}: {precision}")
    print(f"Recall for Model {i+1}: {recall}")
    print(f"F1 Score for Model {i+1}: {f1}")

Training Model 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy for Model 1: 0.7768281102180481
Precision for Model 1: 0.7774413061743503
Recall for Model 1: 0.776828125
F1 Score for Model 1: 0.7766808475772624
Training Model 2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy for Model 2: 0.7815968990325928
Precision for Model 2: 0.7818304855343212
Recall for Model 2: 0.781596875
F1 Score for Model 2: 0.7815648270196535
Training Model 3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy for Model 3: 0.7352937459945679
Precision for Model 3: 0.735982690856904
Recall for Model 3: 0.73529375
F1 Score for Model 3: 0.735063926914758
Training Model 4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy for Model 4: 0.6559218764305115
Precision for Model 4: 0.6648366747063427
Recall for Model 4: 0.655921875
F1 Score for Model 4: 0.650941674897976
Training Model 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy for Model 5

  _warn_prf(average, modifier, msg_start, len(result))
