In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, SpatialDropout1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import random

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the dataset
columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv('/content/drive/MyDrive/sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin-1', names=columns)

# Select only the 'sentiment' and 'text' columns
df = df[['sentiment', 'text']]

# Lowercasing
df['text'] = df['text'].str.lower()

# Removing URLs
df['text'] = df['text'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', '', x))

# Removing special characters and numbers
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Tokenization
df['text'] = df['text'].apply(word_tokenize)

# Removing stopwords
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Combining tokens back into sentences
df['text'] = df['text'].apply(lambda x: ' '.join(x))

# Tokenize the preprocessed text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=100)

# Train/test split
df['sentiment'] = df['sentiment'].map(lambda x: 1 if x == 4 else 0)
y = df['sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=[sentence.split() for sentence in df['text']], vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train([sentence.split() for sentence in df['text']], total_examples=word2vec_model.corpus_count, epochs=10)

# Create embedding matrix
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

# Build the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, weights=[embedding_matrix], input_length=100, trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Conv1D(filters=256, kernel_size=4, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Conv1D(filters=256, kernel_size=5, activation='relu'))
model.add(BatchNormalization())
model.add(GlobalMaxPooling1D())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=128, validation_data=(X_test, y_test), callbacks=[reduce_lr, early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# Select 10 random sentences from the test dataset
random_indices = random.sample(range(X_test.shape[0]), 10)
X_sample = X_test[random_indices]
y_sample = y_test[random_indices]

# Predict the labels for the selected sentences
y_pred_sample = model.predict(X_sample)

# Print the results
for i in range(10):
    print(f"Sentence: {df['text'].iloc[random_indices[i]]}")
    print(f"Predicted Sentiment: {round(y_pred_sample[i][0])}, True Sentiment: {y_sample[i]}\n")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.7956968545913696
Sentence: feeling really sick today
Predicted Sentiment: 0, True Sentiment: 0

Sentence: also hope didnt freak alice wasnt intentional
Predicted Sentiment: 1, True Sentiment: 1

Sentence: ledoug actually planning working wth ala meeting kaiboshed plan
Predicted Sentiment: 0, True Sentiment: 1

Sentence: hey xonline nooo miss rob xlive live gt
Predicted Sentiment: 1, True Sentiment: 1

Sentence: voodoo met really cool person bus stop lol
Predicted Sentiment: 0, True Sentiment: 0

Sentence: already dreading going back work tuesday
Predicted Sentiment: 0, True Sentiment: 0

Sentence: sad everyone left houston
Predicted Sentiment: 1, True Sentiment: 0

Sentence: audreypanda alright feel better get better soon maybe caught swine f