In [1]:
#Task 1: Transform the Text Data for Sentiment Analysis

# a. Preprocess Text Data (Common for both datasets):

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [20]:
# Encode the sentiments
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(tweets_df['sentiment'])
y = pd.get_dummies(y).values

#Develop and Train the Model:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SpatialDropout1D, Dense

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

In [24]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))  # Assuming 4 classes for sentiment <--- Changed from 3 to 4

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [25]:
# Train the model
model.fit(X, y, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 252ms/step - accuracy: 0.5204 - loss: 1.0958 - val_accuracy: 0.4726 - val_loss: 1.4628
Epoch 2/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 248ms/step - accuracy: 0.8540 - loss: 0.3944 - val_accuracy: 0.4592 - val_loss: 1.8038
Epoch 3/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 249ms/step - accuracy: 0.9018 - loss: 0.2593 - val_accuracy: 0.4575 - val_loss: 2.2387
Epoch 4/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 254ms/step - accuracy: 0.9163 - loss: 0.2115 - val_accuracy: 0.4442 - val_loss: 2.5146
Epoch 5/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 248ms/step - accuracy: 0.9247 - loss: 0.1860 - val_accuracy: 0.4398 - val_loss: 2.5152


<keras.src.callbacks.history.History at 0x7c4576cdd480>

In [26]:
# Tokenize and pad the news headlines

news_sequences = tokenizer.texts_to_sequences(news_df['cleaned_headline'])
news_padded = pad_sequences(news_sequences, maxlen=100)

In [27]:
#Predict Sentiments:

# Predict the sentiment for each news headline
predicted_sentiments = model.predict(news_padded)

# Convert predictions to sentiment labels
predicted_sentiment_labels = label_encoder.inverse_transform(predicted_sentiments.argmax(axis=1))

# Add predicted sentiment labels to the news dataframe
news_df['Predicted Sentiment'] = predicted_sentiment_labels

[1m32768/32768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m948s[0m 29ms/step


In [28]:
#Task 3: Evaluate Sentiment Analysis Models Based on Accuracy

# Evaluate the model
loss, accuracy = model.evaluate(X, y, verbose=1)
print(f'Accuracy on Twitter data: {accuracy}')

[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 30ms/step - accuracy: 0.9322 - loss: 0.1916
Accuracy on Twitter data: 0.8431215286254883
