In [21]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
import joblib

In [22]:
# Step 1: Load the dataset
data_path = r'C:\Users\anura\Desktop\Project 4- Climate Change Modeling\data\raw\climate_nasa.csv'
data = pd.read_csv(data_path)

In [23]:
# Step 2: Extract text data for sentiment analysis
data = data.dropna(subset=['text', 'commentsCount'])

# Create multiclass labels based on commentsCount
def generate_sentiment_label(count):
    if count < 0:
        return 0  # Negative sentiment
    elif count == 0:
        return 1  # Neutral sentiment
    else:
        return 2  # Positive sentiment

data['sentiment'] = data['commentsCount'].apply(generate_sentiment_label)


In [None]:
# Step 3: Preprocess the text
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    text = " ".join(word for word in text.split() if word not in stop_words)  
    return text

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Step 4: Split data into training and testing sets
X = data['cleaned_text']
y = data['sentiment']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# Step 5: Tokenize text for Word2Vec
X_train_tokenized = [word_tokenize(text) for text in X_train]
X_test_tokenized = [word_tokenize(text) for text in X_test]

In [28]:
# Step 6: Train Word2Vec model
word2vec_model = Word2Vec(sentences=X_train_tokenized, vector_size=300, window=5, min_count=1, workers=4)
print("Word2Vec model trained.")


Word2Vec model trained.


In [29]:
# Step 7: Create sentence embeddings
def generate_word2vec_embedding(tokens, word2vec_model):
    embeddings = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size) 

X_train_embeddings = np.array([generate_word2vec_embedding(tokens, word2vec_model) for tokens in X_train_tokenized])
X_test_embeddings = np.array([generate_word2vec_embedding(tokens, word2vec_model) for tokens in X_test_tokenized])


In [30]:
# Step 8: Define the RNN Model
class RNNClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RNNClassifier, self).__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(1)  
        _, (hidden, _) = self.rnn(x)
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1) 
        output = self.fc(hidden_cat)
        return self.softmax(output)


In [31]:
# Step 9: Prepare Data for RNN
class SentimentDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

train_dataset = SentimentDataset(X_train_embeddings, y_train)
test_dataset = SentimentDataset(X_test_embeddings, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [33]:
# Step 10: Train the RNN
input_dim = word2vec_model.vector_size
hidden_dim = 128
output_dim = 3  

model = RNNClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for embeddings, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}")


Epoch [1/50], Loss: 7.6590
Epoch [2/50], Loss: 7.4934
Epoch [3/50], Loss: 7.3010
Epoch [4/50], Loss: 7.0644
Epoch [5/50], Loss: 6.7676
Epoch [6/50], Loss: 6.4020
Epoch [7/50], Loss: 5.9783
Epoch [8/50], Loss: 5.5206
Epoch [9/50], Loss: 5.0835
Epoch [10/50], Loss: 4.7159
Epoch [11/50], Loss: 4.4428
Epoch [12/50], Loss: 4.2583
Epoch [13/50], Loss: 4.1390
Epoch [14/50], Loss: 4.0630
Epoch [15/50], Loss: 4.0137
Epoch [16/50], Loss: 3.9818
Epoch [17/50], Loss: 3.9592
Epoch [18/50], Loss: 3.9425
Epoch [19/50], Loss: 3.9306
Epoch [20/50], Loss: 3.9208
Epoch [21/50], Loss: 3.9137
Epoch [22/50], Loss: 3.9067
Epoch [23/50], Loss: 3.9024
Epoch [24/50], Loss: 3.8981
Epoch [25/50], Loss: 3.8947
Epoch [26/50], Loss: 3.8916
Epoch [27/50], Loss: 3.8891
Epoch [28/50], Loss: 3.8867
Epoch [29/50], Loss: 3.8847
Epoch [30/50], Loss: 3.8828
Epoch [31/50], Loss: 3.8813
Epoch [32/50], Loss: 3.8798
Epoch [33/50], Loss: 3.8785
Epoch [34/50], Loss: 3.8774
Epoch [35/50], Loss: 3.8763
Epoch [36/50], Loss: 3.8753
E

In [34]:
# Step 11: Evaluate the RNN
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for embeddings, labels in test_loader:
        outputs = model(embeddings)
        _, preds = torch.max(outputs, dim=1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

print("Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds))
print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))


Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           2       1.00      1.00      1.00        49

    accuracy                           1.00        49
   macro avg       1.00      1.00      1.00        49
weighted avg       1.00      1.00      1.00        49


Confusion Matrix:
[[49]]


In [35]:
# Step 12: Save the trained model
model_dir = r'C:\Users\anura\Desktop\Project 4- Climate Change Modeling\models'
os.makedirs(model_dir, exist_ok=True)
model_save_path = os.path.join(model_dir, 'rnn_multiclass_sentiment_model.pth')
torch.save(model.state_dict(), model_save_path)
print(f"RNN Multiclass Model saved at {model_save_path}")

RNN Multiclass Model saved at C:\Users\anura\Desktop\Project 4- Climate Change Modeling\models\rnn_multiclass_sentiment_model.pth
