In [4]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch_geometric.data import Data
from torch_geometric.utils import from_scipy_sparse_matrix, remove_self_loops
from scipy.sparse import coo_matrix

# Load and preprocess the dataset
data = pd.read_csv('train.txt', sep=';', header=None, names=['text', 'label'])
texts = data['text'].tolist()

# Encode labels into numeric format
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data['label'])

# Generate sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Pre-trained Sentence-BERT
embeddings = model.encode(texts)  # Shape: (n, 384)

# Calculate edge weights using cosine similarity
edge_weights = cosine_similarity(embeddings)
threshold = 0.7  # Define a similarity threshold
adjacency_matrix = (edge_weights > threshold).astype(int)

# Construct graph structure
x = torch.tensor(embeddings, dtype=torch.float)
adj_matrix = coo_matrix(adjacency_matrix)
edge_index, edge_attr = from_scipy_sparse_matrix(adj_matrix)
edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)

# Create graph data in PyTorch Geometric format
graph_data = Data(
    x=x,
    edge_index=edge_index,
    edge_attr=torch.tensor(edge_attr.clone().detach(), dtype=torch.float),
    y=torch.tensor(labels, dtype=torch.long)  # Encoded emotion labels
)

# Create and save the converted dataset for inspection
# Combine embeddings, texts, and labels into a DataFrame
embeddings_df = pd.DataFrame(embeddings)  # Add embeddings as columns
embeddings_df['text'] = texts  # Add original text
embeddings_df['label'] = data['label']  # Add emotion labels
embeddings_df['encoded_label'] = labels  # Add encoded labels

# Save the DataFrame to a CSV file for inspection
embeddings_df.to_csv('converted_dataset.csv', index=False)

# Display the first few rows of the DataFrame
print(embeddings_df.head())




  edge_attr=torch.tensor(edge_attr.clone().detach(), dtype=torch.float),


          0         1         2         3         4         5         6  \
0 -0.055051 -0.007697  0.063530 -0.039664  0.116901 -0.123296  0.058080   
1  0.009239 -0.052964  0.019263  0.034021  0.125202  0.027428  0.077058   
2 -0.074503 -0.010642 -0.003460 -0.073246 -0.018509 -0.026024  0.023560   
3  0.108594  0.095322  0.036477  0.015178  0.089073 -0.012647 -0.089686   
4 -0.016712 -0.078771  0.032170 -0.053829  0.115593 -0.051190  0.132093   

          7         8         9  ...       377       378       379       380  \
0  0.067705  0.071730 -0.109816  ...  0.021249 -0.029084  0.084679  0.016152   
1  0.035879  0.075603 -0.052699  ...  0.132352 -0.082222  0.003469  0.095559   
2  0.062387  0.110395  0.064938  ...  0.019752  0.078386 -0.010269  0.041514   
3 -0.070015  0.042590 -0.011443  ...  0.023587  0.056529  0.024166  0.103731   
4  0.037378  0.001562 -0.072058  ... -0.016146  0.007182  0.029738  0.059137   

        381       382       383  \
0  0.015425 -0.135161 -0.064534  

In [5]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class EmotionGNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super(EmotionGNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)  # First GCN layer
        self.conv2 = GCNConv(hidden_dim, hidden_dim)  # Second GCN layer
        self.fc = torch.nn.Linear(hidden_dim, output_dim)  # Fully connected output layer
        self.dropout = dropout

    def forward(self, x, edge_index):
        # Graph convolution + ReLU + Dropout
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        # Second graph convolution + ReLU
        x = self.conv2(x, edge_index)
        x = F.relu(x)

        # Output layer
        x = self.fc(x)
        return F.log_softmax(x, dim=1)  # Log probabilities for classification


In [6]:
# Define hyperparameters
input_dim = 384  # Dimension of sentence embeddings
hidden_dim = 128  # Hidden layer size
output_dim = len(torch.unique(graph_data.y))  # Number of classes (emotions)
learning_rate = 0.01
dropout = 0.5

# Initialize model, optimizer, and loss function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmotionGNN(input_dim, hidden_dim, output_dim, dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=5e-4)
criterion = torch.nn.NLLLoss()  # Negative Log-Likelihood Loss for multi-class classification


In [7]:
# Move data to the device
graph_data = graph_data.to(device)

# Training loop
def train():
    model.train()  # Set to training mode
    optimizer.zero_grad()  # Clear gradients
    out = model(graph_data.x, graph_data.edge_index)  # Forward pass
    loss = criterion(out, graph_data.y)  # Compute loss
    loss.backward()  # Backpropagation
    optimizer.step()  # Update model parameters
    return loss.item()

# Validation function
def validate():
    model.eval()  # Set to evaluation mode
    with torch.no_grad():
        out = model(graph_data.x, graph_data.edge_index)  # Forward pass
        val_loss = criterion(out, graph_data.y)  # Compute validation loss
        pred = out.argmax(dim=1)  # Get predictions
        accuracy = (pred == graph_data.y).sum().item() / graph_data.y.size(0)  # Compute accuracy
    return val_loss.item(), accuracy

# Run training for several epochs
epochs = 100
for epoch in range(epochs):
    loss = train()
    val_loss, val_acc = validate()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


Epoch 1/100, Loss: 1.7815, Val Loss: 1.6831, Val Acc: 0.3457
Epoch 2/100, Loss: 1.6798, Val Loss: 1.5667, Val Acc: 0.4376
Epoch 3/100, Loss: 1.5702, Val Loss: 1.5709, Val Acc: 0.5036
Epoch 4/100, Loss: 1.5805, Val Loss: 1.4705, Val Acc: 0.5086
Epoch 5/100, Loss: 1.4804, Val Loss: 1.4301, Val Acc: 0.5118
Epoch 6/100, Loss: 1.4360, Val Loss: 1.3850, Val Acc: 0.5153
Epoch 7/100, Loss: 1.3922, Val Loss: 1.3201, Val Acc: 0.5173
Epoch 8/100, Loss: 1.3292, Val Loss: 1.2651, Val Acc: 0.5196
Epoch 9/100, Loss: 1.2768, Val Loss: 1.2415, Val Acc: 0.5228
Epoch 10/100, Loss: 1.2607, Val Loss: 1.2133, Val Acc: 0.5301
Epoch 11/100, Loss: 1.2304, Val Loss: 1.1751, Val Acc: 0.5507
Epoch 12/100, Loss: 1.1950, Val Loss: 1.1461, Val Acc: 0.5887
Epoch 13/100, Loss: 1.1611, Val Loss: 1.1255, Val Acc: 0.6259
Epoch 14/100, Loss: 1.1413, Val Loss: 1.0987, Val Acc: 0.6330
Epoch 15/100, Loss: 1.1168, Val Loss: 1.0617, Val Acc: 0.6344
Epoch 16/100, Loss: 1.0857, Val Loss: 1.0334, Val Acc: 0.6350
Epoch 17/100, Los

In [8]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from torch_geometric.utils import from_scipy_sparse_matrix
from scipy.sparse import coo_matrix

#  Load the test dataset
test_data = pd.read_csv('test.txt', sep=';', header=None, names=['text', 'label'])
test_texts = test_data['text'].tolist()
test_labels = test_data['label'].tolist()

#  Generate embeddings for the test set using SentenceTransformer
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')  # Pre-trained model
test_embeddings = sentence_transformer_model.encode(test_texts)  # Generate embeddings

#  Encode labels using the LabelEncoder from training
test_encoded_labels = label_encoder.transform(test_labels)  

#  Convert embeddings and labels to PyTorch tensors
test_x = torch.tensor(test_embeddings, dtype=torch.float).to(device)  # Embeddings
test_y = torch.tensor(test_encoded_labels, dtype=torch.long).to(device)  # Encoded labels

# Rebuild the graph for the test set
# Compute cosine similarity between test embeddings
test_edge_weights = cosine_similarity(test_embeddings)

# Apply a threshold to create adjacency matrix
test_threshold = 0.7  # Adjust the threshold
test_adjacency_matrix = (test_edge_weights > test_threshold).astype(int)

# Create edge_index for the test set
test_adj_matrix = coo_matrix(test_adjacency_matrix)
test_edge_index, test_edge_attr = from_scipy_sparse_matrix(test_adj_matrix)

# Convert edge_index and edge_attr to PyTorch tensors
test_edge_index = test_edge_index.to(device)
test_edge_attr = torch.tensor(test_edge_attr, dtype=torch.float).to(device)

# Evaluate the GNN model
model.eval()  
with torch.no_grad():
    # Forward pass through the GNN
    out = model(test_x, test_edge_index)
    predictions = out.argmax(dim=1)  # Get predicted labels

# Convert predictions and true labels to CPU for sklearn compatibility
true_labels = test_y.cpu().numpy()
predicted_labels = predictions.cpu().numpy()

# Compute overall accuracy
overall_accuracy = accuracy_score(true_labels, predicted_labels)

# Compute precision, recall, F1-score, and support (per-label)
metrics = precision_recall_fscore_support(true_labels, predicted_labels, labels=range(len(label_encoder.classes_)))

# Create a detailed metrics table
detailed_metrics = pd.DataFrame({
    "Label": label_encoder.classes_,
    "Accuracy": [(true_labels[predicted_labels == i] == i).sum() / (true_labels == i).sum() for i in range(len(label_encoder.classes_))],
    "Precision": metrics[0],
    "Recall": metrics[1],
    "F1-Score": metrics[2],
    "Support": metrics[3]
})

# Add overall accuracy 
detailed_metrics = pd.concat([
    detailed_metrics,
    pd.DataFrame({
        "Label": ["Overall"],
        "Accuracy": [overall_accuracy],
        "Precision": [None],  # Not meaningful for overall
        "Recall": [None],     # Not meaningful for overall
        "F1-Score": [None],   # Not meaningful for overall
        "Support": [len(true_labels)]
    })
], ignore_index=True)

# Print the detailed table
print(detailed_metrics)

# Save metrics to a CSV file for reference
detailed_metrics.to_csv('test_metrics.csv', index=False)


      Label  Accuracy  Precision    Recall  F1-Score  Support
0     anger  0.683636   0.712121  0.683636  0.697588      275
1      fear  0.651786   0.733668  0.651786  0.690307      224
2       joy  0.834532   0.745501  0.834532  0.787508      695
3      love  0.484277   0.550000  0.484277  0.515050      159
4   sadness  0.769363   0.776042  0.769363  0.772688      581
5  surprise  0.393939   0.604651  0.393939  0.477064       66
6   Overall  0.732000        NaN       NaN       NaN     2000


  test_edge_attr = torch.tensor(test_edge_attr, dtype=torch.float).to(device)
  detailed_metrics = pd.concat([
