In [2]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [3]:
# Source event used for training and target event used for testing.
train_dataset = 'charlie_hebdo'
test_dataset = 'sydneysiege'

# Cut-off passed to the loader as `time_cut`.
# NOTE(review): unit is not visible here (the later sweep suggests minutes) - confirm in utils.
time_cut = 130

processor = Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning(
    train_dataset,
    test_dataset,
    time_cut=time_cut,
    test_size=0.7,
)

# Load, process, then fetch the train / test frames produced by the processor.
processor.load_data()
processor.process_data()
train, test = processor.get_final_dataframes()

rumour
0    69
1    59
Name: count, dtype: int64


In [4]:
# Build one dense matrix per split: every column except the label and the
# embedding column comes first, followed by the 'embeddings_avg' lists
# expanded into one column per embedding component.
X_train, X_test = (
    np.hstack([
        frame.drop(columns=['rumour', 'embeddings_avg']).values,
        np.array(pd.DataFrame(frame['embeddings_avg'].tolist())),
    ])
    for frame in (train, test)
)

# Labels stay as the original 'rumour' columns of each split.
y_train, y_test = train['rumour'], test['rumour']

In [5]:
# Oversample the minority class with SMOTE. The input is the full stacked
# matrix built above, i.e. the embedding columns are resampled together with
# the other feature columns.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset



# Model definition
class RumorDetectionLSTM(nn.Module):
    """Binary classifier combining an LSTM branch and a dense branch.

    The input row layout expected by ``forward`` is: the first
    ``num_other_features`` columns are fed to the dense branch and the last
    ``embedding_dim`` columns are fed to the LSTM branch (as a sequence of
    length 1). Both branch outputs are concatenated and passed through two
    fully connected layers ending in a sigmoid.

    Args:
        embedding_dim: Width of the embedding slice taken from the end of
            each input row; also the LSTM input size.
        lstm_hidden_size: Hidden size of the LSTM.
        dense_hidden_size: Output width of the dense branch.
        num_other_features: Number of leading non-embedding columns.
    """

    def __init__(self, embedding_dim=100, lstm_hidden_size=32, dense_hidden_size=16,
                 num_other_features=8):
        super().__init__()

        # Remember the slice widths so forward() follows the constructor
        # arguments instead of hard-coded 100 / 8.
        self.embedding_dim = embedding_dim
        self.num_other_features = num_other_features

        # LSTM over the embedding slice
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=lstm_hidden_size, batch_first=True)

        # Dense layers for the non-embedding features
        self.dense1 = nn.Linear(num_other_features, 16)
        self.dense2 = nn.Linear(16, dense_hidden_size)

        # Combine LSTM and dense features
        self.fc1 = nn.Linear(lstm_hidden_size + dense_hidden_size, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Return one probability per input row.

        Args:
            x: 2-D float tensor of shape (batch, n_columns) laid out as
                described in the class docstring.

        Returns:
            1-D tensor of shape (batch,) with sigmoid outputs in [0, 1].
        """
        # Separate embeddings and other features
        embeddings = x[:, -self.embedding_dim:].unsqueeze(1)  # (batch, seq_len=1, embedding_dim)
        other_features = x[:, :self.num_other_features]

        # LSTM output at the last (here: only) time step
        lstm_out, _ = self.lstm(embeddings)
        lstm_out = lstm_out[:, -1, :]

        # Dense branch
        dense_out = torch.relu(self.dense1(other_features))
        dense_out = torch.relu(self.dense2(dense_out))

        # Concatenate LSTM and dense outputs
        combined = torch.cat((lstm_out, dense_out), dim=1)

        # Fully connected layers for classification
        x = torch.relu(self.fc1(combined))
        x = torch.sigmoid(self.fc2(x))

        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one, yielding a 0-d tensor
        # that no longer matches a (1,)-shaped target.
        return x.squeeze(-1)


In [7]:
# Assuming X_train, X_test, y_train, and y_test are available as numpy arrays
# Convert them to PyTorch tensors
# Convert the data to float32 PyTorch tensors.
# Training uses the SMOTE-resampled arrays (X_resampled / y_resampled) so that
# the oversampling step is actually reflected in the training set, matching
# what the time_cut sweep cell does. np.asarray() strips any pandas index
# first, so the conversion does not depend on the labels having a 0..n-1 index.
X_train = torch.tensor(np.asarray(X_resampled), dtype=torch.float32)
y_train = torch.tensor(np.asarray(y_resampled), dtype=torch.float32)
X_test = torch.tensor(np.asarray(X_test), dtype=torch.float32)
y_test = torch.tensor(np.asarray(y_test), dtype=torch.float32)

# Dataset and DataLoader.
# NOTE: train_dataset / test_dataset are rebound here from the dataset-name
# strings to TensorDataset objects.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [8]:
import mlflow
import mlflow.pytorch

# Record runs in an SQLite-backed tracking store at the relative path mlflow.db.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Select the experiment that subsequent runs are logged under. Kept as the
# cell's last expression so the returned Experiment object is displayed.
mlflow.set_experiment("LSTM SMOTE Filter Node 2025-02-22 Transfer Learning sydneysiege")

<Experiment: artifact_location='/workspaces/rumour-detection-gnn/mlruns/36', creation_time=1740494227702, experiment_id='36', last_update_time=1740494227702, lifecycle_stage='active', name='LSTM SMOTE Filter Node 2025-02-22 Transfer Learning sydneysiege', tags={}>

In [9]:
from sklearn.metrics import recall_score, precision_score

# Fresh model with binary cross-entropy on its sigmoid outputs, optimised
# with Adam.
model = RumorDetectionLSTM()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training-loop settings: number of epochs, and the interval (in epochs) at
# which train recall / loss are meant to be reported. Both intervals are 50.
epochs = 100
train_recall_interval = loss_interval = 50




In [10]:
def _to_feature_matrix(frame):
    """Stack a split's features into one 2-D array.

    Drops the 'rumour' label, keeps every remaining non-embedding column
    first, and appends the 'embeddings_avg' lists expanded to one column per
    embedding component.
    """
    features = frame.drop(columns=['rumour'])
    return np.hstack([
        features.drop(columns=['embeddings_avg']).values,
        np.array(pd.DataFrame(features.embeddings_avg.tolist())),
    ])


def _collect_predictions(model, loader):
    """Return (labels, preds) lists for every sample yielded by `loader`.

    Puts the model in eval mode, disables gradients, and binarises the model
    outputs at 0.5.
    """
    model.eval()
    labels, preds = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            # reshape(-1) keeps a 1-D result even for a batch of one sample,
            # so tolist() always yields a list that extend() can consume.
            output = model(X_batch).reshape(-1)
            preds.extend((output >= 0.5).int().tolist())
            labels.extend(y_batch.tolist())
    return labels, preds


# Hyper-parameters shared by every run of the sweep. They are defined once so
# that the values used for training are exactly the values logged to MLflow.
learning_rate = 0.001
epochs = 200
train_recall_interval = 50  # kept for compatibility; reporting is driven by loss_interval
loss_interval = 50  # report train loss / recall every 50 epochs
seed = 42

# Sweep time_cut from 15 up to (but excluding) 24*3*60 in steps of 15.
for time_cut in range(15, 24*3*60, 15):
    print(time_cut)

    # Re-assigned on every iteration because the same names are rebound to
    # TensorDataset objects further down.
    train_dataset = 'charlie_hebdo'
    test_dataset = 'sydneysiege'
    processor = Load_Rumours_Dataset_filtering_since_first_post_Transfer_Learning(
        train_dataset, test_dataset, time_cut=time_cut, test_size=0.7)

    processor.load_data()
    processor.process_data()
    train, test = processor.get_final_dataframes()

    X_train = _to_feature_matrix(train)
    X_test = _to_feature_matrix(test)
    y_train = train['rumour']
    y_test = test['rumour']

    # Oversample the minority class of the training split. SMOTE sees the full
    # stacked matrix, embedding columns included.
    smote = SMOTE(random_state=42, sampling_strategy='minority')
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Convert to float32 tensors. np.asarray() strips any pandas index so the
    # conversion does not depend on the labels having a 0..n-1 index.
    X_train = torch.tensor(np.asarray(X_resampled), dtype=torch.float32)
    y_train = torch.tensor(np.asarray(y_resampled), dtype=torch.float32)
    X_test = torch.tensor(np.asarray(X_test), dtype=torch.float32)
    y_test = torch.tensor(np.asarray(y_test), dtype=torch.float32)

    # Dataset and DataLoader
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    with mlflow.start_run():
        # Seed torch so weight initialisation and batch shuffling are
        # repeatable for a given time_cut.
        torch.manual_seed(seed)

        model = RumorDetectionLSTM()
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        for epoch in range(epochs):
            model.train()
            epoch_loss = 0.0
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                # reshape(-1) keeps the output 1-D for a final batch of one
                # sample, which BCELoss needs to match the (1,) target.
                output = model(X_batch).reshape(-1)
                loss = criterion(output, y_batch)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

            # Periodically record mean batch loss and train recall; these
            # values were previously computed but never reported.
            if (epoch + 1) % loss_interval == 0:
                train_labels, train_preds = _collect_predictions(model, train_loader)
                train_recall = recall_score(train_labels, train_preds)
                mlflow.log_metric("train_loss", epoch_loss / max(len(train_loader), 1), step=epoch + 1)
                mlflow.log_metric("train_recall", train_recall, step=epoch + 1)

        # Final evaluation on the test set with recall and precision
        test_labels, test_preds = _collect_predictions(model, test_loader)
        test_recall = recall_score(test_labels, test_preds)
        test_precision = precision_score(test_labels, test_preds)

        mlflow.log_metric("test_precision", test_precision)
        mlflow.log_metric("test_recall", test_recall)

        # Log the values actually used for training (epochs was previously
        # logged as a stale literal 100 while 200 epochs were run).
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("seed", seed)
        # time_cut stays a metric so existing queries over earlier runs keep working.
        mlflow.log_metric("time_cut", time_cut)

        

15
rumour
0    8
1    5
Name: count, dtype: int64
30
rumour
0    20
1    12
Name: count, dtype: int64


KeyboardInterrupt: 