##### 1.Since the 'Final_Project_Group13.ipynb' model did not use early stopping, it was challenging to control the learning rate and epoch size to optimize the model. Therefore, in this model, I employed a strategy that combines dynamic learning rate adjustments with epoch number selection. I used a scheduler to automatically adjust the learning rate during the training process, and implemented an early stopping strategy to automatically determine the optimal number of epochs, thus optimizing the model performance.

##### 2.The code and strategies for this experiment were completed solely by Doucheng PAN!!!

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import torch.optim as optim

# Load data
data_path = 'wikipedia.csv'

# Read CSV file without automatically parsing the column names, allowing pandas to freely read all columns
data = pd.read_csv(data_path, header=None)

# Create a list of column names: The first four column names are fixed, the rest are dynamically generated based on the number of features
column_names = ['user_id', 'item_id', 'timestamp', 'state_label'] + [f'feature_{i}' for i in range(1, data.shape[1] - 3)]

# Assign column names to the DataFrame
data.columns = column_names
# Ensure the data is sorted by timestamp
data.sort_values('timestamp', inplace=True)

# Split the dataset
train_size = int(len(data) * 0.6) # 60% of the data for training
val_size = int(len(data) * 0.2) # 20% of the data for validation

train_data = data[:train_size]
val_data = data[train_size:train_size+val_size]
test_data = data[train_size+val_size:]

# Standardize the data
scaler = StandardScaler()
train_features = scaler.fit_transform(train_data.drop(['state_label'], axis=1))  # Fit and transform training data
val_features = scaler.transform(val_data.drop(['state_label'], axis=1))  # Transform validation data
test_features = scaler.transform(test_data.drop(['state_label'], axis=1))  # Transform test data

# Extract labels
train_labels = train_data['state_label'].values
val_labels = val_data['state_label'].values
test_labels = test_data['state_label'].values

In [16]:
# Package data into a format suitable for LSTM input
train_dataset = TensorDataset(torch.tensor(train_features, dtype=torch.float32), torch.tensor(train_labels, dtype=torch.float32))
val_dataset = TensorDataset(torch.tensor(val_features, dtype=torch.float32), torch.tensor(val_labels, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(test_features, dtype=torch.float32), torch.tensor(test_labels, dtype=torch.float32))

# Use DataLoader to handle the datasets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)  # Training data loader with shuffling
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)      # Validation data loader without shuffling
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)    # Test data loader without shuffling


In [17]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, device):
        """
        Initialize the LSTMModel with necessary layers and parameters.
        
        Args:
            input_dim (int): The number of input features per time step.
            hidden_dim (int): The number of features in the hidden state of the LSTM.
            num_layers (int): The number of stacked LSTM layers.
            output_dim (int): The number of output features.
            device (torch.device): The device (CPU or GPU) the model will run on.
        """
        super(LSTMModel, self).__init__()
        self.device = device  # Device where the model will be placed
        self.hidden_dim = hidden_dim  # Number of units in the LSTM hidden layer
        self.num_layers = num_layers  # Number of LSTM layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)  # The LSTM module
        self.fc = nn.Linear(hidden_dim, output_dim)  # Linear layer to map from hidden state to output
        self.to(device)  # Move the model to the specified device

    def forward(self, x):
        """
        Forward pass of the model.
        
        Args:
            x (Tensor): Input tensor containing sequences for the LSTM.
        
        Returns:
            Tensor: The output of the model after processing input tensor x.
        """
        # Initialize hidden state and cell state for LSTM with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(self.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(self.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # pass the initial hidden and cell states
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])  # Apply the linear layer to the hidden state of the last time step
        return out


In [18]:
class EarlyStopping:
    """
    EarlyStopping utility to stop training when the validation loss has not improved after
    a certain number of epochs (patience).
    """
    def __init__(self, patience=5, verbose=False):
        """
        Initializes the EarlyStopping callback.
        
        Args:
            patience (int): Number of epochs to wait after last time validation loss improved.
                            Default: 5
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
        """
        self.patience = patience  # How long to wait after last time validation loss improved.
        self.verbose = verbose  # If True, prints messages about validation loss improvement.
        self.counter = 0  # Counter to keep track of how long since last improvement
        self.best_score = None  # Best score seen in the validation data
        self.early_stop = False  # Flag to signal early stopping
        self.val_loss_min = float('inf')  # Minimum validation loss seen so far

    def __call__(self, val_loss, model):
        """
        Call method that executes the early stopping logic.
        
        Args:
            val_loss (float): Current epoch's validation loss.
            model (torch.nn.Module): The model being trained.
        """
        if self.best_score is None:
            # Set the initial 'best' to the first validation loss we see
            self.best_score = val_loss
        elif val_loss > self.best_score:
            # If the validation loss is worse, increment the counter
            self.counter += 1
            if self.counter >= self.patience:
                # If counter exceeds patience, set early stopping flag
                self.early_stop = True
        else:
            # If validation loss improves, reset counter and update best score
            self.best_score = val_loss
            self.counter = 0
            if self.verbose:
                # Optionally print a message about the improvement
                print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model...')
            # Save the best model
            torch.save(model.state_dict(), 'checkpoint_model.pth')
            self.val_loss_min = val_loss  # Update the minimum validation loss


In [19]:
# Determine the device to use based on CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model parameters
input_dim = train_features.shape[1]  # The number of input features per time step
hidden_dim = 128  # Number of hidden units in each LSTM layer, can be adjusted based on model complexity
num_layers = 3  # Number of LSTM layers in the model
output_dim = 1  # Output dimension for binary classification

# Instantiate the model with the specified parameters and device
model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim, device)

# Define the loss function and optimizer for training
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy loss with logits for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with a learning rate of 0.001

# Set up a learning rate scheduler that reduces the learning rate when a plateau in validation loss is detected
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5, verbose=True)
# 'min' mode means the scheduler will reduce the LR when the monitored quantity stops decreasing
# patience=5 means we wait 5 epochs with no improvement before reducing the learning rate
# factor=0.5 reduces the learning rate by multiplying it with 0.5
# verbose=True will print a message whenever the scheduler updates the learning rates


In [20]:
# Early stopping setup to prevent overfitting and stop training when the validation loss does not improve
early_stopping = EarlyStopping(patience=5, verbose=True)

num_epochs = 50  # Setting a relatively high number of epochs, early stopping will determine the actual number
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to the device
        inputs = inputs.unsqueeze(1)  # Add a sequence length dimension (LSTM expects three-dimensional input)
        optimizer.zero_grad()  # Clear gradients
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs.squeeze(), labels)  # Compute loss
        loss.backward()  # Backpropagate the error
        optimizer.step()  # Update weights

    # Validation phase
    model.eval()  # Set the model to evaluation mode
    valid_loss = 0
    valid_preds, valid_labels = [], []
    with torch.no_grad():  # Disable gradient computation during validation
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            inputs = inputs.unsqueeze(1)  # Ensure input has sequence length dimension
            outputs = model(inputs)
            valid_preds.extend(outputs.squeeze().tolist())
            valid_labels.extend(labels.tolist())
            loss = criterion(outputs.squeeze(), labels)  # Compute loss
            valid_loss += loss.item()

    valid_loss /= len(val_loader)  # Average the validation loss
    valid_auc_score = roc_auc_score(valid_labels, valid_preds)  # Calculate AUC for validation set
    print(f'Epoch {epoch+1}, Val Loss: {valid_loss:.4f}, Val AUC: {valid_auc_score:.4f}')
    
    # Learning rate scheduler
    scheduler.step(valid_loss)  # Adjust learning rate based on validation loss

    # Early stopping check
    early_stopping(valid_loss, model)  # Check if early stopping criteria are met
    if early_stopping.early_stop:
        print("Early stopping")  # Stop training if the early stopping condition is satisfied
        break


Epoch 1, Val Loss: 0.0093, Val AUC: 0.8385
Epoch 2, Val Loss: 0.0078, Val AUC: 0.8384
Validation loss decreased (inf --> 0.007814).  Saving model...
Epoch 3, Val Loss: 0.0077, Val AUC: 0.8443
Validation loss decreased (0.007814 --> 0.007735).  Saving model...
Epoch 4, Val Loss: 0.0081, Val AUC: 0.8675
Epoch 5, Val Loss: 0.0078, Val AUC: 0.8390
Epoch 6, Val Loss: 0.0080, Val AUC: 0.8429
Epoch 7, Val Loss: 0.0084, Val AUC: 0.8431
Epoch 8, Val Loss: 0.0082, Val AUC: 0.8279
Early stopping


In [21]:
# Load the best model saved during training
model.load_state_dict(torch.load('checkpoint_model.pth'))

# Evaluate the model's performance on the test set
model.eval()  # Set the model to evaluation mode
test_preds, test_labels = [], []  # Lists to store predictions and labels
with torch.no_grad():  # Disable gradient computation
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to the device
        inputs = inputs.unsqueeze(1)  # Ensure input has sequence length dimension for LSTM
        outputs = model(inputs)  # Forward pass to get output from the model
        test_preds.extend(outputs.squeeze().tolist())  # Collect predictions
        test_labels.extend(labels.tolist())  # Collect actual labels

test_auc_score = roc_auc_score(test_labels, test_preds)  # Calculate AUC score for the test data
print(f'Test AUC: {test_auc_score:.4f}')  # Print the AUC score to evaluate model performance


Test AUC: 0.8598
