## First Model - Stacking Classifier

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.sparse import hstack
import joblib

# Load the cleaned dataset.
data = pd.read_csv('data_clean.csv', encoding='latin-1')

# Encode the string class names in 'status' as integer targets; keep the
# original names so reports can be labelled later.
encoder = LabelEncoder()
data.loc[:, 'status_encoded'] = encoder.fit_transform(data['status'])
class_names = encoder.classes_

# Extra feature: number of whitespace-separated tokens in the stemmed text.
# (`.apply(len)` on a string column returns the number of *characters*, which
# contradicts the "word count" name of this feature.)
data['stemmed word count'] = data['stemmed'].str.split().str.len()

# Stratified 80/20 train/test split so class proportions match in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    data[['statement', 'stemmed', 'stemmed word count', 'lemmatized']],
    data['status_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=data['status_encoded'],
)

# TF-IDF over the stemmed text: vocabulary is learned on the training split
# only and then re-used to transform the test split.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=10,
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train['stemmed'])
X_test_tfidf = tfidf.transform(X_test['stemmed'])

## COMBINE FEATURES

# Double-bracket selection keeps the count as a 2-D (n_rows, 1) array, which is
# the shape hstack needs for a single extra column. The arrays stay dense here.
X_train_word_count, X_test_word_count = (
    frame[['stemmed word count']].values for frame in (X_train, X_test)
)

# Append the word-count column to the right of the TF-IDF columns.
X_train_combined, X_test_combined = (
    hstack(blocks)
    for blocks in (
        [X_train_tfidf, X_train_word_count],
        [X_test_tfidf, X_test_word_count],
    )
)

## STACKING CLASSIFIER

# --- Base learners -----------------------------------------------------------
log_reg = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=10000)

linear_svc = LinearSVC(
    penalty='l1',
    loss='squared_hinge',
    dual=False,
    C=0.2,
    class_weight='balanced',
    max_iter=1000,
)

nb = MultinomialNB(alpha=0.1, fit_prior=True)

_rf_params = dict(
    n_estimators=500,
    max_depth=30,
    max_features='sqrt',
    max_samples=0.75,
    min_samples_leaf=2,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=20,
)
rf = RandomForestClassifier(**_rf_params)

_xgb_base_params = dict(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    colsample_bytree=0.7,
    subsample=1.0,
    gamma=0.1,
    reg_alpha=1.0,
    reg_lambda=0.5,
    random_state=42,
    n_jobs=24,
)
xgb_clf = xgb.XGBClassifier(**_xgb_base_params)

# (name, estimator) pairs in the order the stacker will fit them.
estimators = list(zip(
    ('log_reg', 'linear_svc', 'nb', 'rf', 'xgb'),
    (log_reg, linear_svc, nb, rf, xgb_clf),
))

# --- Meta-learner ------------------------------------------------------------
# A shallower, smaller XGBoost model combines the base learners' outputs.
_xgb_meta_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    colsample_bytree=0.6,
    subsample=0.75,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=20,
)
meta_classifier = xgb.XGBClassifier(**_xgb_meta_params)

# Shuffled, stratified 5-fold split used by the stacker.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Stacked ensemble --------------------------------------------------------
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_classifier,
    cv=cv,
    n_jobs=24,
)

# Train the StackingClassifier on the combined TF-IDF + word-count features.
stacking_clf.fit(X_train_combined, y_train)

# Predict on both splits so train and test accuracy can be compared
# (a large gap indicates over-fitting).
y_pred = stacking_clf.predict(X_test_combined)
y_pred_train = stacking_clf.predict(X_train_combined)

# Evaluate the model
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_pred_train)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Report per-class metrics with the original class names instead of the bare
# integer codes. `labels` is passed explicitly so the names stay aligned with
# the encoder's codes even if a class is absent from y_test / y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(class_names))),
    target_names=[str(name) for name in class_names],
))


## Second Model - Fine-Tuning a BERT Sequence Classifier

In [None]:
import torch
# Report the installed PyTorch build and, when a GPU is visible, its name.
_cuda_ready = torch.cuda.is_available()
print("PyTorch version:", torch.__version__)
print("Is CUDA available?:", _cuda_ready)
if _cuda_ready:
    _gpu_name = torch.cuda.get_device_name(0)
    print("CUDA device name:", _gpu_name)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm  # For progress bar
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import re

# Load the cleaned dataset for the BERT experiment.
_bert_csv_path = 'data_clean.csv'
data = pd.read_csv(_bert_csv_path, encoding='latin-1')

# Text cleaning function for BERT
# Compiled once at import time instead of on every call.
# The original character class contained the range U+24C2..U+1F251, which
# spans almost the whole BMP above U+24C2 (CJK, Hangul, fullwidth forms, ...)
# and therefore deleted ordinary non-Latin text. It is replaced here by the
# specific emoji/symbol blocks that lie inside that span.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U0001F000-\U0001F251"  # mahjong/cards/enclosed alphanumerics/flags/enclosed ideographs
    "\U000024C2"             # circled latin capital M
    "]+",
    flags=re.UNICODE,
)


def text_clean_for_bert(text):
    """Lightly clean raw text before BERT tokenization.

    Removes e-mail addresses, URLs (``http(s)://...`` and ``www....``), digit
    runs and emoji/pictographic symbols, then strips leading and trailing
    whitespace. Punctuation, casing and inner whitespace are left untouched.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'\S+@\S+', '', text)  # remove emails
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = _EMOJI_PATTERN.sub(r'', text)  # remove emojis

    # We keep punctuation for BERT, no need to tokenize manually
    return text.strip()

# Cleaned copy of the raw statements; this is the text BERT will see.
data['bert_clean'] = data['statement'].apply(text_clean_for_bert)

# Integer-encode the class names and remember the original names.
encoder = LabelEncoder()
_status_codes = encoder.fit_transform(data['status'])
data.loc[:, 'status_encoded'] = _status_codes

class_names = encoder.classes_

# Stratified 80/20 split on the encoded target.
_split_options = dict(test_size=0.2, random_state=42, stratify=data['status_encoded'])
X_train, X_test, y_train, y_test = train_test_split(
    data['bert_clean'],
    data['status_encoded'],
    **_split_options,
)

## TOKENIZE THE DATA

# Tokenizer matching the pre-trained checkpoint that is fine-tuned below.
_tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_tokenizer_checkpoint)

# Tokenization helper: returns a TensorDataset whose labels are int64 (torch.long)
def tokenize_data(texts, labels, max_len=256):
    """Tokenize texts and pack them with their labels into a TensorDataset.

    Uses the module-level ``tokenizer``. All texts are encoded in a single
    call with ``padding=True`` and truncation to at most ``max_len`` tokens.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that
            yields the input strings.
        labels: Object exposing ``.values`` (e.g. a pandas Series) that holds
            the integer class ids.
        max_len: Maximum number of tokens kept per text.

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_mask, labels)`` where
        the labels tensor has dtype ``torch.long``.
    """
    encoded = tokenizer(
        texts.tolist(),
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # The loss used for training expects integer (long) class indices.
    label_tensor = torch.tensor(labels.values, dtype=torch.long)
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], label_tensor)

# Make sure both target vectors are plain integers before building tensors.
y_train, y_test = y_train.astype(int), y_test.astype(int)

# Encode each split into a TensorDataset.
train_dataset = tokenize_data(X_train, y_train)
test_dataset = tokenize_data(X_test, y_test)

# Mini-batch iterators: the training batches are reshuffled every epoch,
# the test batches keep their original order.
_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=_batch_size, shuffle=False)

## BERT MODEL FINE TUNING

# Define EarlyStopping class
class EarlyStopping:
    """Track validation loss and flag when training should stop.

    Call the instance once per epoch with the current validation loss. The
    first call only records the loss. Afterwards, a loss lower than the best
    one by more than ``min_delta`` counts as an improvement and resets the
    counter; anything else increments it. Once the counter reaches
    ``patience``, ``early_stop`` becomes ``True``.
    """

    def __init__(self, patience=2, min_delta=0.001):
        # Configuration
        self.patience, self.min_delta = patience, min_delta
        # Running state
        self.best_val_loss = None  # lowest loss accepted as an improvement so far
        self.counter = 0           # calls without improvement since the last reset
        self.early_stop = False    # set to True once patience is exhausted

    def __call__(self, val_loss):
        # First observation: nothing to compare against yet.
        if self.best_val_loss is None:
            self.best_val_loss = val_loss
            return

        threshold = self.best_val_loss - self.min_delta
        if val_loss < threshold:
            # Sufficient improvement: remember it and start counting afresh.
            self.best_val_loss = val_loss
            self.counter = 0
            return

        # No (sufficient) improvement this time.
        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# Function to evaluate model on a given dataset
def evaluate(model, data_loader):
    """Compute the mean loss and accuracy of ``model`` over ``data_loader``.

    Puts the model in evaluation mode and runs it without gradient tracking.
    Relies on the module-level ``device`` and ``loss_fn``.

    The loss is averaged per sample rather than per batch, so a smaller final
    batch is not over-weighted. This assumes ``loss_fn`` returns the mean over
    the batch (the default reduction of ``CrossEntropyLoss``).

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        Tuple ``(avg_loss, accuracy)`` as Python floats.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss, total_correct, total_samples = 0.0, 0, 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            # Move inputs and labels to device
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # Make predictions
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

            batch_size = labels.size(0)
            # Undo the per-batch mean so batches contribute by their size.
            total_loss += loss.item() * batch_size
            preds = torch.argmax(outputs.logits, dim=-1)
            total_correct += (preds == labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        # Previously surfaced as an opaque ZeroDivisionError.
        raise ValueError("evaluate() received a data_loader with no samples")

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    return avg_loss, accuracy


# Load the pre-trained BERT tokenizer and sequence-classification model.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Config overrides must be passed to from_pretrained: the dropout layers are
# constructed while the model is built, so assigning
# `model.config.hidden_dropout_prob` afterwards changes the config object only
# and leaves the actual Dropout modules at their default rate.
# The number of output labels is taken from the label encoder instead of being
# hard-coded.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(class_names),
    hidden_dropout_prob=0.3,
)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set model to training mode
model.train()

# Define optimizer with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
# Define loss function (default reduction: mean over the batch)
loss_fn = torch.nn.CrossEntropyLoss()

# Define learning rate scheduler (ReduceLROnPlateau).
# The `verbose` argument is deprecated in recent PyTorch releases, so it is not
# passed; the current rate can be read from optimizer.param_groups[0]['lr'].
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, min_lr=1e-6)

# Instantiate EarlyStopping
early_stopping = EarlyStopping(patience=2, min_delta=0.001)

# Lists to store loss and accuracy values for each epoch
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# Fine-tuning loop with early stopping and learning rate scheduling.
# Each epoch: one full pass over train_loader with gradient updates, then an
# evaluation pass, then the early-stopping check, the manual LR override and
# the plateau scheduler step, in that order.
num_epochs = 4
for epoch in range(num_epochs):
    model.train()  # evaluate() switches to eval mode, so re-enable training mode every epoch
    total_loss, total_correct, total_samples = 0, 0, 0
    
    for input_ids, attention_mask, labels in train_loader:
        # Move the batch to the same device as the model
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        # Standard step: clear old gradients, forward pass, loss
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        
        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        
        # Running statistics; predictions come from the training-mode forward
        # pass above, i.e. before this batch's parameter update
        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=-1)
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

    # Training loss is the mean of the per-batch losses; accuracy is per sample
    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Append the training loss and accuracy
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # Evaluate on validation set
    # NOTE(review): the loader passed here is test_loader, so the "validation"
    # metrics that drive early stopping and LR scheduling are computed on the
    # test split. Consider a separate validation split so the final test
    # accuracy stays an unbiased estimate.
    val_loss, val_accuracy = evaluate(model, test_loader)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy * 100:.2f}%')
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy * 100:.2f}%')

    # Early stopping check
    # NOTE(review): no checkpoint is saved in this loop, so after the break the
    # model keeps the weights of the last epoch, not of the best epoch.
    early_stopping(val_loss)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Adjust learning rate manually after 2 epochs
    # (epoch is zero-based, so epoch == 1 is the end of the second epoch)
    if epoch == 1:
        print("Reducing learning rate to 1e-6")
        for param_group in optimizer.param_groups:
            param_group['lr'] = 1e-6
                
    # Adjust learning rate if validation loss plateaus
    # NOTE(review): with only 4 epochs and the manual drop to 1e-6 above, this
    # scheduler presumably has little room to act - confirm it is still needed.
    scheduler.step(val_loss)

    # Clear GPU memory
    torch.cuda.empty_cache()

# After training, evaluate final performance
# (both passes run through evaluate(), i.e. in eval mode and without gradients)
train_loss, train_accuracy = evaluate(model, train_loader)
test_loss, test_accuracy = evaluate(model, test_loader)

print(f'Final Training Accuracy: {train_accuracy * 100:.2f}%')
print(f'Final Test Accuracy: {test_accuracy * 100:.2f}%')
