In [23]:
import numpy as np
import torch
from statistics import mode
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

In [24]:
'''Variables and parameters'''

MODEL1='bert-base-uncased'
MODEL2='microsoft/deberta-large'
MODEL3='roberta-base'
MODEL4='roberta-large'

MODEL_PATH1='SavedModels/bert-base-uncased5k'
MODEL_PATH2='SavedModels/deberta-large5k'
MODEL_PATH3='SavedModels/roberta-base5k'
MODEL_PATH4='SavedModels/roberta-large5k'

In [25]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [26]:
'''Load tokenizers and models'''

tokenizer1 = AutoTokenizer.from_pretrained(MODEL1)
model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH1)

tokenizer2 = AutoTokenizer.from_pretrained(MODEL2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH2)

tokenizer3 = AutoTokenizer.from_pretrained(MODEL3)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH3)

tokenizer4 = AutoTokenizer.from_pretrained(MODEL4)
model4 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH4)

pipe1 = pipeline("text-classification", model=model1, tokenizer=tokenizer1, device=0)
pipe2 = pipeline("text-classification", model=model2, tokenizer=tokenizer2, device=0)
pipe3 = pipeline("text-classification", model=model3, tokenizer=tokenizer3, device=0)
pipe4 = pipeline("text-classification", model=model4, tokenizer=tokenizer4, device=0)

In [27]:
'''Loading data'''

import pandas as pd,os
from imblearn.under_sampling import RandomUnderSampler

SAMPLES_TO_TRAIN=5000

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

# test_df=df.sample(round(SAMPLES_TO_TRAIN))

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42)

# Reset index
test_df = df.reset_index(drop=True)

# we balance the training set
print(f'Dataset size before balancing: {test_df.shape}')
counts = test_df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(test_df[['text']], test_df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {test_df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
test_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(test_df['label'].value_counts())

Dataset size before balancing: (5000, 2)
Dataset size after balancing: (5000, 1)
Entried dropped: 0

Balanced DataFrame:
label
0    2500
1    2500
Name: count, dtype: int64


In [28]:
'''Getting predictions from models'''

from tqdm import tqdm

test_texts = test_df['text'].tolist()

results1 = [pipe1(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe1")]
results2 = [pipe2(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe2")]
results3 = [pipe3(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe3")]
results4 = [pipe4(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe4")]

labels1 = [0 if item['label'] == 'LABEL_0' else 1 for d in results1 for item in d]
scores1 = [item['score'] for d in results1 for item in d]

labels2 = [0 if item['label'] == 'LABEL_0' else 1 for d in results2 for item in d]
scores2 = [item['score'] for d in results2 for item in d]

labels3 = [0 if item['label'] == 'LABEL_0' else 1 for d in results3 for item in d]
scores3 = [item['score'] for d in results3 for item in d]

labels4 = [0 if item['label'] == 'LABEL_0' else 1 for d in results4 for item in d]
scores4 = [item['score'] for d in results4 for item in d]


Processing with pipe1: 100%|██████████| 5000/5000 [00:42<00:00, 117.84it/s]
Processing with pipe2: 100%|██████████| 5000/5000 [02:18<00:00, 36.17it/s]
Processing with pipe3: 100%|██████████| 5000/5000 [00:43<00:00, 115.59it/s]
Processing with pipe4: 100%|██████████| 5000/5000 [01:20<00:00, 62.47it/s]


In [29]:
'''Get metrics'''
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def getMetrics(predicted_labels, true_labels):
    # Ensure the labels are numpy arrays
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='binary')
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')
    auc = roc_auc_score(true_labels, predicted_labels)

    # Create a dictionary of metrics
    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    }

    return metrics

In [30]:
print(getMetrics(labels1,test_df['label'].tolist()))
print(getMetrics(labels2,test_df['label'].tolist()))
print(getMetrics(labels3,test_df['label'].tolist()))
print(getMetrics(labels4,test_df['label'].tolist()))

{'accuracy': 0.732, 'f1': 0.6903881700554528, 'precision': 0.8172866520787746, 'recall': 0.5976, 'auc': 0.732}
{'accuracy': 0.6868, 'f1': 0.570958904109589, 'precision': 0.9060869565217391, 'recall': 0.4168, 'auc': 0.6868000000000001}
{'accuracy': 0.7944, 'f1': 0.7503642544924721, 'precision': 0.9548825710754018, 'recall': 0.618, 'auc': 0.7943999999999999}
{'accuracy': 0.839, 'f1': 0.8466958674538183, 'precision': 0.8080697928026173, 'recall': 0.8892, 'auc': 0.839}


In [31]:
'''Simple Ensemble models'''

# Assume predictions is a 2D numpy array where each column represents predictions from one model
predictions = np.array([labels1, labels2, labels3, labels4]).T

print('Majority Voting')

from scipy.stats import mode
import numpy as np

# Compute the mode along axis 1
final_predictions, _ = mode(predictions, axis=1)
final_predictions_MV = final_predictions.flatten()

print(getMetrics(final_predictions_MV,test_df['label'].tolist()))


print('\nWeighted Voting')
weights = np.array([0.4, 0.3, 0.2, 0.1])  # for example
weighted_predictions = np.average(predictions, axis=1, weights=weights)

# Round to nearest integer to get final class labels
final_predictions_WV = np.round(weighted_predictions).astype(int)
print(getMetrics(final_predictions_WV,test_df['label'].tolist()))

Majority Voting
{'accuracy': 0.7368, 'f1': 0.6556776556776557, 'precision': 0.9478063540090772, 'recall': 0.5012, 'auc': 0.7367999999999999}

Weighted Voting
{'accuracy': 0.803, 'f1': 0.7715147297610763, 'precision': 0.9182771949199338, 'recall': 0.6652, 'auc': 0.803}


In [32]:
'''optimize weights to get best metrics'''

from scipy.optimize import minimize

# This is the optimization objective function, it should return the value that you want to minimize
def objective(weights: np.ndarray, *args):
    weighted_predictions = np.average(predictions, axis=1, weights=weights)
    final_predictions = np.round(weighted_predictions).astype(int)
    
    metrics = getMetrics(final_predictions, args[0])  # args[0] is expected to be true labels
    f1_score = metrics['f1']
    
    return -f1_score  # We negate the f1_score because we want to maximize it

predictions = np.array([labels1, labels2, labels3, labels4]).T
true_labels = np.array(test_df['label'].tolist())

initial_weights = [0.25, 0.25, 0.25, 0.25]  # For example

# Weights should be bounded between 0 and 1
bounds = [(0, 1)] * len(initial_weights)

# Constraints: weights should sum to 1
constraints = ({'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1})

# Optimize weights
result = minimize(objective, initial_weights, args=(true_labels,), 
                  method='SLSQP', bounds=bounds, constraints=constraints)

# The optimal weights are now in result.x
optimal_weights = result.x

print("Optimal weights:", optimal_weights)

# like Majority Voting

Optimal weights: [0.25 0.25 0.25 0.25]


In [33]:
'''Complex Ensemble models'''

'''Stacking'''

from sklearn.linear_model import LogisticRegression

# Training the stacking model
meta_model = LogisticRegression()
meta_model.fit(predictions, test_df['label'].tolist())

final_predictions_ST=meta_model.predict(predictions)
print(getMetrics(final_predictions_ST,test_df['label'].tolist()))

{'accuracy': 0.8636, 'f1': 0.8533333333333334, 'precision': 0.9227906976744186, 'recall': 0.7936, 'auc': 0.8636}


In [34]:
'''Complex Ensemble models'''

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

'''Random Forest'''
# Create a DataFrame
df = pd.DataFrame({
    'Labels_Model1': labels1,
    'Scores_Model1': scores1,
    'Labels_Model2': labels2,
    'Scores_Model2': scores2,
    'Labels_Model3': labels3,
    'Scores_Model3': scores3,
    'Labels_Model4': labels4,
    'Scores_Model4': scores4,
})

labels = test_df['label'].tolist()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=42)

# Create a Random Forest Classifier and train it on the training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

print(getMetrics(y_pred,y_test))

{'accuracy': 0.892, 'f1': 0.8934911242603549, 'precision': 0.8813229571984436, 'recall': 0.906, 'auc': 0.8920000000000001}


In [38]:
print('''for the training dataset the best (5k samples) is random forest:\n{'accuracy': 0.9936305732484076, 'f1': 0.9936575052854123, 'precision': 0.9915611814345991, 'recall': 0.9957627118644068, 'auc': 0.9936260367832672}''')

print('''\nfor the dev dataset the best (5k samples) is random forest:\n{'accuracy': 0.892, 'f1': 0.8934911242603549, 'precision': 0.8813229571984436, 'recall': 0.906, 'auc': 0.8920000000000001}''')


for the training dataset the best (5k samples) is random forest:
{'accuracy': 0.9936305732484076, 'f1': 0.9936575052854123, 'precision': 0.9915611814345991, 'recall': 0.9957627118644068, 'auc': 0.9936260367832672}

for the dev dataset the best (5k samples) is random forest:
{'accuracy': 0.892, 'f1': 0.8934911242603549, 'precision': 0.8813229571984436, 'recall': 0.906, 'auc': 0.8920000000000001}
