In [6]:
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

import pandas as pd,os
import torch
from statistics import mode
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import pickle

In [7]:
MODEL1 = 'bert-base-uncased'
MODEL2 = 'Hello-SimpleAI/chatgpt-detector-roberta'
MODEL3 = 'roberta-base-openai-detector'
MODEL4 = 'roberta-base'
MODEL5 = 'distilbert-base-uncased'
MODEL6 = 'google/electra-base-discriminator'

MODEL_PATH1='SavedModels/bert-base-uncased12k'
MODEL_PATH2='SavedModels/chatgpt-detector-roberta12k'
MODEL_PATH3='SavedModels/roberta-base-openai-detector12k'
MODEL_PATH4='SavedModels/roberta-base1k'
MODEL_PATH5='SavedModels/distilbert-base-uncased15k'
MODEL_PATH6='SavedModels/electra-base-discriminator9k'

'''Load tokenizers and models'''

tokenizer1 = AutoTokenizer.from_pretrained(MODEL1)
model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH1)

tokenizer2 = AutoTokenizer.from_pretrained(MODEL2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH2)

tokenizer3 = AutoTokenizer.from_pretrained(MODEL3)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH3)

tokenizer4 = AutoTokenizer.from_pretrained(MODEL4)
model4 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH4)

tokenizer5 = AutoTokenizer.from_pretrained(MODEL5)
model5 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH5)

tokenizer6 = AutoTokenizer.from_pretrained(MODEL6)
model6 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH6)

pipe1 = pipeline("text-classification", model=model1, tokenizer=tokenizer1, device=0)
pipe2 = pipeline("text-classification", model=model2, tokenizer=tokenizer2, device=0)
pipe3 = pipeline("text-classification", model=model3, tokenizer=tokenizer3, device=0)
pipe4 = pipeline("text-classification", model=model4, tokenizer=tokenizer4, device=0)
pipe5 = pipeline("text-classification", model=model5, tokenizer=tokenizer5, device=0)
pipe6 = pipeline("text-classification", model=model6, tokenizer=tokenizer6, device=0)

In [8]:
'''Preparing data'''

df = pd.read_json('datasets/subtaskA_dev_monolingual.jsonl', lines=True)

print(f'Original dataset')
print(df.info())
print(f'''\n{df['label'].value_counts()}''')
print(f'''\n{df['model'].value_counts()}''')
print(f'''\n{df['source'].value_counts()}''')

df = df[['text', 'label']]
test_df=df

test_texts = test_df['text'].tolist()

results1 = [pipe1(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL1}\t\t\t")]
results2 = [pipe2(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL2}\t")]
results3 = [pipe3(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL3}\t\t")]
results4 = [pipe4(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL4}\t\t\t\t")]
results5 = [pipe5(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL5}\t\t\t")]
results6 = [pipe6(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL6}\t")]

Original dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
 2   model   5000 non-null   object
 3   source  5000 non-null   object
 4   id      5000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 195.4+ KB
None

label
1    2500
0    2500
Name: count, dtype: int64

model
bloomz    2500
human     2500
Name: count, dtype: int64

source
wikihow      1000
wikipedia    1000
reddit       1000
arxiv        1000
peerread     1000
Name: count, dtype: int64


Processing with bert-base-uncased			: 100%|██████████| 5000/5000 [00:42<00:00, 117.89it/s]
Processing with Hello-SimpleAI/chatgpt-detector-roberta	: 100%|██████████| 5000/5000 [00:43<00:00, 115.35it/s]
Processing with roberta-base-openai-detector		: 100%|██████████| 5000/5000 [00:43<00:00, 113.89it/s]
Processing with roberta-base				: 100%|██████████| 5000/5000 [00:42<00:00, 117.40it/s]
Processing with distilbert-base-uncased		: 100%|██████████| 5000/5000 [00:26<00:00, 191.02it/s]
Processing with google/electra-base-discriminator		: 100%|██████████| 5000/5000 [00:41<00:00, 119.16it/s]


In [9]:
def getMetrics(predicted_labels, true_labels):
    # Ensure the labels are numpy arrays
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = precision_recall_fscore_support(true_labels, predicted_labels, average='macro')[2]
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')
    auc = roc_auc_score(true_labels, predicted_labels)
    cm = confusion_matrix(true_labels, predicted_labels)

    # Create a dictionary of metrics
    metrics = {
        'f1': f1,
        'confusion_matrix': cm.tolist(),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'auc': auc,
    }

    return metrics

labels1 = [0 if item['label'] == 'human' else 1 for d in results1 for item in d]
scores1 = [item['score'] for d in results1 for item in d]

labels2 = [0 if item['label'] == 'human' else 1 for d in results2 for item in d]
scores2 = [item['score'] for d in results2 for item in d]

labels3 = [0 if item['label'] == 'human' else 1 for d in results3 for item in d]
scores3 = [item['score'] for d in results3 for item in d]

labels4 = [0 if item['label'] == 'human' else 1 for d in results4 for item in d]
scores4 = [item['score'] for d in results4 for item in d]

labels5 = [0 if item['label'] == 'human' else 1 for d in results5 for item in d]
scores5 = [item['score'] for d in results5 for item in d]

labels6 = [0 if item['label'] == 'human' else 1 for d in results6 for item in d]
scores6 = [item['score'] for d in results6 for item in d]

print(MODEL1)
print(getMetrics(labels1,test_df['label'].tolist()))
print(MODEL2)
print(getMetrics(labels2,test_df['label'].tolist()))
print(MODEL3)
print(getMetrics(labels3,test_df['label'].tolist()))
print(MODEL4)
print(getMetrics(labels4,test_df['label'].tolist()))
print(MODEL5)
print(getMetrics(labels5,test_df['label'].tolist()))
print(MODEL6)
print(getMetrics(labels6,test_df['label'].tolist()))

bert-base-uncased
{'f1': 0.6305998782778969, 'confusion_matrix': [[2215, 285], [1460, 1040]], 'accuracy': 0.651, 'precision': 0.7849056603773585, 'recall': 0.416, 'auc': 0.6509999999999999}
Hello-SimpleAI/chatgpt-detector-roberta
{'f1': 0.6448845285349909, 'confusion_matrix': [[2298, 202], [1461, 1039]], 'accuracy': 0.6674, 'precision': 0.8372280419016922, 'recall': 0.4156, 'auc': 0.6674}
roberta-base-openai-detector
{'f1': 0.7094461014998555, 'confusion_matrix': [[2320, 180], [1211, 1289]], 'accuracy': 0.7218, 'precision': 0.8774676650782846, 'recall': 0.5156, 'auc': 0.7218}
roberta-base
{'f1': 0.8025985097631888, 'confusion_matrix': [[2146, 354], [630, 1870]], 'accuracy': 0.8032, 'precision': 0.8408273381294964, 'recall': 0.748, 'auc': 0.8032}
distilbert-base-uncased
{'f1': 0.6843230995648157, 'confusion_matrix': [[1873, 627], [945, 1555]], 'accuracy': 0.6856, 'precision': 0.7126489459211732, 'recall': 0.622, 'auc': 0.6856}
google/electra-base-discriminator
{'f1': 0.7379069127755429,

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

models = [MODEL1, MODEL2, MODEL3, MODEL4, MODEL5, MODEL6]

df = pd.DataFrame({
    f'Labels_{MODEL1}': labels1,
    f'Scores_{MODEL1}': scores1,
    f'Labels_{MODEL2}': labels2,
    f'Scores_{MODEL2}': scores2,
    f'Labels_{MODEL3}': labels3,
    f'Scores_{MODEL3}': scores3,
    f'Labels_{MODEL4}': labels4,
    f'Scores_{MODEL4}': scores4,
    f'Labels_{MODEL5}': labels5,
    f'Scores_{MODEL5}': scores5,
    f'Labels_{MODEL6}': labels6,
    f'Scores_{MODEL6}': scores6,
})

labels = test_df['label'].tolist()

df.info()

def ensemble_methods(df, models):
    
    majority_labels = []
    score_based_labels = []
    rank_voting_labels = []
    borda_count_labels = []

    for index, row in df.iterrows():
        label_counts = {0: 0, 1: 0}
        score_sums = {0: 0.0, 1: 0.0}
        weighted_scores = {0: 0.0, 1: 0.0}

        for i, model in enumerate(models):
            label = row[f'Labels_{model}']
            score = row[f'Scores_{model}']
            label_counts[label] += 1
            score_sums[label] += score

        # Majority Voting
        majority_label = 0 if label_counts[0] > label_counts[1] else 1
        majority_labels.append(majority_label)

        # Soft Voting
        avg_score_0 = score_sums[0] / (label_counts[0] if label_counts[0] else 1)
        avg_score_1 = score_sums[1] / (label_counts[1] if label_counts[1] else 1)
        score_based_label = 0 if avg_score_0 > avg_score_1 else 1
        score_based_labels.append(score_based_label)
        
        # Rank Voting
        ranks = [row[f'Scores_{model}'] for model in models]
        ranked_labels = [label for _, label in sorted(zip(ranks, [row[f'Labels_{model}'] for model in models]))]
        rank_voting_labels.append(ranked_labels[0])  # The label with the lowest rank

        # Borda Count
        borda_scores = {0: 0, 1: 0}
        for rank, label in enumerate(ranked_labels):
            borda_scores[label] += (len(models) - rank)
        borda_count_labels.append(max(borda_scores, key=borda_scores.get))
        

    return {
        'Majority Voting':majority_labels,
        'Soft Voting':score_based_labels,
        'Rank Voting':rank_voting_labels,
        'Borda Count':borda_count_labels,
    }

ensemble_results = ensemble_methods(df, models)
    
finalScore=getMetrics(ensemble_results['Majority Voting'],labels)['f1']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Labels_bert-base-uncased                        5000 non-null   int64  
 1   Scores_bert-base-uncased                        5000 non-null   float64
 2   Labels_Hello-SimpleAI/chatgpt-detector-roberta  5000 non-null   int64  
 3   Scores_Hello-SimpleAI/chatgpt-detector-roberta  5000 non-null   float64
 4   Labels_roberta-base-openai-detector             5000 non-null   int64  
 5   Scores_roberta-base-openai-detector             5000 non-null   float64
 6   Labels_roberta-base                             5000 non-null   int64  
 7   Scores_roberta-base                             5000 non-null   float64
 8   Labels_distilbert-base-uncased                  5000 non-null   int64  
 9   Scores_distilbert-base-uncased           

0.7515386275971341

In [20]:
import itertools
# Let's define a list of all models you have
all_models = [MODEL1, MODEL2, MODEL3, MODEL4, MODEL5, MODEL6]

# This will store the best F1 score and the corresponding model combination
best_f1_score = 0
best_model_combination = None

# Try all possible combinations of the models
for r in range(1, len(all_models) + 1):
    for model_combination in itertools.combinations(all_models, r):
        # Generate the predictions using the ensemble of the current combination of models
        ensemble_results = ensemble_methods(df, model_combination)
        
        # Calculate the F1 score for majority voting (you could extend this to other methods)
        f1_score = getMetrics(ensemble_results['Majority Voting'], labels)['f1']
        
        # Update the best combination if the current one is better
        if f1_score > best_f1_score:
            best_f1_score = f1_score
            best_model_combination = model_combination

# Print the best combination and its score
print(f"Best F1 Score: {best_f1_score}")
print(f"Best Model Combination: {best_model_combination}")

Best F1 Score: 0.840331597415255
Best Model Combination: ('roberta-base', 'google/electra-base-discriminator')


In [25]:
# MODEL1 = 'bert-base-uncased'
# MODEL2 = 'Hello-SimpleAI/chatgpt-detector-roberta'
# MODEL3 = 'roberta-base-openai-detector'
# MODEL4 = 'roberta-base'
# MODEL5 = 'distilbert-base-uncased'
# MODEL6 = 'google/electra-base-discriminator'

models = [MODEL4, MODEL6]

df = pd.DataFrame({
    f'Labels_{MODEL4}': labels4,
    f'Scores_{MODEL4}': scores4,
    f'Labels_{MODEL6}': labels6,
    f'Scores_{MODEL6}': scores6,
})

ensemble_results = ensemble_methods(df, models)
    
finalScore=getMetrics(ensemble_results['Majority Voting'],labels)['f1']

for name,labelsEnsemble in ensemble_results.items():
    print(f'{name}\n{getMetrics(labels,labelsEnsemble)}')
    
finalScore

Majority Voting
{'f1': 0.840331597415255, 'confusion_matrix': [[1999, 296], [501, 2204]], 'accuracy': 0.8406, 'precision': 0.8816, 'recall': 0.8147874306839187, 'auc': 0.8429056979127655}
Soft Voting
{'f1': 0.8217486306432049, 'confusion_matrix': [[2249, 635], [251, 1865]], 'accuracy': 0.8228, 'precision': 0.746, 'recall': 0.8813799621928167, 'auc': 0.8305998285305276}
Rank Voting
{'f1': 0.7195076525735385, 'confusion_matrix': [[2103, 986], [397, 1514]], 'accuracy': 0.7234, 'precision': 0.6056, 'recall': 0.7922553636839351, 'auc': 0.7365291062511614}
Borda Count
{'f1': 0.7195076525735385, 'confusion_matrix': [[2103, 986], [397, 1514]], 'accuracy': 0.7234, 'precision': 0.6056, 'recall': 0.7922553636839351, 'auc': 0.7365291062511614}


0.840331597415255