In [1]:
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

import pandas as pd,os
import torch
from statistics import mode
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''Variables and parameters transfer learning'''

SAMPLES_TO_TRAIN = 1000

MODEL1 = 'bert-base-uncased'
MODEL2 = 'Hello-SimpleAI/chatgpt-detector-roberta'
MODEL3 = 'roberta-base-openai-detector'
MODEL4 = 'roberta-base'
MODEL5 = 'distilbert-base-uncased'
MODEL6 = 'google/electra-base-discriminator'

MODEL_PATH1='SavedModels/bert-base-uncased12k'
MODEL_PATH2='SavedModels/chatgpt-detector-roberta12k'
MODEL_PATH3='SavedModels/roberta-base-openai-detector12k'
MODEL_PATH4='SavedModels/roberta-base1k'
MODEL_PATH5='SavedModels/distilbert-base-uncased15k'
MODEL_PATH6='SavedModels/electra-base-discriminator9k'

'''Load tokenizers and models'''

tokenizer1 = AutoTokenizer.from_pretrained(MODEL1)
model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH1)

tokenizer2 = AutoTokenizer.from_pretrained(MODEL2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH2)

tokenizer3 = AutoTokenizer.from_pretrained(MODEL3)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH3)

tokenizer4 = AutoTokenizer.from_pretrained(MODEL4)
model4 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH4)

tokenizer5 = AutoTokenizer.from_pretrained(MODEL5)
model5 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH5)

tokenizer6 = AutoTokenizer.from_pretrained(MODEL6)
model6 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH6)

pipe1 = pipeline("text-classification", model=model1, tokenizer=tokenizer1, device=0)
pipe2 = pipeline("text-classification", model=model2, tokenizer=tokenizer2, device=0)
pipe3 = pipeline("text-classification", model=model3, tokenizer=tokenizer3, device=0)
pipe4 = pipeline("text-classification", model=model4, tokenizer=tokenizer4, device=0)
pipe5 = pipeline("text-classification", model=model5, tokenizer=tokenizer5, device=0)
pipe6 = pipeline("text-classification", model=model6, tokenizer=tokenizer6, device=0)

In [3]:
'''Preparing data'''

df = pd.read_json('datasets/subtaskA_train_monolingual.jsonl', lines=True)
df = df[['text', 'label']]
test_df=df.sample(SAMPLES_TO_TRAIN)

test_texts = test_df['text'].tolist()

results1 = [pipe1(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL1}\t\t\t")]
results2 = [pipe2(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL2}\t")]
results3 = [pipe3(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL3}\t\t")]
results4 = [pipe4(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL4}\t\t\t\t")]
results5 = [pipe5(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL5}\t\t\t")]
results6 = [pipe6(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL6}\t")]

Processing with bert-base-uncased			: 100%|██████████| 1000/1000 [00:09<00:00, 105.96it/s]
Processing with Hello-SimpleAI/chatgpt-detector-roberta	: 100%|██████████| 1000/1000 [00:09<00:00, 107.08it/s]
Processing with roberta-base-openai-detector		: 100%|██████████| 1000/1000 [00:09<00:00, 106.69it/s]
Processing with roberta-base				: 100%|██████████| 1000/1000 [00:09<00:00, 107.16it/s]
Processing with distilbert-base-uncased		: 100%|██████████| 1000/1000 [00:05<00:00, 179.62it/s]
Processing with google/electra-base-discriminator	: 100%|██████████| 1000/1000 [00:08<00:00, 111.25it/s]


In [5]:
def getMetrics(predicted_labels, true_labels):
    # Ensure the labels are numpy arrays
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = precision_recall_fscore_support(true_labels, predicted_labels, average='macro')[2]
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')
    auc = roc_auc_score(true_labels, predicted_labels)
    cm = confusion_matrix(true_labels, predicted_labels)

    # Create a dictionary of metrics
    metrics = {
        'f1': f1,
        'confusion_matrix': cm.tolist(),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'auc': auc,
    }

    return metrics

labels1 = [0 if item['label'] == 'human' else 1 for d in results1 for item in d]
scores1 = [item['score'] for d in results1 for item in d]

labels2 = [0 if item['label'] == 'human' else 1 for d in results2 for item in d]
scores2 = [item['score'] for d in results2 for item in d]

labels3 = [0 if item['label'] == 'human' else 1 for d in results3 for item in d]
scores3 = [item['score'] for d in results3 for item in d]

labels4 = [0 if item['label'] == 'human' else 1 for d in results4 for item in d]
scores4 = [item['score'] for d in results4 for item in d]

labels5 = [0 if item['label'] == 'human' else 1 for d in results5 for item in d]
scores5 = [item['score'] for d in results5 for item in d]

labels6 = [0 if item['label'] == 'human' else 1 for d in results6 for item in d]
scores6 = [item['score'] for d in results6 for item in d]

print(MODEL1)
print(getMetrics(labels1,test_df['label'].tolist()))
print(MODEL2)
print(getMetrics(labels2,test_df['label'].tolist()))
print(MODEL3)
print(getMetrics(labels3,test_df['label'].tolist()))
print(MODEL4)
print(getMetrics(labels4,test_df['label'].tolist()))
print(MODEL5)
print(getMetrics(labels5,test_df['label'].tolist()))
print(MODEL6)
print(getMetrics(labels6,test_df['label'].tolist()))

bert-base-uncased
{'f1': 0.9197109594540345, 'confusion_matrix': [[490, 48], [32, 430]], 'accuracy': 0.92, 'precision': 0.899581589958159, 'recall': 0.9307359307359307, 'auc': 0.9207582999404562}
Hello-SimpleAI/chatgpt-detector-roberta
{'f1': 0.9639168644557059, 'confusion_matrix': [[506, 32], [4, 458]], 'accuracy': 0.964, 'precision': 0.9346938775510204, 'recall': 0.9913419913419913, 'auc': 0.9659312187193229}
roberta-base-openai-detector
{'f1': 0.9689253898610564, 'confusion_matrix': [[509, 29], [2, 460]], 'accuracy': 0.969, 'precision': 0.9406952965235174, 'recall': 0.9956709956709957, 'auc': 0.9708838249730443}
roberta-base
{'f1': 0.9469107569824875, 'confusion_matrix': [[494, 44], [9, 453]], 'accuracy': 0.947, 'precision': 0.9114688128772636, 'recall': 0.9805194805194806, 'auc': 0.94936754695119}
distilbert-base-uncased
{'f1': 0.8729632863897667, 'confusion_matrix': [[428, 110], [17, 445]], 'accuracy': 0.873, 'precision': 0.8018018018018018, 'recall': 0.9632034632034632, 'auc': 0.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

df = pd.DataFrame({
    f'Labels_{MODEL1}': labels1,
    f'Scores_{MODEL1}': scores1,
    f'Labels_{MODEL2}': labels2,
    f'Scores_{MODEL2}': scores2,
    f'Labels_{MODEL3}': labels3,
    f'Scores_{MODEL3}': scores3,
    f'Labels_{MODEL4}': labels4,
    f'Scores_{MODEL4}': scores4,
})

labels = test_df['label'].tolist()

df.info()

def compute_labels_and_scores(df, models):
    majority_labels = []
    score_based_labels = []

    for index, row in df.iterrows():
        label_counts = {0: 0, 1: 0}
        score_sums = {0: 0.0, 1: 0.0}

        # Count labels and sum scores for each label category
        for model in models:
            label = row[f'Labels_{model}']
            score = row[f'Scores_{model}']
            label_counts[label] += 1
            score_sums[label] += score

        # Check for majority label
        if label_counts[0] > label_counts[1]:
            majority_labels.append(0)
        elif label_counts[1] > label_counts[0]:
            majority_labels.append(1)
        else:  # Tiebreaker case
            # Prevent division by zero by adding a small epsilon if count is zero
            avg_score_0 = score_sums[0] / (label_counts[0] if label_counts[0] else 1)
            avg_score_1 = score_sums[1] / (label_counts[1] if label_counts[1] else 1)
            majority_labels.append(0 if avg_score_0 > avg_score_1 else 1)

        # For score-based label
        # Avoid division by zero by checking if counts are non-zero before dividing
        avg_score_0 = score_sums[0] / (label_counts[0] if label_counts[0] else 1)
        avg_score_1 = score_sums[1] / (label_counts[1] if label_counts[1] else 1)
        score_based_labels.append(0 if avg_score_0 > avg_score_1 else 1)

    return majority_labels, score_based_labels

# Assuming df is your DataFrame and models is a list of your model names
models = [MODEL1, MODEL2, MODEL3, MODEL4]
majority_labels, score_based_labels = compute_labels_and_scores(df, models)

print(getMetrics(majority_labels,labels))
print(getMetrics(score_based_labels,labels))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Labels_bert-base-uncased                        500 non-null    int64  
 1   Scores_bert-base-uncased                        500 non-null    float64
 2   Labels_Hello-SimpleAI/chatgpt-detector-roberta  500 non-null    int64  
 3   Scores_Hello-SimpleAI/chatgpt-detector-roberta  500 non-null    float64
 4   Labels_roberta-base-openai-detector             500 non-null    int64  
 5   Scores_roberta-base-openai-detector             500 non-null    float64
 6   Labels_roberta-base                             500 non-null    int64  
 7   Scores_roberta-base                             500 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 31.4 KB
{'f1': 0.9879807692307692, 'confusion_matrix': [[257, 4], [2, 237]], 'accur

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

df = pd.DataFrame({
    f'Labels_{MODEL1}': labels1,
    f'Scores_{MODEL1}': scores1,
    f'Labels_{MODEL2}': labels2,
    f'Scores_{MODEL2}': scores2,
    f'Labels_{MODEL3}': labels3,
    f'Scores_{MODEL3}': scores3,
    f'Labels_{MODEL4}': labels4,
    f'Scores_{MODEL4}': scores4,
})

labels = test_df['label'].tolist()

df.info()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

# 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 250
clf = RandomForestClassifier(n_estimators=250, random_state=42,min_samples_split=3,min_samples_leaf=1,max_depth=None,max_features='log2')
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

print(getMetrics(y_pred,y_test))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 8 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Labels_bert-base-uncased                        20000 non-null  int64  
 1   Scores_bert-base-uncased                        20000 non-null  float64
 2   Labels_Hello-SimpleAI/chatgpt-detector-roberta  20000 non-null  int64  
 3   Scores_Hello-SimpleAI/chatgpt-detector-roberta  20000 non-null  float64
 4   Labels_roberta-base-openai-detector             20000 non-null  int64  
 5   Scores_roberta-base-openai-detector             20000 non-null  float64
 6   Labels_roberta-base                             20000 non-null  int64  
 7   Scores_roberta-base                             20000 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 1.2 MB
(16000, 8)
(4000, 8)
{'f1': 0.9922046775078228, 'confusion_matrix': [[21

In [None]:
with open('SavedModels/'+'ensemble_randomforest_train.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split your data
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=42)

# Define the parameter grid

# 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
param_grid = {
    'n_estimators': [200,250,300,350],
    'max_depth': [None],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    # Add more parameters here if you wish
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='f1_macro')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Use the best estimator to make predictions
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the best model with your getMetrics function
print(getMetrics(y_pred, y_test))


Fitting 3 folds for each of 96 candidates, totalling 288 fits


96 fits failed out of a total of 288.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Ghiki\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Ghiki\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\Ghiki\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Ghiki\AppData\Local\Programs\Python\Python311\Lib

Best parameters found:  {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 250}
{'f1': 0.9922046775078228, 'confusion_matrix': [[2137, 26], [5, 1832]], 'accuracy': 0.99225, 'precision': 0.9860064585575888, 'recall': 0.9972781709308656, 'auc': 0.992628914406718}
