In [1]:
import numpy as np
import torch
from statistics import mode
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
'''Variables and parameters transfer learning'''

# Number of rows drawn at random for this evaluation run.
SAMPLES_TO_TRAIN = 20_000

# Hugging Face identifiers used to load the tokenizers (and to label results).
MODEL1 = 'bert-base-uncased'
MODEL2 = 'microsoft/deberta-large'
MODEL3 = 'roberta-base'
MODEL4 = 'roberta-large'
MODEL5 = 'Hello-SimpleAI/chatgpt-detector-roberta'
MODEL6 = 'roberta-base-openai-detector'

# NOTE(review): MODEL7 and MODEL8 repeat MODEL5's identifier and are not
# referenced by the cells that load the models - confirm whether they are needed.
MODEL7 = 'Hello-SimpleAI/chatgpt-detector-roberta'
MODEL8 = 'Hello-SimpleAI/chatgpt-detector-roberta'

# Local checkpoint directories handed to `from_pretrained` for the model weights.
# The 5k/20k suffix presumably records the fine-tuning sample count - TODO confirm.
MODEL_PATH1 = 'SavedModels/bert-base-uncased20k'
MODEL_PATH2 = 'SavedModels/deberta-large5k'
MODEL_PATH3 = 'SavedModels/roberta-base20k'
MODEL_PATH4 = 'SavedModels/roberta-large5k'
MODEL_PATH5 = 'SavedModels/chatgpt-detector-roberta5k'
MODEL_PATH6 = 'SavedModels/roberta-base-openai-detector20k'

In [18]:
'''Load tokenizers and models'''

def _build_classifier(base_name, checkpoint_path, device):
    """Load one fine-tuned classifier and wrap it in a text-classification pipeline.

    Args:
        base_name: Hugging Face identifier the tokenizer is loaded from.
        checkpoint_path: Local directory the sequence-classification weights are loaded from.
        device: Device index forwarded to `pipeline` (0 = first GPU, -1 = CPU).

    Returns:
        Tuple `(tokenizer, model, pipe)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_name)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
    return tokenizer, model, pipe


# Use the first GPU when one is available and fall back to CPU otherwise, so the
# notebook does not crash on CUDA-less machines (mirrors the device check used
# for the custom models).
PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1

tokenizer1, model1, pipe1 = _build_classifier(MODEL1, MODEL_PATH1, PIPELINE_DEVICE)
tokenizer2, model2, pipe2 = _build_classifier(MODEL2, MODEL_PATH2, PIPELINE_DEVICE)
tokenizer3, model3, pipe3 = _build_classifier(MODEL3, MODEL_PATH3, PIPELINE_DEVICE)
tokenizer4, model4, pipe4 = _build_classifier(MODEL4, MODEL_PATH4, PIPELINE_DEVICE)
tokenizer5, model5, pipe5 = _build_classifier(MODEL5, MODEL_PATH5, PIPELINE_DEVICE)
tokenizer6, model6, pipe6 = _build_classifier(MODEL6, MODEL_PATH6, PIPELINE_DEVICE)

In [19]:
'''Custom model architectures'''
import torch
import torch.nn as nn
import torch.nn.functional as F
# input_dim: width of each feature vector fed to the custom models.
# output_dim: number of classes (unique values of y).
input_dim, output_dim = 200, 2

class Network(nn.Module):
    """Fully connected classifier with one residual (skip) connection.

    Layout: input_dim -> 512 -> 512 (+ skip from the first block) -> 256 -> output_dim.
    Each hidden block is Linear -> LeakyReLU -> BatchNorm1d -> Dropout(0.5).
    """

    def __init__(self):
        super().__init__()
        # Attribute names are the state-dict keys of the saved checkpoint,
        # so they must not be renamed.
        self.linear1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)

        self.linear2 = nn.Linear(512, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.5)

        self.linear3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.5)

        self.linear4 = nn.Linear(256, output_dim)

        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        """Return the logits, or `(loss, logits)` when `labels` is given.

        Args:
            input_ids: Float tensor whose last dimension is `input_dim`.
            labels: Optional class-index tensor passed to the cross-entropy loss.
        """
        block1 = self.dropout1(self.bn1(F.leaky_relu(self.linear1(input_ids))))
        block2 = self.dropout2(self.bn2(F.leaky_relu(self.linear2(block1))))

        # Skip connection: add the first block's activations to the second's.
        block2 = block2 + block1

        block3 = self.dropout3(self.bn3(F.leaky_relu(self.linear3(block2))))
        logits = self.linear4(block3)

        if labels is None:
            return logits
        return (self.loss(logits, labels), logits)
class CNN1D(nn.Module):
    """1-D convolutional classifier over a single-channel feature vector.

    Two Conv1d -> BatchNorm1d -> ReLU stages (100 and 150 channels), dropout,
    then a 256-unit dense layer and the `output_dim`-way output layer.
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv1d(in_channels=1, out_channels=100, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(100)
        self.conv2 = nn.Conv1d(in_channels=100, out_channels=150, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(150)
        self.dropout1 = nn.Dropout(0.5)

        # kernel_size=5 / stride=1 / padding=2 keeps the sequence length, so the
        # flattened size is 150 channels * input_dim positions. Using `input_dim`
        # (instead of a literal 200) keeps this model in step with the other
        # custom models if the feature width ever changes.
        self.fc1 = nn.Linear(150 * input_dim, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, output_dim)

    def forward(self, x, labels=None):
        """Return the logits, or `(loss, logits)` when `labels` is given.

        Args:
            x: Float tensor of shape (batch, 1, input_dim).
            labels: Optional class-index tensor for the cross-entropy loss.
        """
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout1(x)

        # Flatten (batch, channels, length) to (batch, channels * length) for the dense layers.
        x = torch.flatten(x, 1)
        x = self.dropout2(F.relu(self.fc1(x)))
        x = self.fc2(x)

        if labels is not None:
            return F.cross_entropy(x, labels), x

        return x
class RNNModel(nn.Module):
    """Two stacked LSTM layers (512 units each) followed by a linear classifier.

    Each LSTM output is passed through LayerNorm and Dropout(0.2).
    """

    def __init__(self):
        super().__init__()

        # Attribute names are the state-dict keys of the saved checkpoint.
        self.lstm1 = nn.LSTM(input_dim, 512, batch_first=True)
        self.ln1 = nn.LayerNorm(512)
        self.dropout1 = nn.Dropout(0.2)

        self.lstm2 = nn.LSTM(512, 512, batch_first=True)
        self.ln2 = nn.LayerNorm(512)
        self.dropout2 = nn.Dropout(0.2)

        self.fc = nn.Linear(512, output_dim)

        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        """Return the logits, or `(loss, logits)` when `labels` is given.

        NOTE(review): `input_ids` is handed to `nn.LSTM` unchanged. If callers
        pass a 2-D (rows, input_dim) tensor, PyTorch appears to treat it as one
        unbatched sequence, so rows would share recurrent state - confirm this
        matches how the checkpoint was trained before changing it.
        """
        seq, _ = self.lstm1(input_ids)
        seq = self.dropout1(self.ln1(seq))

        seq, _ = self.lstm2(seq)
        seq = self.dropout2(self.ln2(seq))

        logits = self.fc(seq)

        if labels is None:
            return logits
        return (self.loss(logits, labels), logits)


In [20]:
'''Custom model preparation'''

CUSTOM_MODEL_NAME1 = 'dense'
CUSTOM_MODEL_NAME2 = 'cnn'
CUSTOM_MODEL_NAME3 = 'lstm'
CUSTOM_MODEL_NAME4 = 'random_forest'

CUSTOM_MODEL_NAME_PATH_1 = './SavedModels/dense_0k'
CUSTOM_MODEL_NAME_PATH_2 = './SavedModels/cnn_0k'
CUSTOM_MODEL_NAME_PATH_3 = './SavedModels/lstm_0k'
CUSTOM_MODEL_NAME_PATH_4 = './SavedModels/randomforest_0k.pkl'


def _load_custom_model(model, checkpoint_dir):
    """Load `<checkpoint_dir>/pytorch_model.bin` into `model` and return `model`.

    `map_location="cpu"` lets a checkpoint that was saved from a GPU be loaded
    on a machine without CUDA; the model can be moved to another device afterwards.
    """
    state_dict = torch.load(checkpoint_dir + "/pytorch_model.bin", map_location="cpu")
    model.load_state_dict(state_dict)
    return model


custom_model1 = _load_custom_model(Network(), CUSTOM_MODEL_NAME_PATH_1)
custom_model2 = _load_custom_model(CNN1D(), CUSTOM_MODEL_NAME_PATH_2)
custom_model3 = _load_custom_model(RNNModel(), CUSTOM_MODEL_NAME_PATH_3)

# NOTE(security): pickle.load can execute arbitrary code - only load files
# produced by a trusted source.
with open(CUSTOM_MODEL_NAME_PATH_4, 'rb') as file:
    custom_model4 = pickle.load(file)

In [21]:
'''Preparing custom data'''

# NOTE(security): pickle.load can execute arbitrary code - only load files
# produced by a trusted source.
with open('datasets/subtaskA_glove_train_dev_monolingual.pkl', 'rb') as f:
    loaded_datasets = pickle.load(f)

# Accessing loaded datasets
loaded_train_x = loaded_datasets['train_x']
loaded_train_y = loaded_datasets['train_y']

# The train split is reused as the "dev" data here; the pickled dev split
# (loaded_datasets['dev_x'] / loaded_datasets['dev_y']) is not used.
loaded_dev_x = loaded_train_x
loaded_dev_y = loaded_train_y

# Seed the sampler so that Restart & Run All selects the same rows every time;
# `random_indices` is reused later to pick the matching raw-text rows.
SAMPLING_SEED = 42
sampling_rng = np.random.default_rng(SAMPLING_SEED)
random_indices = sampling_rng.choice(loaded_dev_x.shape[0], size=SAMPLES_TO_TRAIN, replace=False)

# Extract the samples from both the array and the series using the same indices
loaded_dev_x = loaded_dev_x[random_indices, :]
loaded_dev_y = loaded_dev_y.iloc[random_indices]

print(loaded_dev_x.shape)
print(loaded_dev_y.shape)

class Data(torch.utils.data.Dataset):
    """Map-style dataset yielding `{'input_ids', 'labels'}` items for the dense/LSTM models.

    Subclasses `torch.utils.data.Dataset` explicitly: the bare name `Dataset`
    in this notebook is imported from Hugging Face `datasets`, whose class is
    not meant to be subclassed with a custom `__init__` like this one.

    Args:
        X_train: NumPy feature array; converted to a float32 tensor.
        y_train: NumPy label array; converted to a long (int64) tensor.
    """

    def __init__(self, X_train, y_train):
        self.X = torch.from_numpy(X_train.astype(np.float32))
        self.y = torch.from_numpy(y_train).type(torch.LongTensor)
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        """Return the feature row and label stored at `index`."""
        return {'input_ids': self.X[index], 'labels': self.y[index]}

    def __len__(self):
        """Return the number of rows in the feature tensor."""
        return self.len

class DataCnn(torch.utils.data.Dataset):
    """Map-style dataset yielding `{'x', 'label', 'label_ids'}` items for the CNN model.

    Subclasses `torch.utils.data.Dataset` explicitly: the bare name `Dataset`
    in this notebook is imported from Hugging Face `datasets`, whose class is
    not meant to be subclassed with a custom `__init__` like this one.

    Args:
        X: Array-like features; stored as float32 with a channel axis inserted
            at dim 1, i.e. (rows, 1, features) for a 2-D input.
        y: Array-like labels; stored as a long (int64) tensor.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(1)
        self.y = torch.tensor(y, dtype=torch.long)
        self.len = len(X)

    def __getitem__(self, index):
        """Return the sample at `index`; `label_ids` carries the index itself."""
        return {'x': self.X[index], 'label': self.y[index], 'label_ids': index}

    def __len__(self):
        """Return the number of samples."""
        return self.len

# Convert the sampled label Series to a NumPy array once and reuse it.
_sampled_label_array = loaded_dev_y.values

denseData = Data(loaded_dev_x, _sampled_label_array)
cnnData = DataCnn(loaded_dev_x, _sampled_label_array)

# From here on `loaded_dev_y` is a plain NumPy array rather than a Series.
loaded_dev_y = _sampled_label_array

(20000, 200)
(20000,)


In [22]:
'''Getting predictions from custom models'''

from torch.utils.data import DataLoader
import torch

def _predict_labels(model, dataset, input_key, device, batch_size=32):
    """Run `model` over `dataset` in order and return the predicted class per row.

    Args:
        model: A torch module that returns logits when called with one batch tensor.
        dataset: Map-style dataset whose items are dicts containing `input_key`.
        input_key: Key of the feature tensor inside each batch dict.
        device: torch.device the model and each batch are moved to.
        batch_size: Rows per forward pass.

    Returns:
        List with one predicted class index (argmax over dim 1) per dataset row.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

    # Inference mode: disables dropout and makes batch norm use running statistics.
    model.eval()
    model.to(device)

    predicted = []
    with torch.no_grad():
        for batch in loader:
            logits = model(batch[input_key].to(device))
            # Move back to the CPU before converting to NumPy.
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
    return predicted


# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dense
predictions_dense = _predict_labels(custom_model1, denseData, 'input_ids', device)

# CNN
predictions_cnn = _predict_labels(custom_model2, cnnData, 'x', device)

# LSTM
# NOTE(review): RNNModel forwards each 2-D batch straight into nn.LSTM, which
# appears to treat it as one unbatched sequence; if so, these predictions depend
# on batch_size and row order - confirm against how the model was trained.
predictions_lstm = _predict_labels(custom_model3, denseData, 'input_ids', device)

# Random forest (scikit-learn style estimator loaded from pickle)
predictions_randomforest = custom_model4.predict(loaded_dev_x)

In [23]:
'''Loading data'''

import pandas as pd,os
from imblearn.under_sampling import RandomUnderSampler

df = pd.read_json(os.path.join(os.getcwd(), 'datasets', 'subtaskA_train_monolingual.jsonl'), lines=True)
df = df[['text', 'label']]

# Pick the same row positions that were sampled from the GloVe feature matrix.
test_df = df.iloc[random_indices]

# The selection above is only valid if the JSONL rows are in the same order as
# the GloVe feature rows. Fail loudly instead of silently scoring the custom
# models against the wrong labels.
if not np.array_equal(test_df['label'].to_numpy(), np.asarray(loaded_dev_y)):
    raise ValueError(
        "Labels of the sampled JSONL rows do not match the sampled GloVe labels; "
        "the two datasets are not row-aligned."
    )

SAMPLES_TO_TRAIN = test_df.shape[0]

# Print the class distribution of the sample. No re-balancing is performed
# here, despite the heading below.
print("\nBalanced DataFrame:")
print(test_df['label'].value_counts())


Balanced DataFrame:
label
0    10587
1     9413
Name: count, dtype: int64


In [28]:
'''Getting predictions from transfer models'''

from tqdm import tqdm

test_texts = test_df['text'].tolist()


def _classify_texts(pipe, texts, model_name):
    """Run `pipe` on every text (one call per text) and return the raw results.

    Inputs are truncated to 256 tokens. `model_name` is only used for the
    progress-bar description.
    """
    return [
        pipe(text, truncation=True, max_length=256)
        for text in tqdm(texts, desc=f"Processing with {model_name}")
    ]


def _labels_and_scores(results, label_name, value_if_match):
    """Flatten pipeline results into parallel `(labels, scores)` lists.

    Args:
        results: Iterable of per-text pipeline outputs (each an iterable of
            dicts with 'label' and 'score').
        label_name: Pipeline label string to compare against.
        value_if_match: 0 or 1 assigned when the label equals `label_name`;
            every other label gets the opposite value.

    Returns:
        Tuple `(labels, scores)`; `scores` holds each item's 'score' field
        (presumably the confidence of the returned label - TODO confirm).
    """
    labels_out, scores_out = [], []
    for per_text in results:
        for item in per_text:
            matched = item['label'] == label_name
            labels_out.append(value_if_match if matched else 1 - value_if_match)
            scores_out.append(item['score'])
    return labels_out, scores_out


results1 = _classify_texts(pipe1, test_texts, MODEL1)
results2 = _classify_texts(pipe2, test_texts, MODEL2)
results3 = _classify_texts(pipe3, test_texts, MODEL3)
results4 = _classify_texts(pipe4, test_texts, MODEL4)
results5 = _classify_texts(pipe5, test_texts, MODEL5)
results6 = _classify_texts(pipe6, test_texts, MODEL6)

# Models 1-4 emit generic LABEL_0 / LABEL_1 names.
labels1, scores1 = _labels_and_scores(results1, 'LABEL_0', 0)
labels2, scores2 = _labels_and_scores(results2, 'LABEL_0', 0)
labels3, scores3 = _labels_and_scores(results3, 'LABEL_0', 0)
labels4, scores4 = _labels_and_scores(results4, 'LABEL_0', 0)

# Models 5 and 6 use custom label names: 'Human' maps to 0 for model 5 and
# 'Real' maps to 1 for model 6.
# NOTE(review): presumably these checkpoints keep their base model's label
# names while the class indices follow this dataset's labels - confirm.
labels5, scores5 = _labels_and_scores(results5, 'Human', 0)
labels6, scores6 = _labels_and_scores(results6, 'Real', 1)


In [25]:
'''Get metrics'''
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

def getMetrics(predicted_labels, true_labels, predicted_scores=None):
    """Compute binary-classification metrics for one set of predictions.

    Args:
        predicted_labels: Sequence of predicted class labels (0/1).
        true_labels: Sequence of ground-truth class labels (0/1).
        predicted_scores: Optional sequence of scores for the positive class.
            When given, ROC AUC is computed from these scores. When omitted,
            AUC falls back to the hard labels (the original behaviour), which
            evaluates the ROC curve at a single operating point only.

    Returns:
        Dict with 'accuracy', 'f1', 'precision', 'recall', 'auc' and
        'confusion_matrix' (nested list, rows = true class, columns = predicted).

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when the inputs have
            different lengths or `true_labels` contains a single class (AUC
            is undefined in that case).
    """
    # Ensure the labels are numpy arrays
    predicted_labels = np.asarray(predicted_labels)
    true_labels = np.asarray(true_labels)

    # AUC prefers continuous scores; use the hard labels only as a fallback.
    auc_input = predicted_labels if predicted_scores is None else np.asarray(predicted_scores)

    cm = confusion_matrix(true_labels, predicted_labels)

    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'f1': f1_score(true_labels, predicted_labels, average='binary'),
        'precision': precision_score(true_labels, predicted_labels, average='binary'),
        'recall': recall_score(true_labels, predicted_labels, average='binary'),
        'auc': roc_auc_score(true_labels, auc_input),
        'confusion_matrix': cm.tolist(),  # Plain nested list so the result is JSON-serializable
    }

In [26]:
# Report every custom model against the labels of the sampled text rows.
_custom_truth = test_df['label'].tolist()
_custom_runs = (
    (CUSTOM_MODEL_NAME1, predictions_dense),
    (CUSTOM_MODEL_NAME2, predictions_cnn),
    (CUSTOM_MODEL_NAME3, predictions_lstm),
    (CUSTOM_MODEL_NAME4, predictions_randomforest),
)
for _custom_title, _custom_predicted in _custom_runs:
    print(_custom_title)
    print(getMetrics(_custom_predicted, _custom_truth))

dense
{'accuracy': 0.9052, 'f1': 0.8964952505732067, 'precision': 0.9220662549129702, 'recall': 0.8723042600658664, 'auc': 0.9033760839386666, 'confusion_matrix': [[9893, 694], [1202, 8211]]}
cnn
{'accuracy': 0.8907, 'f1': 0.8807419530823786, 'precision': 0.9052371873948637, 'recall': 0.8575374482099224, 'auc': 0.8888612904599248, 'confusion_matrix': [[9742, 845], [1341, 8072]]}
lstm
{'accuracy': 0.8852, 'f1': 0.8754205100379816, 'precision': 0.8946434512587335, 'recall': 0.8570062679273346, 'auc': 0.8836367884455791, 'confusion_matrix': [[9637, 950], [1346, 8067]]}
random_forest
{'accuracy': 0.9589, 'f1': 0.9556873315363882, 'precision': 0.9701214840757361, 'recall': 0.9416764049718475, 'auc': 0.9579450316159889, 'confusion_matrix': [[10314, 273], [549, 8864]]}


In [29]:
# Report every fine-tuned transformer against the labels of the sampled text rows.
_transfer_truth = test_df['label'].tolist()
_transfer_runs = (
    (MODEL1, labels1),
    (MODEL2, labels2),
    (MODEL3, labels3),
    (MODEL4, labels4),
    (MODEL5, labels5),
    (MODEL6, labels6),
)
for _transfer_title, _transfer_predicted in _transfer_runs:
    print(_transfer_title)
    print(getMetrics(_transfer_predicted, _transfer_truth))

bert-base-uncased
{'accuracy': 0.95495, 'f1': 0.9521330287414332, 'precision': 0.9522848034006376, 'recall': 0.9519813024540529, 'auc': 0.9547853995032143, 'confusion_matrix': [[10138, 449], [452, 8961]]}
microsoft/deberta-large
{'accuracy': 0.9884, 'f1': 0.9877559636900992, 'precision': 0.9814368117461982, 'recall': 0.994157016891533, 'auc': 0.9887191998597648, 'confusion_matrix': [[10410, 177], [55, 9358]]}
roberta-base
{'accuracy': 0.9909, 'f1': 0.9903744446795009, 'precision': 0.9860979462875198, 'recall': 0.9946881971741209, 'auc': 0.9911100379466525, 'confusion_matrix': [[10455, 132], [50, 9363]]}
roberta-large
{'accuracy': 0.92785, 'f1': 0.9273889196397122, 'precision': 0.8809751434034416, 'recall': 0.9789652608095187, 'auc': 0.9306841039100018, 'confusion_matrix': [[9342, 1245], [198, 9215]]}
Hello-SimpleAI/chatgpt-detector-roberta
{'accuracy': 0.9785, 'f1': 0.9773708030733607, 'precision': 0.9684012931483992, 'recall': 0.9865080208222671, 'auc': 0.9789440075774697, 'confusion_

In [33]:
'''Complex Ensemble models'''

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Random Forest stacking ensemble.
# Feature table: predicted label and score of every transformer, followed by
# the hard predictions of the four custom models (column order matters to the
# fitted forest, so it is kept as Labels/Scores pairs first, custom models last).
_feature_columns = {}
for _model_name, _model_labels, _model_scores in (
    (MODEL1, labels1, scores1),
    (MODEL2, labels2, scores2),
    (MODEL3, labels3, scores3),
    (MODEL4, labels4, scores4),
    (MODEL5, labels5, scores5),
    (MODEL6, labels6, scores6),
):
    _feature_columns[f'Labels_{_model_name}'] = _model_labels
    _feature_columns[f'Scores_{_model_name}'] = _model_scores

_feature_columns['Prediction_dense'] = predictions_dense
_feature_columns['Prediction_cnn'] = predictions_cnn
_feature_columns['Prediction_lstm'] = predictions_lstm
_feature_columns['Prediction_randomforest'] = predictions_randomforest

df = pd.DataFrame(_feature_columns)

labels = test_df['label'].tolist()

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df, labels, test_size=0.2, random_state=42
)

print(X_train.shape)
print(X_test.shape)

# Fit the meta-classifier on the training portion only.
_forest_params = dict(
    n_estimators=350,
    random_state=42,
    min_samples_split=3,
    min_samples_leaf=1,
    max_depth=None,
)
clf = RandomForestClassifier(**_forest_params)
clf.fit(X_train, y_train)

# Score the held-out portion.
y_pred = clf.predict(X_test)

print(getMetrics(y_pred, y_test))

(16000, 16)
(4000, 16)
{'accuracy': 0.997, 'f1': 0.9968102073365231, 'precision': 0.9973404255319149, 'recall': 0.9962805526036131, 'auc': 0.9969599174727225, 'confusion_matrix': [[2113, 5], [7, 1875]]}


In [34]:
# Predict on every row of `df`. This includes the rows the forest was fitted
# on, so these metrics are not a held-out estimate.
finalPrediction = clf.predict(df)
_all_rows_metrics = getMetrics(finalPrediction, labels)
print(_all_rows_metrics)

{'accuracy': 0.9994, 'f1': 0.9993625159371017, 'precision': 0.9994687068324302, 'recall': 0.9992563476043769, 'auc': 0.9993920351415669, 'confusion_matrix': [[10582, 5], [7, 9406]]}


In [35]:
# Persist the fitted ensemble classifier with pickle.
_ensemble_path = 'SavedModels/' + 'ensemble_randomforest_train.pkl'
with open(_ensemble_path, 'wb') as model_file:
    pickle.dump(clf, model_file)