In [1]:
import numpy as np
import torch
from statistics import mode
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL1='bert-base-uncased'
MODEL2='microsoft/deberta-large'
MODEL3='roberta-base'
MODEL4='roberta-large'
MODEL5='Hello-SimpleAI/chatgpt-detector-roberta'
MODEL6='roberta-base-openai-detector'

MODEL7='Hello-SimpleAI/chatgpt-detector-roberta'
MODEL8='Hello-SimpleAI/chatgpt-detector-roberta'

MODEL_PATH1='SavedModels/bert-base-uncased20k'
MODEL_PATH2='SavedModels/deberta-large5k'
MODEL_PATH3='SavedModels/roberta-base20k'
MODEL_PATH4='SavedModels/roberta-large5k'
MODEL_PATH5='SavedModels/chatgpt-detector-roberta5k'
MODEL_PATH6='SavedModels/roberta-base-openai-detector20k'

In [3]:
'''Load tokenizers and models'''

tokenizer1 = AutoTokenizer.from_pretrained(MODEL1)
model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH1)

tokenizer2 = AutoTokenizer.from_pretrained(MODEL2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH2)

tokenizer3 = AutoTokenizer.from_pretrained(MODEL3)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH3)

tokenizer4 = AutoTokenizer.from_pretrained(MODEL4)
model4 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH4)

tokenizer5 = AutoTokenizer.from_pretrained(MODEL5)
model5 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH5)

tokenizer6 = AutoTokenizer.from_pretrained(MODEL6)
model6 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH6)

pipe1 = pipeline("text-classification", model=model1, tokenizer=tokenizer1, device=0)
pipe2 = pipeline("text-classification", model=model2, tokenizer=tokenizer2, device=0)
pipe3 = pipeline("text-classification", model=model3, tokenizer=tokenizer3, device=0)
pipe4 = pipeline("text-classification", model=model4, tokenizer=tokenizer4, device=0)
pipe5 = pipeline("text-classification", model=model5, tokenizer=tokenizer5, device=0)
pipe6 = pipeline("text-classification", model=model6, tokenizer=tokenizer6, device=0)

In [4]:
'''Custom model architectures'''
import torch
import torch.nn as nn
import torch.nn.functional as F
input_dim = 200

# number of classes (unique of y)
output_dim = 2

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.linear1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)
        
        self.linear2 = nn.Linear(512, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.5)
        
        self.linear3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.5)
        
        self.linear4 = nn.Linear(256, output_dim)
        
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        x1 = F.leaky_relu(self.linear1(input_ids))
        x1 = self.bn1(x1)
        x1 = self.dropout1(x1)
        
        x2 = F.leaky_relu(self.linear2(x1))
        x2 = self.bn2(x2)
        x2 = self.dropout2(x2)
        
        # Adding the first skip connection
        x2 += x1
        
        x3 = F.leaky_relu(self.linear3(x2))
        x3 = self.bn3(x3)
        x3 = self.dropout3(x3)
        
        x4 = self.linear4(x3)
        
        outputs = (x4,)
        if labels is not None:
            loss = self.loss(x4, labels)
            outputs = (loss,) + outputs
            
        return (outputs if len(outputs) > 1 else outputs[0])
class CNN1D(nn.Module):
    def __init__(self):
        super(CNN1D, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=100, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(100)
        self.conv2 = nn.Conv1d(in_channels=100, out_channels=150, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(150)
        self.dropout1 = nn.Dropout(0.5)
        
        self.fc1 = nn.Linear(150 * 200, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, output_dim)

    def forward(self, x, labels=None):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        # Flatten the output for the dense layer
        x = torch.flatten(x, 1) 
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(x, labels)
            return loss, x
        
        return x
class RNNModel(nn.Module):
    def __init__(self):
        super(RNNModel, self).__init__()
        
        self.lstm1 = nn.LSTM(input_dim, 512, batch_first=True)
        self.ln1 = nn.LayerNorm(512)
        self.dropout1 = nn.Dropout(0.2)
        
        self.lstm2 = nn.LSTM(512, 512, batch_first=True)
        self.ln2 = nn.LayerNorm(512)
        self.dropout2 = nn.Dropout(0.2)
        
        self.fc = nn.Linear(512, output_dim)
        
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        # print(f"Input shape: {input_ids.shape}")
        
        x, _ = self.lstm1(input_ids)
        # print(f"After LSTM1: {x.shape}")

        x = self.ln1(x)
        x = self.dropout1(x)
        
        # print(f"Before LSTM2: {x.shape}")
        
        x, _ = self.lstm2(x)
        # print(f"After LSTM2: {x.shape}")

        x = self.ln2(x)
        x = self.dropout2(x)
        
        x = self.fc(x)
        # print(f"Output shape: {x.shape}")
        
        outputs = (x,)
        if labels is not None:
            loss = self.loss(x, labels)
            outputs = (loss,) + outputs
            
        return (outputs if len(outputs) > 1 else outputs[0])


In [5]:
'''Custom model preparation'''

CUSTOM_MODEL_NAME1='dense'
CUSTOM_MODEL_NAME2='cnn'
CUSTOM_MODEL_NAME3='lstm'
CUSTOM_MODEL_NAME4='random_forest'

CUSTOM_MODEL_NAME_PATH_1='./SavedModels/dense_0k'
CUSTOM_MODEL_NAME_PATH_2='./SavedModels/cnn_0k'
CUSTOM_MODEL_NAME_PATH_3='./SavedModels/lstm_0k'
CUSTOM_MODEL_NAME_PATH_4='./SavedModels/randomforest_0k.pkl'

custom_model1 = Network()
custom_model1.load_state_dict(torch.load(CUSTOM_MODEL_NAME_PATH_1+"/pytorch_model.bin"))

custom_model2 = CNN1D()
custom_model2.load_state_dict(torch.load(CUSTOM_MODEL_NAME_PATH_2+"/pytorch_model.bin"))

custom_model3 = RNNModel()
custom_model3.load_state_dict(torch.load(CUSTOM_MODEL_NAME_PATH_3+"/pytorch_model.bin"))

with open(CUSTOM_MODEL_NAME_PATH_4, 'rb') as file:
    custom_model4 = pickle.load(file)

In [6]:
'''Preparing custom data'''

with open('datasets/subtaskA_glove_train_dev_monolingual.pkl', 'rb') as f:
    loaded_datasets = pickle.load(f)

# Accessing loaded datasets
loaded_train_x = loaded_datasets['train_x']
loaded_train_y = loaded_datasets['train_y']
loaded_dev_x = loaded_datasets['dev_x']
loaded_dev_y = loaded_datasets['dev_y']

print(loaded_dev_x.shape)
print(loaded_dev_y.shape)

class Data(Dataset):
    def __init__(self, X_train, y_train):
        self.X = torch.from_numpy(X_train.astype(np.float32))
        self.y = torch.from_numpy(y_train).type(torch.LongTensor)
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        return {'input_ids': self.X[index], 'labels': self.y[index]}

    def __len__(self):
        return self.len

class DataCnn(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(1)
        self.y = torch.tensor(y, dtype=torch.long)
        self.len = len(X)

    def __getitem__(self, index):
        return {'x': self.X[index], 'label': self.y[index], 'label_ids': index}

    def __len__(self):
        return self.len

denseData=Data(loaded_dev_x, loaded_dev_y.values)
cnnData=DataCnn(loaded_dev_x, loaded_dev_y.values)

loaded_dev_y=loaded_dev_y.values

(5000, 200)
(5000,)


In [7]:
'''Getting predictions from custom models'''

from torch.utils.data import DataLoader
import torch

'''Dense'''

# Create a DataLoader
data_loader = DataLoader(dataset=denseData, batch_size=32, shuffle=False)  # Adjust batch size as needed

# Put the model in evaluation mode
custom_model1.eval()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the appropriate device
custom_model1.to(device)

# Container to store predictions
all_predictions = []

# Iterate over the DataLoader to get predictions
with torch.no_grad():
    for batch in data_loader:
        input_data = batch['input_ids'].to(device)  # Moving data to device
        labels = batch['labels'].to(device)  # This line is not necessary if you're only doing predictions, but move to device if you use them

        # Forward pass
        outputs = custom_model1(input_data)

        # Get the predicted labels (assuming a classification task here)
        _, preds = torch.max(outputs, dim=1)

        # Store predictions
        all_predictions.extend(preds.cpu().numpy())  # Moving predictions back to cpu before converting to numpy

predictions_dense=all_predictions

'''CNN'''
custom_model2 = custom_model2.to(device)
# Create DataLoader for the CNN data
cnn_data_loader = DataLoader(dataset=cnnData, batch_size=32, shuffle=False)  # Adjust batch size as needed

# Put the model in evaluation mode
custom_model2.eval()

# Container to store predictions
all_cnn_predictions = []

# Iterate over the DataLoader to get predictions
with torch.no_grad():
    for batch in cnn_data_loader:
        input_data = batch['x'].to(device)  # Moving data to device
        labels = batch['label'].to(device)  # Move to device if you use them, not necessary for only predictions

        # Forward pass
        outputs = custom_model2(input_data)

        # Get the predicted labels (assuming a classification task here)
        _, preds = torch.max(outputs, dim=1)

        # Store predictions
        all_cnn_predictions.extend(preds.cpu().numpy())  # Moving predictions back to CPU before converting to numpy
predictions_cnn=all_cnn_predictions

'''LSTM'''
# Create a DataLoader
data_loader = DataLoader(dataset=denseData, batch_size=32, shuffle=False)  # Adjust batch size as needed

# Put the model in evaluation mode
custom_model3.eval()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the appropriate device
custom_model3.to(device)

# Container to store predictions
all_predictions = []

# Iterate over the DataLoader to get predictions
with torch.no_grad():
    for batch in data_loader:
        input_data = batch['input_ids'].to(device)  # Moving data to device
        labels = batch['labels'].to(device)  # This line is not necessary if you're only doing predictions, but move to device if you use them

        # Forward pass
        outputs = custom_model3(input_data)

        # Get the predicted labels (assuming a classification task here)
        _, preds = torch.max(outputs, dim=1)

        # Store predictions
        all_predictions.extend(preds.cpu().numpy())  # Moving predictions back to cpu before converting to numpy

predictions_lstm=all_predictions

predictions_randomforest=custom_model4.predict(loaded_dev_x)

In [3]:
'''Loading data'''

import pandas as pd,os
from imblearn.under_sampling import RandomUnderSampler

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
print(df.info())
print(f'''\n{df['label'].value_counts()}''')
print(f'''\n{df['model'].value_counts()}''')
print(f'''\n{df['source'].value_counts()}''')
print(df.sample(5))

df = df[['text', 'label']]

test_df=df
SAMPLES_TO_TRAIN=test_df.shape[0]

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(test_df['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
 2   model   5000 non-null   object
 3   source  5000 non-null   object
 4   id      5000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 195.4+ KB
None

label
1    2500
0    2500
Name: count, dtype: int64

model
bloomz    2500
human     2500
Name: count, dtype: int64

source
wikihow      1000
wikipedia    1000
reddit       1000
arxiv        1000
peerread     1000
Name: count, dtype: int64
                                                   text  label   model  \
3322  We present an explicit expression for the effe...      1  bloomz   
3565    Dim radio-quiet neutron star (DRQNS) 1E 1207...      0   human   
4457  The authors present an interesting approach th...      1  bloomz   
2598  its not water that makes skin stay moist - 

In [10]:
'''Getting predictions from transfer models'''

from tqdm import tqdm

test_texts = test_df['text'].tolist()

results1 = [pipe1(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL1}")]
results2 = [pipe2(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL2}")]
results3 = [pipe3(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL3}")]
results4 = [pipe4(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL4}")]
results5 = [pipe5(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL5}")]
results6 = [pipe6(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc=f"Processing with {MODEL6}")]

labels1 = [0 if item['label'] == 'LABEL_0' else 1 for d in results1 for item in d]
scores1 = [item['score'] for d in results1 for item in d]

labels2 = [0 if item['label'] == 'LABEL_0' else 1 for d in results2 for item in d]
scores2 = [item['score'] for d in results2 for item in d]

labels3 = [0 if item['label'] == 'LABEL_0' else 1 for d in results3 for item in d]
scores3 = [item['score'] for d in results3 for item in d]

labels4 = [0 if item['label'] == 'LABEL_0' else 1 for d in results4 for item in d]
scores4 = [item['score'] for d in results4 for item in d]

labels5 = [0 if item['label'] == 'Human' else 1 for d in results5 for item in d]
scores5 = [item['score'] for d in results5 for item in d]

labels6 = [1 if item['label'] == 'Real' else 0 for d in results6 for item in d]
scores6 = [item['score'] for d in results6 for item in d]


Processing with bert-base-uncased: 100%|██████████| 5000/5000 [00:37<00:00, 134.68it/s]
Processing with microsoft/deberta-large: 100%|██████████| 5000/5000 [02:27<00:00, 33.94it/s]
Processing with roberta-base: 100%|██████████| 5000/5000 [00:39<00:00, 126.73it/s]
Processing with roberta-large: 100%|██████████| 5000/5000 [01:06<00:00, 75.34it/s]
Processing with Hello-SimpleAI/chatgpt-detector-roberta: 100%|██████████| 5000/5000 [00:39<00:00, 127.76it/s]
Processing with roberta-base-openai-detector: 100%|██████████| 5000/5000 [00:38<00:00, 130.41it/s]


In [11]:
'''Get metrics'''
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

def getMetrics(predicted_labels, true_labels):
    # Ensure the labels are numpy arrays
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='binary')
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')
    auc = roc_auc_score(true_labels, predicted_labels)
    cm = confusion_matrix(true_labels, predicted_labels)

    # Create a dictionary of metrics
    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc,
        'confusion_matrix': cm.tolist()  # Convert confusion matrix to a list for JSON serialization
    }

    return metrics

In [12]:
print(CUSTOM_MODEL_NAME1)
print(getMetrics(predictions_dense,test_df['label'].tolist()))
print(CUSTOM_MODEL_NAME2)
print(getMetrics(predictions_cnn,test_df['label'].tolist()))
print(CUSTOM_MODEL_NAME3)
print(getMetrics(predictions_lstm,test_df['label'].tolist()))
print(CUSTOM_MODEL_NAME4)
print(getMetrics(predictions_randomforest,test_df['label'].tolist()))

dense
{'accuracy': 0.556, 'f1': 0.36170212765957444, 'precision': 0.6431492842535788, 'recall': 0.2516, 'auc': 0.556, 'confusion_matrix': [[2151, 349], [1871, 629]]}
cnn
{'accuracy': 0.5634, 'f1': 0.3821115199547127, 'precision': 0.6534365924491772, 'recall': 0.27, 'auc': 0.5634, 'confusion_matrix': [[2142, 358], [1825, 675]]}
lstm
{'accuracy': 0.5528, 'f1': 0.36836158192090396, 'precision': 0.6269230769230769, 'recall': 0.2608, 'auc': 0.5528, 'confusion_matrix': [[2112, 388], [1848, 652]]}
random_forest
{'accuracy': 0.4932, 'f1': 0.28700056274620145, 'precision': 0.4838709677419355, 'recall': 0.204, 'auc': 0.49319999999999997, 'confusion_matrix': [[1956, 544], [1990, 510]]}


In [13]:
print(MODEL1)
print(getMetrics(labels1,test_df['label'].tolist()))
print(MODEL2)
print(getMetrics(labels2,test_df['label'].tolist()))
print(MODEL3)
print(getMetrics(labels3,test_df['label'].tolist()))
print(MODEL4)
print(getMetrics(labels4,test_df['label'].tolist()))
print(MODEL5)
print(getMetrics(labels5,test_df['label'].tolist()))
print(MODEL6)
print(getMetrics(labels6,test_df['label'].tolist()))

bert-base-uncased
{'accuracy': 0.6902, 'f1': 0.5852744310575636, 'precision': 0.8850202429149797, 'recall': 0.4372, 'auc': 0.6902, 'confusion_matrix': [[2358, 142], [1407, 1093]]}
microsoft/deberta-large
{'accuracy': 0.6868, 'f1': 0.570958904109589, 'precision': 0.9060869565217391, 'recall': 0.4168, 'auc': 0.6868000000000001, 'confusion_matrix': [[2392, 108], [1458, 1042]]}
roberta-base
{'accuracy': 0.6556, 'f1': 0.49412455934195065, 'precision': 0.9303097345132744, 'recall': 0.3364, 'auc': 0.6556, 'confusion_matrix': [[2437, 63], [1659, 841]]}
roberta-large
{'accuracy': 0.839, 'f1': 0.8466958674538183, 'precision': 0.8080697928026173, 'recall': 0.8892, 'auc': 0.839, 'confusion_matrix': [[1972, 528], [277, 2223]]}
Hello-SimpleAI/chatgpt-detector-roberta
{'accuracy': 0.6998, 'f1': 0.6106355382619973, 'precision': 0.8686346863468635, 'recall': 0.4708, 'auc': 0.6998, 'confusion_matrix': [[2322, 178], [1323, 1177]]}
roberta-base-openai-detector
{'accuracy': 0.636, 'f1': 0.44137507673419285

In [17]:
df = pd.DataFrame({
    f'Labels_{MODEL1}': labels1,
    f'Scores_{MODEL1}': scores1,
    f'Labels_{MODEL2}': labels2,
    f'Scores_{MODEL2}': scores2,
    f'Labels_{MODEL3}': labels3,
    f'Scores_{MODEL3}': scores3,
    f'Labels_{MODEL4}': labels4,
    f'Scores_{MODEL4}': scores4,
    f'Labels_{MODEL5}': labels5,
    f'Scores_{MODEL5}': scores5,
    f'Labels_{MODEL6}': labels6,
    f'Scores_{MODEL6}': scores6,
    'Prediction_dense':predictions_dense,
    'Prediction_cnn':predictions_cnn,
    'Prediction_lstm':predictions_lstm,
    'Prediction_randomforest':predictions_randomforest,
})

labels = test_df['label'].tolist()

with open('SavedModels/ensemble_randomforest_train.pkl', 'rb') as file:
        ensembleModel = pickle.load(file)

finalPrediction = ensembleModel.predict(df)

print(getMetrics(finalPrediction,labels))

{'accuracy': 0.6288, 'f1': 0.42431761786600497, 'precision': 0.9447513812154696, 'recall': 0.2736, 'auc': 0.6288, 'confusion_matrix': [[2460, 40], [1816, 684]]}


In [18]:
df

Unnamed: 0,Labels_bert-base-uncased,Scores_bert-base-uncased,Labels_microsoft/deberta-large,Scores_microsoft/deberta-large,Labels_roberta-base,Scores_roberta-base,Labels_roberta-large,Scores_roberta-large,Labels_Hello-SimpleAI/chatgpt-detector-roberta,Scores_Hello-SimpleAI/chatgpt-detector-roberta,Labels_roberta-base-openai-detector,Scores_roberta-base-openai-detector,Prediction_dense,Prediction_cnn,Prediction_lstm,Prediction_randomforest
0,1,0.986458,0,0.997726,0,0.999538,0,0.997903,1,0.916230,0,0.999977,1,1,1,0
1,1,0.990715,1,0.804012,1,0.991061,1,0.983358,1,0.749325,0,0.999977,0,0,0,0
2,1,0.990361,0,0.999279,0,0.998842,1,0.809746,1,0.996288,0,0.999977,0,0,0,0
3,0,0.947833,0,0.999961,0,0.997465,1,0.980866,0,0.998468,0,0.999977,0,1,0,0
4,1,0.766751,1,0.999913,1,0.999739,1,0.982552,0,0.998502,0,0.999977,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.996642,0,0.999983,0,0.999532,1,0.983148,0,0.998445,0,0.999977,0,0,0,0
4996,0,0.998859,0,0.999985,0,0.999540,0,0.997943,0,0.998435,0,0.999977,0,0,0,0
4997,0,0.996838,0,0.999982,0,0.999541,0,0.997883,0,0.998489,0,0.999977,0,0,0,0
4998,0,0.998399,0,0.999973,0,0.999525,1,0.982581,0,0.998454,0,0.999977,0,0,0,0


In [None]:
# majority voting
# mean nosecuanto
# 

In [24]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Assume df is your DataFrame and labels is your target array
X = df
y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base learners
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=350, random_state=42, min_samples_split=3, min_samples_leaf=1, max_depth=None)),
    ('svc', SVC(probability=True)),
    ('knn', KNeighborsClassifier())
]

# Initialize Stacking Classifier with the Meta Learner
stack_clf = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression(), cv=5)

# Fit the model on your data
stack_clf.fit(X_train, y_train)

# Now you can use stack_clf to make new predictions


In [25]:
finalPrediction = stack_clf.predict(X_test)

print(getMetrics(finalPrediction,y_test))

{'accuracy': 0.891, 'f1': 0.8913260219341974, 'precision': 0.8696498054474708, 'recall': 0.9141104294478528, 'auc': 0.8914974847826347, 'confusion_matrix': [[444, 67], [42, 447]]}


In [26]:
from sklearn.ensemble import VotingClassifier

# Define individual classifiers
clf1 = RandomForestClassifier(n_estimators=350, random_state=42, min_samples_split=3, min_samples_leaf=1, max_depth=None)
clf2 = SVC(probability=True, random_state=42)
clf3 = KNeighborsClassifier()

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', clf1),
    ('svc', clf2),
    ('knn', clf3)
], voting='soft')

# Fit model
stack_clf.fit(X_train, y_train)

# Now you can use voting_clf to make new predictions


In [27]:
finalPrediction = stack_clf.predict(X_test)

print(getMetrics(finalPrediction,y_test))

{'accuracy': 0.89, 'f1': 0.8902195608782434, 'precision': 0.8693957115009746, 'recall': 0.9120654396728016, 'auc': 0.8904749898951092, 'confusion_matrix': [[444, 67], [43, 446]]}
