In [1]:
import numpy as np
import torch
from statistics import mode
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''Variables and parameters transfer learning'''

MODEL1='bert-base-uncased'
MODEL2='microsoft/deberta-large'
MODEL3='roberta-base'
MODEL4='roberta-large'
MODEL5='Hello-SimpleAI/chatgpt-detector-roberta'
MODEL6='roberta-base-openai-detector'

MODEL_PATH1='SavedModels/bert-base-uncased5k'
MODEL_PATH2='SavedModels/deberta-large5k'
MODEL_PATH3='SavedModels/roberta-base5k'
MODEL_PATH4='SavedModels/roberta-large5k'
MODEL_PATH5='SavedModels/chatgpt-detector-roberta5k'
MODEL_PATH6='SavedModels/roberta-base-openai-detector5k'

In [3]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [4]:
'''Load tokenizers and models'''

tokenizer1 = AutoTokenizer.from_pretrained(MODEL1)
model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH1)

tokenizer2 = AutoTokenizer.from_pretrained(MODEL2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH2)

tokenizer3 = AutoTokenizer.from_pretrained(MODEL3)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH3)

tokenizer4 = AutoTokenizer.from_pretrained(MODEL4)
model4 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH4)

tokenizer5 = AutoTokenizer.from_pretrained(MODEL5)
model5 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH5)

tokenizer6 = AutoTokenizer.from_pretrained(MODEL6)
model6 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH6)

pipe1 = pipeline("text-classification", model=model1, tokenizer=tokenizer1, device=0)
pipe2 = pipeline("text-classification", model=model2, tokenizer=tokenizer2, device=0)
pipe3 = pipeline("text-classification", model=model3, tokenizer=tokenizer3, device=0)
pipe4 = pipeline("text-classification", model=model4, tokenizer=tokenizer4, device=0)
pipe5 = pipeline("text-classification", model=model5, tokenizer=tokenizer5, device=0)
pipe6 = pipeline("text-classification", model=model6, tokenizer=tokenizer6, device=0)

In [24]:

import torch
import torch.nn as nn
import torch.nn.functional as F
input_dim = 200

# number of classes (unique of y)
output_dim = 2

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.linear1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)
        
        self.linear2 = nn.Linear(512, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.5)
        
        self.linear3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.5)
        
        self.linear4 = nn.Linear(256, output_dim)
        
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        x1 = F.leaky_relu(self.linear1(input_ids))
        x1 = self.bn1(x1)
        x1 = self.dropout1(x1)
        
        x2 = F.leaky_relu(self.linear2(x1))
        x2 = self.bn2(x2)
        x2 = self.dropout2(x2)
        
        # Adding the first skip connection
        x2 += x1
        
        x3 = F.leaky_relu(self.linear3(x2))
        x3 = self.bn3(x3)
        x3 = self.dropout3(x3)
        
        x4 = self.linear4(x3)
        
        outputs = (x4,)
        if labels is not None:
            loss = self.loss(x4, labels)
            outputs = (loss,) + outputs
            
        return (outputs if len(outputs) > 1 else outputs[0])
class CNN1D(nn.Module):
    def __init__(self):
        super(CNN1D, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=100, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(100)
        self.conv2 = nn.Conv1d(in_channels=100, out_channels=150, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(150)
        self.dropout1 = nn.Dropout(0.5)
        
        self.fc1 = nn.Linear(150 * 200, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, output_dim)

    def forward(self, x, labels=None):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        # Flatten the output for the dense layer
        x = torch.flatten(x, 1) 
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(x, labels)
            return loss, x
        
        return x
class RNNModel(nn.Module):
    def __init__(self):
        super(RNNModel, self).__init__()
        
        self.lstm1 = nn.LSTM(input_dim, 512, batch_first=True)
        self.ln1 = nn.LayerNorm(512)
        self.dropout1 = nn.Dropout(0.2)
        
        self.lstm2 = nn.LSTM(512, 512, batch_first=True)
        self.ln2 = nn.LayerNorm(512)
        self.dropout2 = nn.Dropout(0.2)
        
        self.fc = nn.Linear(512, output_dim)
        
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        # print(f"Input shape: {input_ids.shape}")
        
        x, _ = self.lstm1(input_ids)
        # print(f"After LSTM1: {x.shape}")

        x = self.ln1(x)
        x = self.dropout1(x)
        
        # print(f"Before LSTM2: {x.shape}")
        
        x, _ = self.lstm2(x)
        # print(f"After LSTM2: {x.shape}")

        x = self.ln2(x)
        x = self.dropout2(x)
        
        x = self.fc(x)
        # print(f"Output shape: {x.shape}")
        
        outputs = (x,)
        if labels is not None:
            loss = self.loss(x, labels)
            outputs = (loss,) + outputs
            
        return (outputs if len(outputs) > 1 else outputs[0])


In [25]:
'''Custom model preparation'''

CUSTOM_MODEL_NAME1='dense'
CUSTOM_MODEL_NAME2='cnn'
CUSTOM_MODEL_NAME3='lstm'

CUSTOM_MODEL_NAME_PATH_1='./SavedModels/dense_0k'
CUSTOM_MODEL_NAME_PATH_2='./SavedModels/cnn_0k'
CUSTOM_MODEL_NAME_PATH_3='./SavedModels/lstm_0k'

custom_model1 = Network()
custom_model1.load_state_dict(torch.load(CUSTOM_MODEL_NAME_PATH_1+"/pytorch_model.bin"))

custom_model2 = CNN1D()
custom_model2.load_state_dict(torch.load(CUSTOM_MODEL_NAME_PATH_2+"/pytorch_model.bin"))

custom_model3 = RNNModel()
custom_model3.load_state_dict(torch.load(CUSTOM_MODEL_NAME_PATH_3+"/pytorch_model.bin"))


<All keys matched successfully>

In [53]:
with open('datasets/subtaskA_glove_train_dev_monolingual.pkl', 'rb') as f:
    loaded_datasets = pickle.load(f)

# Accessing loaded datasets
loaded_train_x = loaded_datasets['train_x']
loaded_train_y = loaded_datasets['train_y']
loaded_dev_x = loaded_datasets['dev_x']
loaded_dev_y = loaded_datasets['dev_y']

print(loaded_dev_x.shape)
print(loaded_dev_y.shape)

class Data(Dataset):
    def __init__(self, X_train, y_train):
        self.X = torch.from_numpy(X_train.astype(np.float32))
        self.y = torch.from_numpy(y_train).type(torch.LongTensor)
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        return {'input_ids': self.X[index], 'labels': self.y[index]}

    def __len__(self):
        return self.len

class DataCnn(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(1)
        self.y = torch.tensor(y, dtype=torch.long)
        self.len = len(X)

    def __getitem__(self, index):
        return {'x': self.X[index], 'label': self.y[index], 'label_ids': index}

    def __len__(self):
        return self.len

denseData=Data(loaded_dev_x, loaded_dev_y.values)
cnnData=DataCnn(loaded_dev_x, loaded_dev_y.values)


loaded_dev_y=loaded_dev_y.values

(5000, 200)
(5000,)


In [48]:
from torch.utils.data import DataLoader
import torch

# Create a DataLoader
data_loader = DataLoader(dataset=denseData, batch_size=32, shuffle=False)  # Adjust batch size as needed

# Put the model in evaluation mode
custom_model1.eval()

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the appropriate device
custom_model1.to(device)

# Container to store predictions
all_predictions = []

# Iterate over the DataLoader to get predictions
with torch.no_grad():
    for batch in data_loader:
        input_data = batch['input_ids'].to(device)  # Moving data to device
        labels = batch['labels'].to(device)  # This line is not necessary if you're only doing predictions, but move to device if you use them

        # Forward pass
        outputs = custom_model1(input_data)

        # Get the predicted labels (assuming a classification task here)
        _, preds = torch.max(outputs, dim=1)

        # Store predictions
        all_predictions.extend(preds.cpu().numpy())  # Moving predictions back to cpu before converting to numpy

predictions_dense=all_predictions


[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 

In [55]:
custom_model2 = custom_model2.to(device)
# Create DataLoader for the CNN data
cnn_data_loader = DataLoader(dataset=cnnData, batch_size=32, shuffle=False)  # Adjust batch size as needed

# Put the model in evaluation mode
custom_model2.eval()

# Container to store predictions
all_cnn_predictions = []

# Iterate over the DataLoader to get predictions
with torch.no_grad():
    for batch in cnn_data_loader:
        input_data = batch['x'].to(device)  # Moving data to device
        labels = batch['label'].to(device)  # Move to device if you use them, not necessary for only predictions

        # Forward pass
        outputs = custom_model2(input_data)

        # Get the predicted labels (assuming a classification task here)
        _, preds = torch.max(outputs, dim=1)

        # Store predictions
        all_cnn_predictions.extend(preds.cpu().numpy())  # Moving predictions back to CPU before converting to numpy
predictions_cnn=all_cnn_predictions

[1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 

In [36]:
with torch.no_grad():  # Disable gradient calculation
    predictions = custom_model1(torch.tensor(loaded_dev_x).float())


In [38]:

probabilities = torch.sigmoid(predictions)
print(probabilities)
print((probabilities > 0.5).int())

tensor([[0.1827, 0.7790],
        [0.3337, 0.8130],
        [0.6075, 0.3250],
        ...,
        [0.5566, 0.7226],
        [0.8483, 0.0942],
        [0.7429, 0.2177]])
tensor([[0, 1],
        [0, 1],
        [1, 0],
        ...,
        [1, 1],
        [1, 0],
        [1, 0]], dtype=torch.int32)


In [29]:
'''Loading data'''

import pandas as pd,os
from imblearn.under_sampling import RandomUnderSampler

SAMPLES_TO_TRAIN=5000

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

test_df=df

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(test_df['label'].value_counts())


Balanced DataFrame:
label
1    2500
0    2500
Name: count, dtype: int64


In [6]:
'''Getting predictions from transfer models'''

from tqdm import tqdm

test_texts = test_df['text'].tolist()

results1 = [pipe1(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe1")]
results2 = [pipe2(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe2")]
results3 = [pipe3(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe3")]
results4 = [pipe4(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe4")]
results5 = [pipe5(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe5")]
results6 = [pipe6(text, truncation=True, max_length=256) for text in tqdm(test_texts, desc="Processing with pipe6")]

labels1 = [0 if item['label'] == 'LABEL_0' else 1 for d in results1 for item in d]
scores1 = [item['score'] for d in results1 for item in d]

labels2 = [0 if item['label'] == 'LABEL_0' else 1 for d in results2 for item in d]
scores2 = [item['score'] for d in results2 for item in d]

labels3 = [0 if item['label'] == 'LABEL_0' else 1 for d in results3 for item in d]
scores3 = [item['score'] for d in results3 for item in d]

labels4 = [0 if item['label'] == 'LABEL_0' else 1 for d in results4 for item in d]
scores4 = [item['score'] for d in results4 for item in d]

labels5 = [0 if item['label'] == 'LABEL_0' else 1 for d in results5 for item in d]
scores5 = [item['score'] for d in results5 for item in d]

labels6 = [0 if item['label'] == 'LABEL_0' else 1 for d in results6 for item in d]
scores6 = [item['score'] for d in results6 for item in d]


Processing with pipe1: 100%|██████████| 5000/5000 [00:44<00:00, 112.12it/s]
Processing with pipe2: 100%|██████████| 5000/5000 [02:10<00:00, 38.18it/s]
Processing with pipe3: 100%|██████████| 5000/5000 [00:44<00:00, 112.38it/s]
Processing with pipe4: 100%|██████████| 5000/5000 [01:21<00:00, 61.16it/s]
Processing with pipe5: 100%|██████████| 5000/5000 [00:43<00:00, 114.20it/s]
Processing with pipe6: 100%|██████████| 5000/5000 [00:44<00:00, 113.26it/s]


In [7]:
'''Get metrics'''
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def getMetrics(predicted_labels, true_labels):
    # Ensure the labels are numpy arrays
    predicted_labels = np.array(predicted_labels)
    true_labels = np.array(true_labels)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='binary')
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')
    auc = roc_auc_score(true_labels, predicted_labels)

    # Create a dictionary of metrics
    metrics = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    }

    return metrics

In [49]:
print(getMetrics(all_predictions,test_df['label'].tolist()))

{'accuracy': 0.5512, 'f1': 0.37423312883435583, 'precision': 0.6178637200736649, 'recall': 0.2684, 'auc': 0.5511999999999999}


In [8]:
print(MODEL1)
print(getMetrics(labels1,test_df['label'].tolist()))
print(MODEL2)
print(getMetrics(labels2,test_df['label'].tolist()))
print(MODEL3)
print(getMetrics(labels3,test_df['label'].tolist()))
print(MODEL4)
print(getMetrics(labels4,test_df['label'].tolist()))
print(MODEL5)
print(getMetrics(labels5,test_df['label'].tolist()))
print(MODEL6)
print(getMetrics(labels6,test_df['label'].tolist()))

bert-base-uncased
{'accuracy': 0.732, 'f1': 0.6903881700554528, 'precision': 0.8172866520787746, 'recall': 0.5976, 'auc': 0.732}
microsoft/deberta-large
{'accuracy': 0.6868, 'f1': 0.570958904109589, 'precision': 0.9060869565217391, 'recall': 0.4168, 'auc': 0.6868000000000001}
roberta-base
{'accuracy': 0.7944, 'f1': 0.7503642544924721, 'precision': 0.9548825710754018, 'recall': 0.618, 'auc': 0.7943999999999999}
roberta-large
{'accuracy': 0.839, 'f1': 0.8466958674538183, 'precision': 0.8080697928026173, 'recall': 0.8892, 'auc': 0.839}
Hello-SimpleAI/chatgpt-detector-roberta
{'accuracy': 0.5, 'f1': 0.6666666666666666, 'precision': 0.5, 'recall': 1.0, 'auc': 0.5}
roberta-base-openai-detector
{'accuracy': 0.5, 'f1': 0.6666666666666666, 'precision': 0.5, 'recall': 1.0, 'auc': 0.5}


In [14]:
'''Complex Ensemble models'''

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

'''Random Forest'''
# Create a DataFrame
df = pd.DataFrame({
    'Labels_Model1': labels1,
    'Scores_Model1': scores1,
    'Labels_Model2': labels2,
    'Scores_Model2': scores2,
    'Labels_Model3': labels3,
    'Scores_Model3': scores3,
    'Labels_Model4': labels4,
    'Scores_Model4': scores4,
    'Labels_Model5': labels5,
    'Scores_Model5': scores5,
    'Labels_Model6': labels6,
    'Scores_Model6': scores6,
})

labels = test_df['label'].tolist()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

# Create a Random Forest Classifier and train it on the training data
clf = RandomForestClassifier(n_estimators=300, random_state=42,min_samples_split=3,min_samples_leaf=1,max_depth=None)
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

print(getMetrics(y_pred,y_test))

(4000, 12)
(1000, 12)
{'accuracy': 0.902, 'f1': 0.902970297029703, 'precision': 0.8941176470588236, 'recall': 0.912, 'auc': 0.9019999999999999}


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [250,275,300,325,350],
    'max_depth': [None],
    'min_samples_split': [3],
    'min_samples_leaf': [1]
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_search.best_params_)

# Train and predict using the best model
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)

# before optimizing
# 'accuracy': 0.899, 'f1': 0.8999008919722498
# after
# 'accuracy': 0.902, 'f1': 0.902970297029703

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 300}
