In [1]:
import numpy as np
import pickle
import random
import xgboost as xgb
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import catboost

In [2]:
def serialize(obj, path: str) -> None:
    """Pickle ``obj`` into the file at ``path``, replacing any existing file.

    Missing parent directories are created first, so saving into a folder
    that does not exist yet no longer fails with ``FileNotFoundError``.

    Args:
        obj: The Python object to pickle.
        path: Destination file path.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:  # a bare file name has no parent directory to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def deserialize(path: str):
    """Load and return the object pickled in the file at ``path``.

    The file is opened in binary mode and is closed again before the loaded
    object is handed back to the caller.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [3]:
# Seed used when seeding the random number generator in this notebook.
SEED = 42

# Identifier that is embedded in the input and output file names.
model_name = "electra"
# Folder the pickled embeddings file is read from.
source_folder_name = "embeddings"
# Folder the trained classifier is written to.
destination_folder_name = "model"

# SYSTEM ARGS
# Classifier backend: "cat" (CatBoost) or "xgb" (XGBoost).
algorithm = "cat"
# Path of the pickled embeddings file for `model_name`.
embeddingfile = f"{source_folder_name}/embeddings-{model_name}.pkl"

In [25]:
# Seed both Python's and NumPy's global random number generators so that any
# code relying on either of them is repeatable across runs.
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Load the pickled data from the configured embeddings file. Unpickling can run
# arbitrary code, so the file must come from a trusted source.
merged_data = deserialize(path=embeddingfile)

In [27]:
# Display the dimensions of the loaded data as (rows, columns).
merged_data.shape

(37126, 769)

In [28]:
# The label is stored in the last column. Deriving its index from the array
# width (instead of hard-coding 768) gives the same value for the 769-column
# data used here and keeps working for arrays of a different width.
# NOTE(review): assumes the label is always the last column; confirm for other
# embedding files.
target_attribute_index = merged_data.shape[1] - 1
y = merged_data[:, target_attribute_index]

In [29]:
# Feature matrix: a copy of the data with the label column removed.
X = np.delete(arr=merged_data, obj=target_attribute_index, axis=1)

In [30]:
# Hold out 20% of the rows for evaluation. The split uses the notebook-wide
# SEED rather than a repeated literal, so changing SEED in the config cell
# changes the split as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [31]:
# Build the (untrained) classifier selected by `algorithm`. Both backends are
# seeded with the notebook-wide SEED instead of a repeated literal.
if algorithm == "xgb":
    model = xgb.XGBClassifier(objective='binary:logistic', random_state=SEED)
elif algorithm == "cat":
    # NOTE: task_type="GPU" asks CatBoost to train on a GPU, so this branch
    # needs a machine with a supported GPU.
    model = catboost.CatBoostClassifier(random_state=SEED, task_type="GPU")
else:
    # ValueError is a subclass of Exception, so existing `except Exception`
    # handlers still catch it; it states more precisely that the configured
    # value is the problem.
    raise ValueError(f"algorithm should either be xgb or cat but {algorithm} has given")

In [32]:
# Train the selected classifier on the training split.
model.fit(X_train, y_train)

Learning rate set to 0.027975
0:	learn: 0.6678578	total: 614ms	remaining: 10m 13s
1:	learn: 0.6467124	total: 1.04s	remaining: 8m 40s
2:	learn: 0.6244260	total: 1.47s	remaining: 8m 8s
3:	learn: 0.6054872	total: 1.91s	remaining: 7m 55s
4:	learn: 0.5868533	total: 2.33s	remaining: 7m 43s
5:	learn: 0.5700224	total: 2.77s	remaining: 7m 39s
6:	learn: 0.5537810	total: 3.22s	remaining: 7m 37s
7:	learn: 0.5388955	total: 3.66s	remaining: 7m 33s
8:	learn: 0.5251386	total: 4.09s	remaining: 7m 30s
9:	learn: 0.5107059	total: 4.54s	remaining: 7m 29s
10:	learn: 0.4961485	total: 4.97s	remaining: 7m 26s
11:	learn: 0.4843067	total: 5.41s	remaining: 7m 25s
12:	learn: 0.4737829	total: 5.84s	remaining: 7m 23s
13:	learn: 0.4633356	total: 6.27s	remaining: 7m 21s
14:	learn: 0.4532170	total: 6.72s	remaining: 7m 21s
15:	learn: 0.4432866	total: 7.17s	remaining: 7m 21s
16:	learn: 0.4346787	total: 7.59s	remaining: 7m 18s
17:	learn: 0.4263347	total: 8.03s	remaining: 7m 18s
18:	learn: 0.4182931	total: 8.45s	remaining:

<catboost.core.CatBoostClassifier at 0x18ed9e639a0>

In [33]:
# Persist the trained model as
# "<destination folder>/<model name>_<algorithm>_based_model.pkl".
serialize(model, f"{destination_folder_name}/{model_name}_{algorithm}_based_model.pkl")

In [34]:
# Reload the model from the file it was saved to, so the evaluation cells
# that follow run against the persisted model.
model = deserialize(f"{destination_folder_name}/{model_name}_{algorithm}_based_model.pkl")

In [35]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

In [36]:
# Confusion matrix of the test predictions: rows are the true labels,
# columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2789,  142],
       [  84, 4411]], dtype=int64)

In [37]:
def check_file(file_path):
    """Ensure ``file_path`` exists, creating an empty file when it is missing.

    A path that already exists is left untouched. Always returns ``None``.
    """
    if os.path.exists(file_path):
        return None
    # Opening in append mode creates the file without writing anything to it.
    open(file_path, "a").close()
    return None

In [38]:
# Make sure a file exists at the path the model is serialized to and loaded
# from. The previous argument pointed into the source (embeddings) folder and
# left out the algorithm, so it created a stray empty .pkl file that no
# visible cell of this notebook reads.
check_file(f"{destination_folder_name}/{model_name}_{algorithm}_based_model.pkl")

In [39]:
# Share of test samples classified correctly, shown as a percentage.
# `y_pred` from the earlier prediction cell is reused here instead of running
# the model on the test split a second time.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 96.96%


In [40]:
# Precision of the test predictions, shown as a percentage.
precision = precision_score(y_true=y_test, y_pred=y_pred)
print(f"Precision: {precision * 100:.2f}%")

Precision: 96.88%


In [41]:
# Recall of the test predictions, shown as a percentage.
recall = recall_score(y_true=y_test, y_pred=y_pred)
print(f"Recall: {recall * 100:.2f}%")

Recall: 98.13%


In [42]:
# F1 score of the test predictions, shown as a percentage.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 score: {f1 * 100:.2f}%")

F1 score: 97.50%
