## XGBoost Classifier Algorithm

In [None]:
# XGBoost model creation: pickle persistence helpers and the training loop
def dump_model(model, filename):
    """
    Serialize ``model`` to ``filename`` with pickle (write-binary mode).

    Any existing file at ``filename`` is overwritten. The file is opened in a
    ``with`` block so the handle is flushed and closed deterministically,
    even if pickling raises.
    """
    with open(filename, "wb") as file_obj:
        pickle.dump(model, file_obj)

def load_model(filename):
    """
    Read a pickled object back from ``filename``.

    Returns ``None`` when ``filename`` is not an existing regular file;
    otherwise returns the unpickled object. The file is opened in a ``with``
    block so the handle is always closed.
    """
    if not os.path.isfile(filename):
        return None
    # NOTE: unpickling can execute arbitrary code; only load files this
    # program (or another trusted source) wrote.
    with open(filename, "rb") as file_obj:
        return pickle.load(file_obj)

# Model Training Loop
def _extract_embeddings(model, data):
    """
    Run one batch through the transformer and return ``(embeddings, labels)``.

    The embedding is the mean over dim=1 of the last hidden state, converted
    to a NumPy array; labels are converted to NumPy as well. The forward pass
    runs under ``torch.no_grad()`` because the result is detached anyway.
    """
    # assumes each dataset item carries a singleton dim that squeeze() removes
    # — TODO confirm against the `dataset` class
    input_ids = data["input_ids"].to(device).squeeze()
    attention_mask = data["attention_mask"].to(device).squeeze()
    labels = data["labels"].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    embeddings = torch.mean(outputs.hidden_states[-1], dim=1).squeeze()
    return embeddings.detach().cpu().numpy(), labels.detach().cpu().numpy()


def _score_batch(xgb_model, embeddings, gt):
    """Predict on ``embeddings`` and return ``(accuracy, f1)`` against ``gt``."""
    prediction = xgb_model.predict(embeddings)
    predictions = [round(value) for value in prediction]
    return accuracy_score(gt, predictions), f1_score(gt, predictions)


def main_loop():
    """
    Train and periodically evaluate an XGBoost classifier on RoBERTa embeddings.

    Reads ``data/train.csv`` and ``data/test.csv``, embeds each batch with a
    frozen ``roberta-base`` model, fits the XGBoost classifier on every
    training batch (persisting it to ``xgb_class.pkl``), and every 20th epoch
    prints mean per-batch F1 and accuracy for both train and test data.
    """
    train = pd.read_csv("data/train.csv")
    test = pd.read_csv("data/test.csv")
    # dataset
    train_dataset = dataset(train)
    test_dataset = dataset(test)
    # dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, drop_last=True)
    model = AutoModelForSequenceClassification.from_pretrained("roberta-base", output_hidden_states=True) # 768 * 2

    # Freeze the transformer. The attribute is `requires_grad`; the previous
    # spelling (`require_grads`) only set an unused attribute, leaving every
    # parameter trainable.
    for param in model.parameters():
        param.requires_grad = False
    model.to(device)
    # XGBoost Classifier
    xgb_model = XGBClassifier()
    filename_xgboost_model = "xgb_class.pkl"
    for ep in tqdm(range(100)): #100 iterations
        train_f1 = []
        train_acc = []
        for data in train_dataloader:
            # Load the persisted classifier once per batch (not twice).
            saved_model = load_model(filename_xgboost_model)
            if saved_model is not None:
                xgb_model = saved_model

            embeddings, gt = _extract_embeddings(model, data)
            # NOTE(review): fit() without `xgb_model=` appears to train a fresh
            # booster on just this batch instead of continuing from the loaded
            # model — confirm whether incremental training was intended.
            xgb_model.fit(embeddings, gt, verbose=True)
            dump_model(xgb_model, filename_xgboost_model)
            accuracy, f1Score = _score_batch(xgb_model, embeddings, gt)
            train_acc.append(accuracy)
            train_f1.append(f1Score)

        if ep % 20 == 0: # result after every 20 episodes
            # Testing purpose
            model.eval()
            test_f1 = []
            test_acc = []

            # Wrapping the dataloader directly lets tqdm know the total.
            for data in tqdm(test_dataloader):
                embeddings, gt = _extract_embeddings(model, data)
                accuracy, f1Score = _score_batch(xgb_model, embeddings, gt)
                test_acc.append(accuracy)
                test_f1.append(f1Score)

            print(f'Train F1 {np.array(train_f1).mean()} and Test F1 {np.array(test_f1).mean()}')
            print(f'Train Accuracy {np.array(train_acc).mean()} and Test Accuracy {np.array(test_acc).mean()}')
# Run the full training/evaluation loop when this cell executes.
main_loop()

In [None]:
### Result
# Summary of XGBoost model results at each logged iteration: F1 score and accuracy

# 1%|          | 1/100 [03:40<6:04:24, 220.85s/it]Train F1 1.0 and Test F1 0.8224122351402976
# Train Accuracy 1.0 and Test Accuracy 0.8152173913043478
# 21%|██        | 21/100 [59:16<3:57:46, 180.59s/it]Train F1 1.0 and Test F1 0.8299674575679544
# Train Accuracy 1.0 and Test Accuracy 0.8434103260869565
# 41%|████      | 41/100 [1:54:48<2:57:31, 180.54s/it]Train F1 1.0 and Test F1 0.8230136992151285
# Train Accuracy 1.0 and Test Accuracy 0.8230298913043478
# 61%|██████    | 61/100 [2:50:26<1:57:42, 181.08s/it]Train F1 1.0 and Test F1 0.7743025141140807
# Train Accuracy 1.0 and Test Accuracy 0.7800611413043478
# 81%|████████  | 81/100 [3:46:05<57:14, 180.74s/it]Train F1 1.0 and Test F1 0.8060651487815814
# Train Accuracy 1.0 and Test Accuracy 0.807235054347826