In [1]:
import pandas as pd
import numpy as np
import torch
import os
x = torch.load(os.path.join('..', 'output', 'embeddings', 'train_embeds_videonames_concatenated.pt'))
y = x.to_list()
y = torch.stack(y)
X_full = y.numpy()
x = pd.read_csv(os.path.join('..', 'input', 'train.csv'))
y_full = x['label'].to_numpy()
x = torch.load(os.path.join('..', 'output', 'embeddings', 'test_embeds_videonames_concatenated.pt'))
y = x.to_list()
y = torch.stack(y)
X_test = y.numpy()

In [8]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe



# Replace X and y with your feature and target data
X_train, X_dev, y_train, y_dev = train_test_split(X_full, y_full, test_size=0.1, random_state=42)


space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 100
    }


def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']), early_stopping_rounds=10, eval_metric="auc")
    
    evaluation = [( X_train, y_train), ( X_dev, y_dev)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation,verbose=False)
    

    pred = clf.predict(X_dev)
    accuracy = accuracy_score(y_dev, pred>0.5)
    # calculate f1 score
    f1 = f1_score(y_dev, pred>0.5)

    print ("SCORE:", f1)
    return {'loss': -f1, 'status': STATUS_OK }



In [9]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 10,
                        trials = trials)

SCORE:                                                
0.0                                                   
SCORE:                                                            
0.0                                                               
SCORE:                                                            
0.0                                                               
SCORE:                                                            
0.0                                                               
SCORE:                                                            
0.0                                                               
SCORE:                                                            
0.0                                                               
SCORE:                                                            
0.0                                                               
SCORE:                                                            
0.0                

In [33]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.8089595434805966, 'gamma': 8.402954169930872, 'max_depth': 12.0, 'min_child_weight': 3.0, 'reg_alpha': 103.0, 'reg_lambda': 0.7408520399302297}


In [41]:
best_hyperparams['max_depth'] = int(best_hyperparams['max_depth'])

## Making Predictions

In [10]:
# make predictions for test data

import pandas as pd
import numpy as np
import torch
import os
x = torch.load(os.path.join('..', 'output', 'embeddings', 'train_embeds_videonames_concatenated.pt'))
y = x.to_list()
y = torch.stack(y)
X_full = y.numpy()
x = pd.read_csv(os.path.join('..', 'input', 'train.csv'))
y_full = x['label'].to_numpy()
x = torch.load(os.path.join('..', 'output', 'embeddings', 'test_embeds_videonames_concatenated.pt'))
y = x.to_list()
y = torch.stack(y)
X_test = y.numpy()


# Create an XGBoost Classifier
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
xgb_classifier.fit(X_full, y_full)

# Make predictions on the test data
y_pred = xgb_classifier.predict(X_test)

In [11]:
# create a submission file
submission = pd.DataFrame({'ID': list(range(1, len(y_pred) + 1)), 'label': y_pred})

# save the submission file
submission.to_csv(os.path.join('..', 'output', 'xgboost_submission.csv'), index=False)

In [12]:
len(y_pred)

614

In [46]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
# make predictions for test data

import pandas as pd
import numpy as np
import torch
import os

x = torch.load(os.path.join('..', 'output', 'embeddings', 'train_embeds_videonames_extracted_concatenated.pt'))
y = x.to_list()
y = [t.tolist() for t in y]
x = torch.load(os.path.join('..', 'output', 'embeddings', 'train_embeds_videonames_concatenated.pt'))
z = x.to_list()
z = [t.tolist() for t in z]
y = [torch.tensor(y[i] + z[i]) for i in range(len(y))]
y = torch.stack(y)
X_full = y.numpy()

x = pd.read_csv(os.path.join('..', 'input', 'train.csv'))
y_full = x['label'].to_numpy()

x = torch.load(os.path.join('..', 'output', 'embeddings', 'test_embeds_videonames_extracted_concatenated.pt'))
y = x.to_list()
y = [t.tolist() for t in y]
x = torch.load(os.path.join('..', 'output', 'embeddings', 'test_embeds_videonames_concatenated.pt'))
z = x.to_list()
z = [t.tolist() for t in z]
y = [torch.tensor(y[i] + z[i]) for i in range(len(y))]
y = torch.stack(y)
X_test = y.numpy()


# Create an XGBoost Classifier
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
xgb_classifier.fit(X_full, y_full)

# Make predictions on the test data
y_pred = xgb_classifier.predict(X_test)

# create a submission file
submission = pd.DataFrame({'ID': list(range(1, len(y_pred) + 1)), 'label': y_pred})

# save the submission file
submission.to_csv(os.path.join('..', 'output', 'xgboost_submission.csv'), index=False)

len(y_pred)

614