# XGBoost Classifier

In [1]:
# Reload imported modules automatically so that edits to local .py modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# imports and path setup
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import numpy as np
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from xgboost import XGBClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from userkits.features import *
from userkits.utils import load_train_data, load_eval_data

In [3]:
# Load the training samples with their labels, and the evaluation samples
# (load_eval_data returns a single value here, i.e. no labels).
# X and y are shuffled together with a fixed seed, so the sample order is
# reproducible and the features stay aligned with their labels.
X, y = load_train_data(data_dir='../train')
X, y = shuffle(X, y, random_state=42)
X_eval = load_eval_data(data_dir='../eval')

In [4]:
def extract_features(images, n_jobs=-1):
    """Build a feature array with one entry per image.

    Every image is run through each feature function listed in
    ``process_image``; their outputs are concatenated, in call order, into a
    single flat list of values for that image.

    Args:
        images: Iterable of images in whatever form the feature functions
            called by ``process_image`` accept.
        n_jobs: Number of joblib workers used for the extraction. The default
            of -1 preserves the previous behaviour (use all available cores).

    Returns:
        ``np.ndarray`` built from the per-image feature lists, in the same
        order as ``images``. It is 2-D (n_images, n_features) when every
        image yields the same number of feature values.
    """
    def process_image(img):
        """Return the concatenated feature values for a single image."""
        feats = []
        # add feature functions here
        feats.extend(color_histogram(img))
        feats.extend(lbp_texture_features(img))
        feats.extend(find_mean(img))
        feats.extend(find_stddev(img))
        return feats

    # tqdm wraps the input iterable, so the bar advances as images are handed
    # to the workers rather than as their features finish computing.
    features_list = Parallel(n_jobs=n_jobs)(
        delayed(process_image)(img)
        for img in tqdm(images, desc="Extracting features")
    )
    return np.array(features_list)

In [5]:
# Compute the feature array for the training images. The trailing expression
# displays its shape as the cell output, as a quick sanity check.
X_features = extract_features(X)
X_features.shape

Extracting features:   0%|          | 0/17735 [00:00<?, ?it/s]

(17735, 528)

In [6]:
# Convert the raw labels in y to integer class ids. The fitted encoder is kept
# in label_encoder so the ids can be mapped back to the original labels with
# label_encoder.inverse_transform.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Split the data and train the model.
# Both the split and the classifier are seeded (same seed as the shuffle at
# load time) so the accuracies printed below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_encoded, test_size=0.2, random_state=42
)  # you can change test_size
clf = XGBClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
)  # you can tune hyperparameters here
clf.fit(X_train, y_train)
# Accuracy on the rows the model was fit on vs. the held-out rows; a large gap
# between the two indicates overfitting.
print("Train Accuracy:", clf.score(X_train, y_train))
print("Test Accuracy:", clf.score(X_test, y_test))

Train Accuracy: 0.8667183535382013
Test Accuracy: 0.579644770228362


## Evaluate

In [8]:
# Run the evaluation images through the same feature extraction used for
# training, then predict with the fitted classifier.
# NOTE: clf was fit on label-encoded targets (y_encoded), so eval_predictions
# holds encoded class ids; label_encoder.inverse_transform(eval_predictions)
# recovers the original labels.
X_eval_features = extract_features(X_eval)
eval_predictions = clf.predict(X_eval_features)
print(eval_predictions[:5])

Extracting features:   0%|          | 0/4434 [00:00<?, ?it/s]

[39 31 36  3  3]
