# Classifying Class I, II, and others using XGBoost

Using data from the four IRAC bands (3.6, 4.5, 5.8, and 8 μm), we classify each object as "other", a Class I protostar, or a Class II protostar. We use an XGBoost classifier with hyperparameter values determined via a previous GridSearch.

This data comes from Cornu and Montillaud (2021) (https://cdsarc.cds.unistra.fr/viz-bin/cat/J/A+A/647/A116) and includes Spitzer data of the Orion and NGC 2264 star-forming regions.

In [25]:
# import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# classic ML libraries
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, recall_score, precision_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# custom made libraries
from custom_dataloader import replicate_data

In [26]:
# Disabled grid search over `subsample` and `max_depth`, left here for
# reference; the classifier below is built with fixed hyperparameters instead.
# xgb_1 = xgb.XGBClassifier(use_label_encoder=False,eval_metric='mlogloss',gamma=1)
#
# parameters = {'subsample':[0.5,1.0],'max_depth':np.arange(1,11,2),'sampling_method':['uniform']}
# xgbcl = GridSearchCV(xgb_1, parameters)
# xgbcl.fit(inp_tr,tar_tr.ravel())
# print(xgbcl.best_params_)
# print(xgbcl.best_estimator_)

# Classifier with fixed hyperparameters, using multiclass log-loss as the
# evaluation metric.
xgbcl = xgb.XGBClassifier(
    eval_metric='mlogloss',
    use_label_encoder=False,
    gamma=2,
    max_depth=7,
    subsample=1.0,
    sampling_method='uniform',
)


## Bootstrapping for Errors

In [27]:
def bootstrap(estimator):
    """Fit ``estimator`` on one freshly drawn data split and score it on validation data.

    Reads the module-level globals ``X``, ``Y``, ``amounts_train`` and
    ``amounts_val`` and hands them to ``replicate_data`` together with a random
    integer in [0, 1000] (presumably used as the sampling seed -- confirm
    against ``custom_dataloader``).

    Parameters
    ----------
    estimator
        Object exposing ``fit(inputs, targets)`` and ``predict(inputs)``.
        It is fitted in place.

    Returns
    -------
    tuple
        ``(recall, precision, accuracy)`` where ``recall`` and ``precision``
        hold one score per class (``average=None``) and ``accuracy`` is a
        single float, all computed on the validation split.
    """
    # Imported here because the file never imports `random` at module level;
    # without this the randint call below raises NameError.
    import random

    draw_id = random.randint(0, 1000)
    # The test split is returned by replicate_data but is not used in this function.
    inp_tr, tar_tr, inp_va, tar_va, _inp_te, _tar_te = replicate_data(
        X, Y, 'seven', amounts_train, amounts_val, draw_id)

    # Scale using statistics of the training inputs only, then apply the same
    # transform to the validation inputs.
    scaler_S = StandardScaler().fit(inp_tr)
    inp_tr = scaler_S.transform(inp_tr)
    inp_va = scaler_S.transform(inp_va)

    estimator.fit(inp_tr, tar_tr.ravel())
    pred_va = estimator.predict(inp_va)

    ScoresA = accuracy_score(tar_va, pred_va)
    # zero_division=1: a class with an undefined score is reported as 1
    # instead of triggering a warning.
    ScoresR = recall_score(tar_va, pred_va, average=None, zero_division=1)
    ScoresP = precision_score(tar_va, pred_va, average=None, zero_division=1)

    return ScoresR, ScoresP, ScoresA



In [29]:
# `mp` is not imported anywhere else in this file; without this import the
# Pool call below raises NameError.
import multiprocessing as mp

X = np.load("Input_Class_AllClasses_Sep.npy")
Y = np.load("Target_Class_AllClasses_Sep.npy") # For original targets via Gutermuth 2009 Method

# Sample counts handed to replicate_data for the training and validation
# splits (presumably one entry per class -- confirm against custom_dataloader).
amounts_train = [331,331,331,331,27,70,331]
amounts_val = [82, 531, 104, 278, 6, 17, 4359]


xgbcl = xgb.XGBClassifier(max_depth=7,sampling_method='uniform',subsample=1.0,gamma=2,use_label_encoder=False,eval_metric='mlogloss')

# One entry per bootstrap run; every entry refers to the same estimator object.
n_bootstrap = 10
iters = [xgbcl] * n_bootstrap

num_cores = 2

# Run the bootstrap fits in worker processes; `ans` holds one
# (recall, precision, accuracy) tuple per run, in input order.
with mp.Pool(num_cores) as pool:
    ans = pool.map(bootstrap, iters)


In [30]:
# Each element of `ans` is one bootstrap result:
# (per-class recall, per-class precision, overall accuracy).
# Transpose once so that each metric is grouped across runs.
scoresR, scoresP, scoresA = (list(metric) for metric in zip(*ans))

# Regroup recall and precision from "one array per run" into
# "one list of run values per class".
scoresR = [list(per_class) for per_class in zip(*scoresR)]
scoresP = [list(per_class) for per_class in zip(*scoresP)]

# Mean and spread over the bootstrap runs, expressed in percent.
# NOTE: the `stderr*` names hold np.std (the standard deviation across runs),
# not a standard error of the mean.
estA = np.mean(scoresA)*100.
stderrA = np.std(scoresA)*100.

# One value per class, for however many classes the scores contain.
estR = [np.mean(runs)*100. for runs in scoresR]
stderrR = [np.std(runs)*100. for runs in scoresR]

estP = [np.mean(runs)*100. for runs in scoresP]
stderrP = [np.std(runs)*100. for runs in scoresP]


In [32]:
# Write one table row per class: recall and precision as "mean\pm spread",
# with the overall accuracy appended on a single row only.
classes = ["Class I", "Class II", "Galaxies", "AGNs", "Shocks", "PAHs", "Stars"]

# Index of the row that also carries the overall accuracy (classes[3], "AGNs").
accuracy_row = 3

# NOTE(review): rows end in "//" rather than the LaTeX row terminator "\\";
# the emitted text is kept exactly as before -- confirm whether this is intended.
# The `with` block guarantees the file is closed even if a write fails.
with open("PRAScores_XGB_7Classes.txt", "w") as f:
    f.write("XGB Recall & Precision & Accuracy\n")
    for i, cl in enumerate(classes):
        # "\\pm" spells the backslash explicitly; a bare "\p" is an invalid
        # escape sequence in a normal string literal.
        row = (
            f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$ & "
            f"${estP[i]:.1f}\\pm{stderrP[i]:.1f}$"
        )
        if i == accuracy_row:
            row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
        else:
            row += "&// \n"
        f.write(row)