<a href="https://colab.research.google.com/github/bcrompvoets/Star_Formation/blob/main/SF_Classify_XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classifying Class I, II, and others using XGBoost

Using data from the four IRAC bands (3.6, 4.5, 5.8, and 8 μm), we classify each object as "other", a Class I protostar, or a Class II protostar. We use an XGBoost classifier with a small set of hand-picked hyperparameters (`max_depth=7`, `gamma=2`).

These data come from Cornu and Montillaud (2021) (https://cdsarc.cds.unistra.fr/viz-bin/cat/J/A+A/647/A116) and include Spitzer observations of the Orion and NGC 2264 star-forming regions.

In [1]:
# import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# classic ML libraries
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, recall_score, precision_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# custom made libraries
from custom_dataloader import replicate_data

  from pandas import MultiIndex, Int64Index


In [23]:

def bootstrap_estimate_and_ci(estimator, X_tr, y_tr, X_va, y_va, scoring_func=None, random_seed=0,
                               alpha=0.05, n_splits=200):
    """Estimate a validation metric and its spread by bootstrapping the training set.

    For each of ``n_splits`` rounds the training set is resampled with
    replacement (separately within every class, so class counts are preserved
    and no class can vanish from a resample), a fresh copy of ``estimator`` is
    fitted on the resample, and ``scoring_func`` is evaluated on the untouched
    validation set.

    Parameters
    ----------
    estimator : object with ``fit(X, y)`` and ``predict(X)``
        Model to evaluate. It is deep-copied for every round, so the object
        passed in is never refitted or otherwise modified.
    X_tr, y_tr : array-like
        Training inputs and labels; ``y_tr`` is flattened to 1-D.
    X_va, y_va : array-like
        Validation inputs and labels, used unchanged in every round.
    scoring_func : callable
        Metric called as ``scoring_func(y_va, predictions)``. If its signature
        has an ``average`` parameter (as recall/precision/F1 style metrics do)
        it is called with ``average=None`` so one score per class is returned.
    random_seed : int
        Seed for the resampling generator; the same seed reproduces the same
        resamples.
    alpha : float
        Currently unused; kept so existing calls keep working.
    n_splits : int
        Number of bootstrap rounds.

    Returns
    -------
    estimate, stderr
        Mean and standard deviation of the bootstrap scores, in percent.
        Both are floats for a scalar metric, or lists with one entry per
        class for a per-class metric.

    Raises
    ------
    ValueError
        If ``scoring_func`` is missing, ``n_splits`` is smaller than 1, or
        the training set is empty.
    """
    # Local imports keep this cell self-contained.
    import copy
    import inspect

    if scoring_func is None:
        raise ValueError("scoring_func must be a metric callable (e.g. accuracy_score)")
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    X_tr = np.asarray(X_tr)
    y_tr = np.asarray(y_tr).ravel()
    if y_tr.size == 0:
        raise ValueError("the training set is empty")

    # Decide from the metric's own signature whether per-class scores can be
    # requested, instead of comparing against one specific metric function.
    try:
        per_class = "average" in inspect.signature(scoring_func).parameters
    except (TypeError, ValueError):
        per_class = False
    score_kwargs = {"average": None} if per_class else {}

    # Row indices of every class, computed once and reused by each resample.
    class_members = [np.flatnonzero(y_tr == label) for label in np.unique(y_tr)]

    rng = np.random.default_rng(random_seed)
    scores = []
    for _ in range(n_splits):
        picked = np.concatenate(
            [rng.choice(members, size=members.size, replace=True) for members in class_members]
        )
        model = copy.deepcopy(estimator)
        model.fit(X_tr[picked], y_tr[picked])
        scores.append(scoring_func(y_va, model.predict(X_va), **score_kwargs))

    # Shape is (n_splits,) for scalar metrics and (n_splits, n_classes) otherwise;
    # reducing over axis 0 handles any number of classes.
    scores = np.asarray(scores, dtype=float)
    estimate = scores.mean(axis=0) * 100.
    stderr = scores.std(axis=0) * 100.

    if scores.ndim == 1:
        return float(estimate), float(stderr)
    return estimate.tolist(), stderr.tolist()

In [3]:
# Read the stored feature matrix and the matching class labels
X = np.load("Input_Class_AllClasses_Sep.npy")
Y = np.load("Target_Class_AllClasses_Sep.npy")  # original targets via the Gutermuth 2009 method

# Fixed seed so the custom split below is repeatable
seed_val = 1111

# Number of objects of each class to place in the training and validation
# sets; whatever is left over goes to the testing set.
# (CM21 split for reference: amounts_train = [331,1141,231,529,27,70,1257])
amounts_train = [331, 331, 331, 331, 27, 70, 331]
amounts_val = [82, 531, 104, 278, 6, 17, 4359]

# Custom data grabber builds the three sets from the requested amounts
inp_tr, tar_tr, inp_va, tar_va, inp_te, tar_te = replicate_data(
    X, Y, 'seven', amounts_train, amounts_val, seed_val
)

# Standardise every set with statistics learned from the training inputs only
scaler_S = StandardScaler()
scaler_S.fit(inp_tr)
inp_tr = scaler_S.transform(inp_tr)
inp_va = scaler_S.transform(inp_va)
inp_te = scaler_S.transform(inp_te)  # Comment out for 75/25 split

# Report the shape of each set as a sanity check
divider = '------------------------------------'
print('Sizes of Datasets : Inputs , Targets')
print(divider)
print(f'Training set: {inp_tr.shape} , {tar_tr.shape} ')
print(f'Validation set: {inp_va.shape} , {tar_va.shape} ')
print(f'Testing Set: {inp_te.shape}, {tar_te.shape}')
print(divider)


Sizes of Datasets : Inputs , Targets
------------------------------------
Training set: (1752, 8) , (1752,) 
Validation set: (5377, 8) , (5377,) 
Testing Set: (19774, 8), (19774,)
------------------------------------


In [4]:
# xgb_1 = xgb.XGBClassifier(use_label_encoder=False,eval_metric='mlogloss',gamma=1)

# parameters = {'subsample':[0.5,1.0],'max_depth':np.arange(1,11,2),'sampling_method':['uniform']}
# xgbcl = GridSearchCV(xgb_1, parameters)


# Classifier configuration used by the cells below
xgbcl = xgb.XGBClassifier(
    eval_metric='mlogloss',
    use_label_encoder=False,
    gamma=2,
    max_depth=7,
    subsample=1.0,
    sampling_method='uniform',
)


In [5]:
%%time

xgbcl.fit(inp_tr,tar_tr.ravel())  # fit the model with training set
# save in JSON format
xgbcl.save_model("XGB_Settings.json")
# save in text format
xgbcl.save_model("XGB_Settings.txt")

# Find the predicted values
# pred_tr = xgbcl.predict(inp_tr)
# pred_va = xgbcl.predict(inp_va)

# print(classification_report(tar_va,pred_va))
# print(classification_report(tar_tr,pred_tr))

CPU times: user 6.52 s, sys: 3.4 s, total: 9.92 s
Wall time: 1.97 s


In [6]:
# f1_tr = []
# f1_va = []
# G = np.arange(0,15)
# for g in G:
#     xgbcl = xgb.XGBClassifier(gamma=g,max_depth=7,sampling_method='uniform',subsample=0.5,use_label_encoder=False,eval_metric='mlogloss')
#     xgbcl.fit(inp_tr,tar_tr)
#     pred_tr = xgbcl.predict(inp_tr)
#     pred_va = xgbcl.predict(inp_va)

#     f1_tr.append(f1_score(tar_tr,pred_tr,average=None))
#     f1_va.append(f1_score(tar_va,pred_va,average=None))
    




In [7]:
# plt.subplots(1,1,figsize=(12,8))
# import matplotlib.pylab as pl

# n = 7
# colors = pl.cm.Reds(np.linspace(0,1,7))

# # plt.set_cmap('Blues')
# plt.plot(G,f1_tr,label=['T C1','T_C2','T_Ga','T_A','T_Sh','T_P','T_St'])

# # plt.set_cmap('Reds')
# plt.plot(G,f1_va,label='Validate')


# plt.legend()
# plt.grid(True)

In [8]:
# print(xgbcl.best_params_)
# print(xgbcl.best_estimator_)



## Bootstrapping for Errors

In [24]:
# Settings shared by the three bootstrap runs below
boot_settings = dict(random_seed=0, alpha=0.05, n_splits=200)

# Recall Scores (one value per class)
estR, stderrR = bootstrap_estimate_and_ci(
    xgbcl, inp_tr, tar_tr.ravel(), inp_va, tar_va,
    scoring_func=recall_score, **boot_settings
)

# Precision Scores (one value per class)
estP, stderrP = bootstrap_estimate_and_ci(
    xgbcl, inp_tr, tar_tr.ravel(), inp_va, tar_va.ravel(),
    scoring_func=precision_score, **boot_settings
)

# Accuracy Score (a single overall value)
estA, stderrA = bootstrap_estimate_and_ci(
    xgbcl, inp_tr, tar_tr.ravel(), inp_va, tar_va.ravel(),
    scoring_func=accuracy_score, **boot_settings
)


CPU times: user 18min 38s, sys: 4min 12s, total: 22min 51s
Wall time: 5min 12s


In [27]:
# Write the per-class recall and precision (plus the overall accuracy) as
# LaTeX-style table rows, one row per class.
classes = ["Class I", "Class II", "Galaxies", "AGNs", "Shocks", "PAHs", "Stars"]

# Accuracy is a single overall number, so it is written only once, on the
# fourth row; every other row leaves that column empty.
accuracy_row = 3

# NOTE(review): rows end with "//" although a LaTeX row break is normally a
# double backslash; the output is kept exactly as before - confirm the intent.
# The context manager closes the file even if formatting a row fails.
with open("PRAScores_XGB_7Classes.txt", "w") as f:
    f.write("XGB Recall & Precision & Accuracy\n")
    for i, cl in enumerate(classes):
        # "\\pm" writes a literal backslash followed by "pm" without relying
        # on an invalid escape sequence.
        row = (f"{cl}& ${estR[i]:.3f}\\pm{stderrR[i]:.3f}$ & "
               f"${estP[i]:.3f}\\pm{stderrP[i]:.3f}$")
        if i == accuracy_row:
            row += f" & ${estA:.3f}\\pm{stderrA:.3f}$ // \n"
        else:
            row += "&// \n"
        f.write(row)

0
1
2
3
4
5
6
