# Testing ML techniques to identify YSOs in Spitzer IRAC data

## Breanna Crompvoets 

## Import Libraries and set global variables

In [1]:
# import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# classic ML libraries
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, recall_score, precision_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
import xgboost as xgb

# custom made libraries
from custom_dataloader import replicate_data
# Human-readable names for three classes (presumably plot/report labels --
# nothing in the classical-ML cells shown here reads this list; TODO confirm use).
custom_labs = ['Class 1', 'Class 2', 'Others']


  from pandas import MultiIndex, Int64Index


# Classical ML Techniques

## Loading Data Set

In [2]:
# data load
# X and Y are read from NumPy files one directory above the notebook.
# Presumably X holds the input features and Y the class targets (file names
# "inp"/"tar") -- shapes are not visible from here; TODO confirm.
X = np.load("../inp.npy")
Y = np.load("../tar.npy") 

# inp_tr, inp_va, tar_tr, tar_va = train_test_split(X,Y)

# inp_te = np.load("../inp_test.npy")
# tar_te = np.load("../tar_test.npy")


# # scaling data according to training inputs
# scaler_S = StandardScaler().fit(inp_tr)
# inp_tr = scaler_S.transform(inp_tr)
# inp_va = scaler_S.transform(inp_va)
# inp_te = scaler_S.transform(inp_te) 

# # printouts for double checking all the sets and amounts
# print('Sizes of Datasets : Inputs , Targets')
# print('------------------------------------')
# print(f'Training set: {inp_tr.shape} , {tar_tr.shape} \nValidation set: {inp_va.shape} , {tar_va.shape} \nTesting Set: {inp_te.shape}, {tar_te.shape}')
# print('------------------------------------')


In [3]:
def bootstrap_estimate(estimator, X, Y, n_splits=200, random_state=None):
    """Score ``estimator`` over repeated random train/validation splits.

    On every round the data are split with ``train_test_split`` (its default
    proportions), a ``StandardScaler`` is fitted on the training part only and
    applied to both parts, ``estimator`` is refitted on the scaled training
    part, and its predictions on the validation part are scored.

    Despite the name, this is repeated random sub-sampling: nothing is
    resampled with replacement.

    Parameters
    ----------
    estimator : object
        Classifier exposing ``fit`` and ``predict``.  The same object is
        refitted in place on every round (it is not cloned), so it is left
        fitted on the last split.  Estimators that keep state between ``fit``
        calls (e.g. scikit-learn's ``warm_start=True``) will therefore carry
        information from one split into the next.
    X : array-like
        Input features, one row per sample.
    Y : numpy.ndarray
        Class targets aligned with ``X``; the training part is flattened with
        ``ravel()`` before fitting.
    n_splits : int, default 200
        Number of random splits to evaluate.  Must be at least 1.
    random_state : int or None, default None
        Seed for the sequence of splits.  ``None`` keeps the previous
        behaviour (splits drawn from NumPy's global random state).

    Returns
    -------
    estimateR, stderrR : list of float
        Per-class mean and standard deviation of recall, in percent, ordered
        by sorted unique label of ``Y``.
    estimateP, stderrP : list of float
        Same for precision.
    estimateA, stderrA : float
        Mean and standard deviation of accuracy, in percent.

    Notes
    -----
    The ``stderr*`` values are plain standard deviations across splits
    (``np.std``), not standard errors of the mean.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # Pin the label set so every split returns one score per class in the same
    # order, even when a class is absent from a validation split; without this
    # the per-class columns could silently shift or be truncated.
    labels = np.unique(Y)

    # One generator shared by all rounds gives a reproducible *sequence* of
    # different splits; None defers to NumPy's global state as before.
    rng = None if random_state is None else np.random.RandomState(random_state)

    scoresA = []
    scoresR = []
    scoresP = []

    for _ in range(n_splits):
        inp_tr, inp_va, tar_tr, tar_va = train_test_split(X, Y, random_state=rng)

        # Scaling statistics come from the training part only, so nothing
        # about the validation part leaks into preprocessing.
        scaler_S = StandardScaler().fit(inp_tr)
        inp_tr = scaler_S.transform(inp_tr)
        inp_va = scaler_S.transform(inp_va)

        estimator.fit(inp_tr, tar_tr.ravel())
        pred_va = estimator.predict(inp_va)

        scoresA.append(accuracy_score(tar_va, pred_va))
        scoresR.append(recall_score(tar_va, pred_va, labels=labels,
                                    average=None, zero_division=1))
        scoresP.append(precision_score(tar_va, pred_va, labels=labels,
                                       average=None, zero_division=1))

    # Rows are splits, columns are classes.
    scoresR = np.asarray(scoresR)
    scoresP = np.asarray(scoresP)

    estimateA = np.mean(scoresA)*100.
    stderrA = np.std(scoresA)*100.

    estimateR = list(scoresR.mean(axis=0)*100.)
    stderrR = list(scoresR.std(axis=0)*100.)

    estimateP = list(scoresP.mean(axis=0)*100.)
    stderrP = list(scoresP.std(axis=0)*100.)

    return estimateR, stderrR, estimateP, stderrP, estimateA, stderrA

In [4]:
# Results file that the model cells below append their score rows to.
# The handle is deliberately left open here; a later cell calls f.close().
# NOTE(review): if any cell in between raises, the file is never closed/flushed.
f = open("PRAScores_ArtificialBalance.txt","w")
f.write("Artificially balanced dataset provided by Hossen Teimoorinia \n")

62

## Logistic Regression

In [5]:
%%time

# L1-penalised logistic regression; 'saga' is used as the solver for the L1 penalty.
# (The %%time above only times construction of the object, not training.)
logreg = LogisticRegression(
    penalty='l1',
    solver='saga',
    max_iter=500,
    tol=0.0001,
)

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 11.9 µs


In [6]:
# Row labels for the score table, in the order bootstrap_estimate returns its
# per-class values (assumes the targets sort as Class I, Class II,
# Contaminants -- TODO confirm).  The later reporting cells reuse this list.
classes = ["Class I", "Class II", "Contaminants"]

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(logreg, X, Y, n_splits=200)

# Header names the model evaluated in this cell (logistic regression).
f.write("LR Recall & Precision & Accuracy\n")

# NOTE(review): rows end in "//" rather than the usual LaTeX row terminator
# "\\" -- kept as-is; confirm what the consuming table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal LaTeX \pm; a bare "\p" is an invalid escape sequence.
    row = (f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$"
           f" & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$")
    if i == 1:
        # Accuracy is one overall number, so it is written once, on the middle row.
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)


## SVM

In [7]:
# RBF-kernel support vector classifier.
svc = SVC(
    C=0.9,
    kernel='rbf',
    gamma='auto',
)

In [8]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(svc, X, Y, n_splits=200)

# Header names the model evaluated in this cell (the SVM).
f.write("SVM Recall & Precision & Accuracy\n")

# NOTE(review): rows end in "//" rather than the usual LaTeX row terminator
# "\\" -- kept as-is; confirm what the consuming table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal LaTeX \pm; a bare "\p" is an invalid escape sequence.
    row = (f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$"
           f" & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$")
    if i == 1:
        # Accuracy is one overall number, so it is written once, on the middle row.
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

## SVM/LR Stacking Ensemble 

In [9]:
# Stacking ensemble: a single RBF SVC as the base learner, with an
# L1-penalised logistic regression combining its outputs.
estimators = [
    ('svc', SVC(C=0.9, kernel='rbf', gamma='auto', random_state=42)),
]
stacl = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(
        penalty='l1',
        solver='saga',
        max_iter=500,
        tol=0.0001,
    ),
)

In [10]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(stacl, X, Y, n_splits=200)

f.write("Stack Recall & Precision & Accuracy\n")

# NOTE(review): rows end in "//" rather than the usual LaTeX row terminator
# "\\" -- kept as-is; confirm what the consuming table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal LaTeX \pm; a bare "\p" is an invalid escape sequence.
    row = (f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$"
           f" & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$")
    if i == 1:
        # Accuracy is one overall number, so it is written once, on the middle row.
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

## Random Forest


## Gradient Boosting

In [11]:
# warm_start is off on purpose: bootstrap_estimate refits this one object on
# every random split, and with warm_start=True each fit would keep the trees
# grown on earlier splits instead of starting fresh, so samples now in the
# validation part could already have been trained on (inflated scores).
boostcl = GradientBoostingClassifier(criterion='friedman_mse',max_depth=7,max_features='log2',
                n_estimators=100,n_iter_no_change=5,subsample=1.0,warm_start=False)

In [12]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(boostcl, X, Y, n_splits=200)

f.write("GB Recall & Precision & Accuracy\n")

# NOTE(review): rows end in "//" rather than the usual LaTeX row terminator
# "\\" -- kept as-is; confirm what the consuming table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal LaTeX \pm; a bare "\p" is an invalid escape sequence.
    row = (f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$"
           f" & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$")
    if i == 1:
        # Accuracy is one overall number, so it is written once, on the middle row.
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

## XGBoost


In [13]:
# XGBoost classifier scored with multiclass log-loss; each boosting round
# uses a uniform 50% subsample of the training rows.
xgbcl = xgb.XGBClassifier(
    max_depth=9,
    subsample=0.5,
    sampling_method='uniform',
    eval_metric='mlogloss',
    use_label_encoder=False,
)

In [14]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(xgbcl, X, Y, n_splits=200)

f.write("XGB Recall & Precision & Accuracy\n")

# NOTE(review): rows end in "//" rather than the usual LaTeX row terminator
# "\\" -- kept as-is; confirm what the consuming table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal LaTeX \pm; a bare "\p" is an invalid escape sequence.
    row = (f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$"
           f" & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$")
    if i == 1:
        # Accuracy is one overall number, so it is written once, on the middle row.
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

In [15]:
# Random forest with its final hyperparameters: entropy splits, log2 feature
# subsampling, and class weights balanced against class frequency.
rfcl = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features='log2',
    class_weight='balanced',
    oob_score=False,
)

In [16]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(rfcl, X, Y, n_splits=200)

f.write("RF Recall & Precision & Accuracy\n")

# NOTE(review): rows end in "//" rather than the usual LaTeX row terminator
# "\\" -- kept as-is; confirm what the consuming table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal LaTeX \pm; a bare "\p" is an invalid escape sequence.
    row = (f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$"
           f" & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$")
    if i == 1:
        # Accuracy is one overall number, so it is written once, on the middle row.
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

In [17]:
# Close the results file that the model cells above have been writing to.
f.close()

In [18]:
import os
# Audible notification that the long-running cells above have finished.
# `say` is presumably the macOS text-to-speech command; on systems without it
# os.system just returns a non-zero status -- TODO confirm target platform.
# os.system('say "tadaaa your program has probably failed"')
os.system('say "beep"')

0

# Neural Network

In this section, we describe how we replicated the CM21 paper and the choices we made along the way to obtain the best results we could achieve. This section was performed by S. Fielder.

All of the training was performed prior to this notebook, and the state of each network was saved at the appropriate time. Below we simply import the saved state and run the testing set through the network for evaluation; metrics and confusion matrices are output for each subsection.

Many of the functions called below are semi-custom and are found in the appropriate `.py` files in this directory. The majority come from `NN_Defs.py`, where the model construction and the training and validation functions are located. Additionally, our custom data-split maker is found in `custom_dataloader.py`, with which we can build reproducible training, validation and testing sets depending on the number of subclasses wanted in each set. In this example, we focus on the results achieved using the CM21 data split as produced by this loader.

Finally, `network_runner.py` was the script used to train all of the networks, print out the metrics, and save plots of both the confusion matrices and the loss values. Some of these outputs are shown in the `Saved_Final_Data/` directory. The settings found therein are imported below to load the state of the networks as mentioned above.

## Importing

In [19]:
# library imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

from sklearn.metrics import ConfusionMatrixDisplay, classification_report, recall_score, precision_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,  GridSearchCV

# custom script inputs
from NN_Defs import get_n_params, train, validate, BaseMLP

# Use the first CUDA device when PyTorch reports one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Running on : {device}')


Running on : cpu
