# Testing ML techniques to identify YSOs in Spitzer IRAC data

## Breanna Crompvoets 

## Import Libraries and set global variables

In [11]:
# import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# classic ML libraries
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, recall_score, precision_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
import xgboost as xgb

# custom made libraries
from custom_dataloader import replicate_data
# Three class-label strings; not referenced elsewhere in the visible part of
# this notebook (the result tables below use `classes` instead).
custom_labs = ['Class 1', 'Class 2', 'Others']


# Classical ML Techniques

## Loading Data Set

In [27]:
# Read the input/target arrays: (X, Y) is the pool that gets re-split for
# training, (input_te, target_te) is the separate test set used for scoring.
X, Y = (np.load(path) for path in ("../inp.npy", "../tar.npy"))
input_te, target_te = (np.load(path) for path in ("../inp_test.npy", "../tar_test.npy"))


In [28]:
def bootstrap_estimate(estimator, X, Y, inp_te, tar_te, n_splits=200):
    """Score ``estimator`` on a fixed test set over repeated random train splits.

    On every repetition the pool ``(X, Y)`` is split with ``train_test_split``
    (default proportions, no fixed seed), a ``StandardScaler`` is fitted on the
    training part only, ``estimator`` is fitted on the scaled training part and
    then evaluated on the equally scaled test set ``(inp_te, tar_te)``.

    Parameters
    ----------
    estimator : object with ``fit(X, y)`` and ``predict(X)``
        The same object is refitted on every repetition.
    X, Y : array-like
        Inputs and targets of the pool that is re-split each repetition.
    inp_te, tar_te : array-like
        Unscaled test inputs and their targets. ``inp_te`` is never modified.
    n_splits : int, default 200
        Number of repetitions; must be at least 1.

    Returns
    -------
    estimateR, stderrR, estimateP, stderrP : list of float
        Per-class mean and spread of recall / precision, in percent. One entry
        per class reported by ``recall_score`` / ``precision_score``.
    estimateA, stderrA : float
        Mean and spread of the accuracy, in percent.

    Notes
    -----
    The "stderr" values are ``np.std`` over the repetitions, i.e. the standard
    deviation of the scores, not a standard error of the mean.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    scoresA = []
    scoresP = []
    scoresR = []

    for _ in range(n_splits):
        # Only the training part is used; the validation part of the split is
        # discarded because scoring is done on the external test set.
        inp_tr, _, tar_tr, _ = train_test_split(X, Y)
        scaler_S = StandardScaler().fit(inp_tr)

        # Scale into a *new* name. Re-assigning inp_te here would feed the
        # already-scaled test set into the next repetition's scaler.
        inp_te_scaled = scaler_S.transform(inp_te)

        estimator.fit(scaler_S.transform(inp_tr), tar_tr.ravel())
        pred_te = estimator.predict(inp_te_scaled)

        scoresA.append(accuracy_score(tar_te, pred_te))
        scoresR.append(recall_score(tar_te, pred_te, average=None, zero_division=1))
        scoresP.append(precision_score(tar_te, pred_te, average=None, zero_division=1))

    # Transpose from one row per repetition to one column per class, so the
    # statistics work for any number of classes instead of exactly three.
    recall_by_class = list(zip(*scoresR))
    precision_by_class = list(zip(*scoresP))

    estimateA = np.mean(scoresA) * 100.
    stderrA = np.std(scoresA) * 100.

    estimateR = [np.mean(col) * 100. for col in recall_by_class]
    stderrR = [np.std(col) * 100. for col in recall_by_class]

    estimateP = [np.mean(col) * 100. for col in precision_by_class]
    stderrP = [np.std(col) * 100. for col in precision_by_class]

    return estimateR, stderrR, estimateP, stderrP, estimateA, stderrA

In [19]:
# Results file for the score tables. The handle is kept in the global `f`
# because the following cells append rows to it; it is closed by the
# `f.close()` cell after the last model has been scored.
f = open("PRAScores_ArtificialBalance_TestSet.txt","w")
# First line of the file: provenance of the data set.
f.write("Artificially balanced dataset provided by Hossen Teimoorinia \n")

62

## Logistic Regression

In [20]:
%%time

# L1-penalised logistic regression; the penalty is passed by keyword, matching
# how the stacking cell builds its final estimator.
logreg = LogisticRegression(penalty='l1', solver='saga', max_iter=500, tol=0.0001)

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 8.11 µs


In [21]:
# Row labels for the result tables; also used by the scoring cells further down.
classes = ["Class I", "Class II", "Contaminants"]
# Header names the model whose rows follow: this cell scores the logistic regression.
f.write("LR Recall & Precision & Accuracy\n")

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(logreg, X, Y, input_te, target_te, n_splits=200)

# One table row per class: "name& $recall$ & $precision$". The overall accuracy
# is appended once, on the second row (i == 1).
# NOTE(review): rows end with "//" rather than the LaTeX row terminator "\\";
# kept byte-for-byte, confirm this is what the downstream table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal \pm; the bare "\p" used before is an invalid escape.
    row = f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$ & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$"
    if i == 1:
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)


## SVM

In [22]:
# Support vector classifier with an RBF kernel.
svc = SVC(C=0.9, kernel='rbf', gamma='auto')

In [23]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(svc, X, Y, input_te, target_te, n_splits=200)

# Header names the model whose rows follow: this cell scores the SVC.
f.write("SVM Recall & Precision & Accuracy\n")
# One table row per class: "name& $recall$ & $precision$". The overall accuracy
# is appended once, on the second row (i == 1).
# NOTE(review): rows end with "//" rather than the LaTeX row terminator "\\";
# kept byte-for-byte, confirm this is what the downstream table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal \pm; the bare "\p" used before is an invalid escape.
    row = f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$ & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$"
    if i == 1:
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

## SVM/LR Stacking Ensemble 

In [24]:
# Stacking ensemble: one RBF SVC as the base learner, with an L1-penalised
# logistic regression as the final estimator.
estimators = [
    ('svc', SVC(C=0.9, kernel='rbf', gamma='auto', random_state=42)),
]
stacl = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(penalty='l1', solver='saga', max_iter=500, tol=0.0001),
)

In [None]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(stacl, X, Y, input_te, target_te, n_splits=200)

f.write("Stack Recall & Precision & Accuracy\n")
# One table row per class: "name& $recall$ & $precision$". The overall accuracy
# is appended once, on the second row (i == 1).
# NOTE(review): rows end with "//" rather than the LaTeX row terminator "\\";
# kept byte-for-byte, confirm this is what the downstream table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal \pm; the bare "\p" used before is an invalid escape.
    row = f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$ & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$"
    if i == 1:
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

## Random Forest


In [None]:
# Final hyperparameters for the random forest.
rfcl = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features='log2',
    class_weight='balanced',
    oob_score=False,
)

In [None]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(rfcl, X, Y, input_te, target_te, n_splits=200)

f.write("RF Recall & Precision & Accuracy\n")
# One table row per class: "name& $recall$ & $precision$". The overall accuracy
# is appended once, on the second row (i == 1).
# NOTE(review): rows end with "//" rather than the LaTeX row terminator "\\";
# kept byte-for-byte, confirm this is what the downstream table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal \pm; the bare "\p" used before is an invalid escape.
    row = f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$ & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$"
    if i == 1:
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

## Gradient Boosting

In [None]:
# warm_start is switched off: bootstrap_estimate() calls fit() on this same
# object for every resample, and a warm-started model would carry over the
# trees fitted on the previous split instead of being retrained from scratch,
# so the repetitions would not be independent.
boostcl = GradientBoostingClassifier(criterion='friedman_mse',max_depth=7,max_features='log2',
                n_estimators=100,n_iter_no_change=5,subsample=1.0,warm_start=False)

In [None]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(boostcl, X, Y, input_te, target_te, n_splits=200)

f.write("GB Recall & Precision & Accuracy\n")
# One table row per class: "name& $recall$ & $precision$". The overall accuracy
# is appended once, on the second row (i == 1).
# NOTE(review): rows end with "//" rather than the LaTeX row terminator "\\";
# kept byte-for-byte, confirm this is what the downstream table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal \pm; the bare "\p" used before is an invalid escape.
    row = f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$ & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$"
    if i == 1:
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

## XGBoost


In [None]:
# XGBoost classifier scored with the multiclass log-loss metric.
# NOTE(review): `use_label_encoder` may no longer be recognised by newer
# xgboost releases; confirm against the installed version.
xgbcl = xgb.XGBClassifier(
    max_depth=9,
    subsample=0.5,
    sampling_method='uniform',
    eval_metric='mlogloss',
    use_label_encoder=False,
)

In [None]:

estR, stderrR, estP, stderrP, estA, stderrA = bootstrap_estimate(xgbcl, X, Y, input_te, target_te, n_splits=200)

f.write("XGB Recall & Precision & Accuracy\n")
# One table row per class: "name& $recall$ & $precision$". The overall accuracy
# is appended once, on the second row (i == 1).
# NOTE(review): rows end with "//" rather than the LaTeX row terminator "\\";
# kept byte-for-byte, confirm this is what the downstream table expects.
for i, cl in enumerate(classes):
    # "\\pm" writes a literal \pm; the bare "\p" used before is an invalid escape.
    row = f"{cl}& ${estR[i]:.1f}\\pm{stderrR[i]:.1f}$ & ${estP[i]:.1f}\\pm{stderrP[i]:.1f}$"
    if i == 1:
        row += f" & ${estA:.1f}\\pm{stderrA:.1f}$ // \n"
    else:
        row += "&// \n"
    f.write(row)

In [None]:
# Close the results file opened in the earlier cell so the buffered rows are
# written out.
f.close()

In [None]:
import os
import shutil

# Audible "finished" cue via the `say` command-line tool. The call is skipped
# when `say` is not on PATH, so the cell does not emit a shell
# "command not found" error on systems that lack it.
# os.system('say "tadaaa your program has probably failed"')
if shutil.which("say"):
    os.system('say "beep"')

0

# Neural Network

## Importing

In [9]:
# library imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

from sklearn.metrics import ConfusionMatrixDisplay, classification_report, recall_score, precision_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,  GridSearchCV

# custom script inputs
from NN_Defs import get_n_params, train, validate, BaseMLP

# Use the first CUDA device when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f'Running on : {device}')


Running on : cpu


In [None]:
import random

def bootstrap(NN, epochs, optimizer=None):
    """Train ``NN`` on one fresh random split and score it on that split's validation part.

    A new train/validation split of the module-level ``X`` and ``Y`` is drawn
    with ``MLP_data_setup``, ``main`` runs the training loop for ``epochs``
    epochs, and the validation predictions it returns are scored.

    Parameters
    ----------
    NN : torch.nn.Module
        Network to train. It is trained in place.
        NOTE(review): it is not re-initialised here, so repeated calls with the
        same object continue from the previous weights. Confirm that is intended.
    epochs : int
        Number of epochs passed to ``main``.
    optimizer : torch.optim.Optimizer, optional
        Optimizer to step. When omitted, an SGD optimizer (lr=4e-2,
        momentum=0.9, the settings used elsewhere in this notebook) is built on
        ``NN.parameters()``. Previously the global ``optimizer`` was used,
        which is bound to one specific network and would leave any other (or
        copied) ``NN`` untrained.

    Returns
    -------
    ScoresR, ScoresP : numpy.ndarray
        Per-class recall and precision on the validation part.
    ScoresA : float
        Accuracy on the validation part.
    """
    if optimizer is None:
        # Bind the optimizer to the parameters of the network actually passed in.
        optimizer = optim.SGD(NN.parameters(), lr=4e-2, momentum=0.9)

    train_loader, val_loader = MLP_data_setup(X, Y)
    pred_va, tar_va = main(epochs, NN, optimizer, train_loader, val_loader)

    ScoresA = accuracy_score(tar_va, pred_va)
    ScoresR = recall_score(tar_va, pred_va, average=None, zero_division=1)
    ScoresP = precision_score(tar_va, pred_va, average=None, zero_division=1)

    return ScoresR, ScoresP, ScoresA



def main(epochs, NetInstance, OptInstance, train_loader, val_loader, ScheduleInstance=None):
    """Run the train/validate loop and return the last validation output.

    Each epoch calls ``train`` on ``train_loader``, then ``validate`` on
    ``val_loader``, then steps ``ScheduleInstance`` when one is given. Both
    helpers come from ``NN_Defs`` and run on the module-level ``device``.

    Parameters
    ----------
    epochs : int
        Number of epochs. With ``epochs <= 0`` no training happens and the
        network is validated once as it is, so a result pair is always
        returned (previously this case failed with an UnboundLocalError).
    NetInstance : network passed through to ``train`` / ``validate``.
    OptInstance : optimizer passed through to ``train``.
    train_loader, val_loader : data loaders for training and validation.
    ScheduleInstance : optional
        Object with a ``step()`` method, called once per epoch after validation.

    Returns
    -------
    (val_predictions, val_truth_values)
        The third and fourth values returned by the final ``validate`` call.
    """
    if epochs <= 0:
        _, _, val_predictions, val_truth_values = validate(NetInstance, val_loader, device)
        return val_predictions, val_truth_values

    for epoch in range(epochs):
        # The training loss/predictions returned by train() are not used here.
        train(epoch, NetInstance, OptInstance, train_loader, device)
        _, _, val_predictions, val_truth_values = validate(NetInstance, val_loader, device)

        if ScheduleInstance is not None:
            ScheduleInstance.step()

    return val_predictions, val_truth_values
    




def MLP_data_setup(X, Y, batch_size=25):
    """Split, standardise and wrap ``(X, Y)`` into training and validation loaders.

    Parameters
    ----------
    X, Y : array-like
        Inputs and targets. They are split with ``train_test_split`` (default
        proportions, no fixed seed).
    batch_size : int, default 25
        Batch size used for both loaders (previously hard-coded to 25).

    Returns
    -------
    train_loader, val_loader : torch.utils.data.DataLoader
        Shuffling loaders over the scaled training and validation parts.
    """
    inp_tr, inp_va, tar_tr, tar_va = train_test_split(X, Y)

    # Fit the scaler on the training inputs only and apply it to both parts,
    # so no validation statistics leak into the scaling.
    scaler_S = StandardScaler().fit(inp_tr)
    inp_tr = scaler_S.transform(inp_tr)
    inp_va = scaler_S.transform(inp_va)

    # NOTE(review): torch.as_tensor keeps the NumPy dtype of each array;
    # confirm this is the dtype the network and loss expect.
    train_data = data_utils.TensorDataset(torch.as_tensor(inp_tr), torch.as_tensor(tar_tr))
    val_data = data_utils.TensorDataset(torch.as_tensor(inp_va), torch.as_tensor(tar_va))

    # data_utils is torch.utils.data, so these are the same DataLoader class as before.
    train_loader = data_utils.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = data_utils.DataLoader(val_data, batch_size=batch_size, shuffle=True)
    return train_loader, val_loader

# Load the input/target pool again (presumably so this section can be run
# without the classical-ML cells above).
X, Y = np.load("../inp.npy"), np.load("../tar.npy")

# Network and the optimizer that steps its parameters.
BaseNN = BaseMLP(4, 20, 3, weight_initialize=True)
optimizer = optim.SGD(BaseNN.parameters(), lr=4e-2, momentum=0.9)

# One split, one 1000-epoch training run.
train_loader, val_loader = MLP_data_setup(X, Y)
pred_va, tar_va = main(1000, BaseNN, optimizer, train_loader, val_loader)

# Validation accuracy first, then per-class recall and precision.
print(accuracy_score(tar_va, pred_va))
for per_class_metric in (recall_score, precision_score):
    print(per_class_metric(tar_va, pred_va, average=None, zero_division=1))


# estR, stderrR, estP, stderrP, estA, stderrA  = bootstrap_estimate_MLP(BaseNN, X, Y, n_splits=200, epochs =50000)
# iters = [(BaseNN,50000)] * 50
# ans = []
# for n in [0,1,2,3]:

#     with mp.Pool(12) as pool:
#         ans.append(pool.starmap(bootstrap, iters))
#     np.save(f"intermediatesave_onelayerMLP_{n}.npy",ans)
    


