# Getting The Data

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from nuc_data_tool.db.fetch_data import fetch_files_by_name, fetch_data_by_filename_and_physical_quantities
files = fetch_files_by_name()

In [2]:
import pandas as pd

nuc_data = pd.DataFrame()

for file in files:
    dict_nuc_data = fetch_data_by_filename_and_physical_quantities(file, 'isotope', True)

    for pq in dict_nuc_data:
        if dict_nuc_data[pq].empty:
            continue

        nuc_data = nuc_data.append(dict_nuc_data[pq], ignore_index=True)

nuc_data.info

<bound method DataFrame.info of          nuc_ix     name first_step last_step      middle_step_1  \
0         10010       H1          0         0  0.000004646090694   
1         10020       H2          0         0  0.000002187007232   
2         10030       H3          0         0   0.00002829193536   
3         10040       H4          0         0               0E-9   
4         10050       H5          0         0               0E-9   
...         ...      ...        ...       ...                ...   
362990  1102720    Ds272          0         0               0E-9   
362991  1102730    Ds273          0         0               0E-9   
362992  1102791  Ds279m1          0         0               0E-9   
362993  1112720    Rg272          0         0               0E-9   
362994  1112990   Pseudo          0         0               0E-9   

            middle_step_2      middle_step_3      middle_step_4  \
0       0.000009344544900   0.00001410187475   0.00001891756544   
1       0.0000043

# Setting up Environment

In [3]:
unnecessary_columns = ['nuc_ix', 'name']
numeric_columns = [col for col in nuc_data.columns.tolist() if col not in unnecessary_columns]

In [4]:
from pycaret.anomaly import *

exp_ano = setup(nuc_data, normalize = True, 
                normalize_method='robust',
                ignore_features = unnecessary_columns,
                numeric_features = numeric_columns,
                session_id = 123)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(362995, 195)"
2,Missing Values,True
3,Numeric Features,193
4,Categorical Features,0
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(362995, 193)"
9,CPU Jobs,-1


# Create a Model

In [5]:
iforest = create_model('iforest', n_jobs=-1, fraction=0.01, n_estimators=256)

In [6]:
iforest

IForest(behaviour='new', bootstrap=False, contamination=0.01,
    max_features=1.0, max_samples='auto', n_estimators=256, n_jobs=-1,
    random_state=123, verbose=0)

# Assign a Model

In [7]:
iforest_results = assign_model(iforest)

In [8]:
iforest_results.head()

Unnamed: 0,nuc_ix,name,first_step,last_step,middle_step_1,middle_step_2,middle_step_3,middle_step_4,middle_step_5,middle_step_6,...,middle_step_184,middle_step_185,middle_step_186,middle_step_187,middle_step_188,middle_step_189,middle_step_190,middle_step_191,Anomaly,Anomaly_Score
0,10010,H1,0,0,4.646090694e-06,9.3445449e-06,1.410187475e-05,1.891756544e-05,2.379071329e-05,2.872040805e-05,...,,,,,,,,,0,-0.029097
1,10020,H2,0,0,2.187007232e-06,4.388726293e-06,6.607321145e-06,8.842630832e-06,1.10943665e-05,1.336223707e-05,...,,,,,,,,,0,-0.031314
2,10030,H3,0,0,2.829193536e-05,5.667838403e-05,8.518316066e-05,0.0001138057392,0.0001425441966,0.0001713965614,...,,,,,,,,,0,-0.026371
3,10040,H4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0,-0.124191
4,10050,H5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0,-0.124191


In [9]:
iforest_results[iforest_results.Anomaly == 1]

Unnamed: 0,nuc_ix,name,first_step,last_step,middle_step_1,middle_step_2,middle_step_3,middle_step_4,middle_step_5,middle_step_6,...,middle_step_184,middle_step_185,middle_step_186,middle_step_187,middle_step_188,middle_step_189,middle_step_190,middle_step_191,Anomaly,Anomaly_Score
11179,932390,Np239,0,0,0.0004829907170,0.001311923535,0.002190465713,0.003068430559,0.003937409398,0.004796083377,...,,,,,,,,,1,0.006412
18821,932390,Np239,0,0,0.0004929492885,0.001338973505,0.002235629955,0.003131697168,0.004018593097,0.004894971694,...,,,,,,,,,1,0.006412
22642,932390,Np239,0,0,0.0004879700028,0.001325448520,0.002213047834,0.003100063863,0.003978001248,0.004845527536,...,,,,,,,,,1,0.006412
26463,932390,Np239,0,0,0.0004829907170,0.001311923535,0.002190465713,0.003068430559,0.003937409398,0.004796083377,...,,,,,,,,,1,0.006412
30262,922360,U236,0,0,0.0003154730051,0.0006309399849,0.0009464009394,0.001261855869,0.001577304773,0.001892747653,...,,,,,,,,,1,0.000507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361299,621520,Sm152,0,0,0.0007262910292,0.001486931129,0.002283499545,0.003122343011,0.004006200929,0.004935781313,...,,,,,,,,,1,0.020330
361345,631530,Eu153,0,0,0.00006771642981,0.0002443811383,0.0004981644975,0.0008076552284,0.001158506468,0.001541158355,...,,,,,,,,,1,0.012146
362709,932370,Np237,0,0,0.00004979919478,0.0001947034536,0.0004284677625,0.0007454535286,0.001140567295,0.001609205661,...,,,,,,,,,1,0.023099
362732,942400,Pu240,0,0,0.0002761976058,0.001227239278,0.002933588468,0.005446482148,0.008795989920,0.01299642242,...,,,,,,,,,1,0.024707


In [13]:
iforest_results[iforest_results.Anomaly == 1]["nuc_ix"].T.nunique()

208

# Plot a Model

In [None]:
plot_model(iforest)

In [None]:
plot_model(iforest, plot = 'umap')

# Saving the Model

In [10]:
save_model(iforest,'nuc_all_steps_isotope_iforest_0.01_model')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['nuc_ix', 'name'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=['first_step',
                                                           'last_step',
                                                           'middle_step_1',
                                                           'middle_step_2',
                                                           'middle_step_3',
                                                           'middle_step_4',
                                                           'middle_step_5',
                                                           'middle_step_6',
                                                           'middle_step

# Predict on Unseen Data

In [None]:
predictions = predict_model(iforest, data=nuc_data)

In [None]:
predictions.head()

In [None]:
predictions[predictions.Anomaly == 1]

# Loading the Saved Model

In [None]:
from pycaret.anomaly import *
saved_iforest = load_model('nuc_all_steps_isotope_model')

In [None]:
new_prediction = predict_model(saved_iforest, data=nuc_data)

In [None]:
new_prediction.head()

In [None]:
new_prediction[new_prediction.Label == 1]