# Getting The Data

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from nuc_data_tool.db.fetch_data import fetch_files_by_name, fetch_data_by_filename_and_physical_quantities
files = fetch_files_by_name()

In [2]:
import pandas as pd

nuc_data = pd.DataFrame(columns=['nuc_ix', 'name'])

for file in files:
    dict_nuc_data = fetch_data_by_filename_and_physical_quantities(file, 'isotope', True)

    for pq in dict_nuc_data:
        if dict_nuc_data[pq].empty:
            continue

        dict_nuc_data[pq].rename(columns={'first_step': f'{file.name}_first_step',
                                         'last_step': f'{file.name}_last_step'},
                                inplace=True)
        columns = {col: f'{file.name}_{col}'
                           for col in dict_nuc_data[pq].columns.tolist()
                           if 'middle_step' in col}
        dict_nuc_data[pq].rename(columns=columns, inplace=True)
        nuc_data = pd.merge(nuc_data, dict_nuc_data[pq], how='outer', on=['nuc_ix', 'name'])

In [8]:
nuc_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3821 entries, 0 to 3820
Data columns (total 654 columns):
 #    Column                                                 Dtype 
---   ------                                                 ----- 
 0    nuc_ix                                                 int64 
 1    name                                                   object
 2    mu_day_UO2Flux_CRAM_1ton_100steps_first_step           object
 3    mu_day_UO2Flux_CRAM_1ton_100steps_last_step            object
 4    mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_1        object
 5    mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_2        object
 6    mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_3        object
 7    mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_4        object
 8    mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_5        object
 9    mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_6        object
 10   mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_7        object
 11   mu

# Setting up Environment

In [5]:
unnecessary_columns = ['nuc_ix', 'name']
numeric_columns = [col for col in nuc_data.columns.tolist() if col not in unnecessary_columns]

In [6]:
from decimal import Decimal

for y, row in enumerate(nuc_data.itertuples(index=False)):
    for x, ele in enumerate(row):
        if not isinstance(ele, Decimal):
            print(f'{x},{y}: {ele}')

0,0: 10010
1,0: H1
0,1: 10020
1,1: H2
0,2: 10030
1,2: H3
0,3: 10040
1,3: H4
0,4: 10050
1,4: H5
0,5: 10060
1,5: H6
0,6: 10070
1,6: H7
0,7: 20030
1,7: He3
0,8: 20040
1,8: He4
0,9: 20050
1,9: He5
0,10: 20060
1,10: He6
0,11: 20070
1,11: He7
0,12: 20080
1,12: He8
0,13: 20090
1,13: He9
0,14: 20100
1,14: He10
0,15: 30040
1,15: Li4
0,16: 30050
1,16: Li5
0,17: 30060
1,17: Li6
0,18: 30070
1,18: Li7
0,19: 30080
1,19: Li8
0,20: 30090
1,20: Li9
0,21: 30100
1,21: Li10
0,22: 30110
1,22: Li11
0,23: 30120
1,23: Li12
0,24: 40050
1,24: Be5
0,25: 40060
1,25: Be6
0,26: 40070
1,26: Be7
0,27: 40080
1,27: Be8
0,28: 40090
1,28: Be9
0,29: 40100
1,29: Be10
0,30: 40110
1,30: Be11
0,31: 40120
1,31: Be12
0,32: 40130
1,32: Be13
0,33: 40140
1,33: Be14
0,34: 40150
1,34: Be15
0,35: 40160
1,35: Be16
0,36: 50060
1,36: B6
0,37: 50070
1,37: B7
0,38: 50080
1,38: B8
0,39: 50090
1,39: B9
0,40: 50100
1,40: B10
0,41: 50110
1,41: B11
0,42: 50120
1,42: B12
0,43: 50130
1,43: B13
0,44: 50140
1,44: B14
0,45: 50150
1,45: B15
0,46: 50

1,729: Ge73m1
0,730: 320740
1,730: Ge74
0,731: 320750
1,731: Ge75
0,732: 320751
1,732: Ge75m1
0,733: 320760
1,733: Ge76
0,734: 320770
1,734: Ge77
0,735: 320771
1,735: Ge77m1
0,736: 320780
1,736: Ge78
0,737: 320790
1,737: Ge79
0,738: 320791
1,738: Ge79m1
0,739: 320800
1,739: Ge80
0,740: 320810
1,740: Ge81
0,741: 320811
1,741: Ge81m1
0,742: 320820
1,742: Ge82
0,743: 320830
1,743: Ge83
0,744: 320840
1,744: Ge84
0,745: 320850
1,745: Ge85
0,746: 320860
1,746: Ge86
0,747: 320870
1,747: Ge87
0,748: 320880
1,748: Ge88
0,749: 320890
1,749: Ge89
0,750: 330600
1,750: As60
0,751: 330610
1,751: As61
0,752: 330620
1,752: As62
0,753: 330630
1,753: As63
0,754: 330640
1,754: As64
0,755: 330650
1,755: As65
0,756: 330660
1,756: As66
0,757: 330670
1,757: As67
0,758: 330680
1,758: As68
0,759: 330690
1,759: As69
0,760: 330700
1,760: As70
0,761: 330710
1,761: As71
0,762: 330720
1,762: As72
0,763: 330730
1,763: As73
0,764: 330740
1,764: As74
0,765: 330750
1,765: As75
0,766: 330751
1,766: As75m1
0,767: 330760


1,1313: Pd115m1
0,1314: 461160
1,1314: Pd116
0,1315: 461170
1,1315: Pd117
0,1316: 461171
1,1316: Pd117m1
0,1317: 461180
1,1317: Pd118
0,1318: 461190
1,1318: Pd119
0,1319: 461200
1,1319: Pd120
0,1320: 461210
1,1320: Pd121
0,1321: 461220
1,1321: Pd122
0,1322: 461230
1,1322: Pd123
0,1323: 461240
1,1323: Pd124
0,1324: 461250
1,1324: Pd125
0,1325: 461260
1,1325: Pd126
0,1326: 470930
1,1326: Ag93
0,1327: 470940
1,1327: Ag94
0,1328: 470941
1,1328: Ag94m1
0,1329: 470942
1,1329: Ag94m2
0,1330: 470950
1,1330: Ag95
0,1331: 470951
1,1331: Ag95m1
0,1332: 470952
1,1332: Ag95m2
0,1333: 470953
1,1333: Ag95m3
0,1334: 470960
1,1334: Ag96
0,1335: 470961
1,1335: Ag96m1
0,1336: 470970
1,1336: Ag97
0,1337: 470980
1,1337: Ag98
0,1338: 470990
1,1338: Ag99
0,1339: 470991
1,1339: Ag99m1
0,1340: 471000
1,1340: Ag100
0,1341: 471001
1,1341: Ag100m1
0,1342: 471010
1,1342: Ag101
0,1343: 471011
1,1343: Ag101m1
0,1344: 471020
1,1344: Ag102
0,1345: 471021
1,1345: Ag102m1
0,1346: 471030
1,1346: Ag103
0,1347: 471031
1,13

0,2010: 601300
1,2010: Nd130
0,2011: 601310
1,2011: Nd131
0,2012: 601320
1,2012: Nd132
0,2013: 601330
1,2013: Nd133
0,2014: 601331
1,2014: Nd133m1
0,2015: 601340
1,2015: Nd134
0,2016: 601350
1,2016: Nd135
0,2017: 601351
1,2017: Nd135m1
0,2018: 601360
1,2018: Nd136
0,2019: 601370
1,2019: Nd137
0,2020: 601371
1,2020: Nd137m1
0,2021: 601380
1,2021: Nd138
0,2022: 601390
1,2022: Nd139
0,2023: 601391
1,2023: Nd139m1
0,2024: 601400
1,2024: Nd140
0,2025: 601410
1,2025: Nd141
0,2026: 601411
1,2026: Nd141m1
0,2027: 601420
1,2027: Nd142
0,2028: 601430
1,2028: Nd143
0,2029: 601440
1,2029: Nd144
0,2030: 601450
1,2030: Nd145
0,2031: 601460
1,2031: Nd146
0,2032: 601470
1,2032: Nd147
0,2033: 601480
1,2033: Nd148
0,2034: 601490
1,2034: Nd149
0,2035: 601500
1,2035: Nd150
0,2036: 601510
1,2036: Nd151
0,2037: 601520
1,2037: Nd152
0,2038: 601530
1,2038: Nd153
0,2039: 601540
1,2039: Nd154
0,2040: 601550
1,2040: Nd155
0,2041: 601560
1,2041: Nd156
0,2042: 601570
1,2042: Nd157
0,2043: 601580
1,2043: Nd158
0,20

1,2738: Re180
0,2739: 751810
1,2739: Re181
0,2740: 751820
1,2740: Re182
0,2741: 751821
1,2741: Re182m1
0,2742: 751830
1,2742: Re183
0,2743: 751831
1,2743: Re183m1
0,2744: 751840
1,2744: Re184
0,2745: 751841
1,2745: Re184m1
0,2746: 751850
1,2746: Re185
0,2747: 751860
1,2747: Re186
0,2748: 751861
1,2748: Re186m1
0,2749: 751870
1,2749: Re187
0,2750: 751880
1,2750: Re188
0,2751: 751881
1,2751: Re188m1
0,2752: 751890
1,2752: Re189
0,2753: 751900
1,2753: Re190
0,2754: 751901
1,2754: Re190m1
0,2755: 751910
1,2755: Re191
0,2756: 751920
1,2756: Re192
0,2757: 751930
1,2757: Re193
0,2758: 751940
1,2758: Re194
0,2759: 761620
1,2759: Os162
0,2760: 761630
1,2760: Os163
0,2761: 761640
1,2761: Os164
0,2762: 761650
1,2762: Os165
0,2763: 761660
1,2763: Os166
0,2764: 761670
1,2764: Os167
0,2765: 761680
1,2765: Os168
0,2766: 761690
1,2766: Os169
0,2767: 761700
1,2767: Os170
0,2768: 761710
1,2768: Os171
0,2769: 761720
1,2769: Os172
0,2770: 761730
1,2770: Os173
0,2771: 761740
1,2771: Os174
0,2772: 761750
1,

0,3512: 922340
1,3512: U234
0,3513: 922350
1,3513: U235
0,3514: 922351
1,3514: U235m1
0,3515: 922360
1,3515: U236
0,3516: 922370
1,3516: U237
0,3517: 922380
1,3517: U238
0,3518: 922390
1,3518: U239
0,3519: 922400
1,3519: U240
0,3520: 922410
1,3520: U241
0,3521: 922420
1,3521: U242
0,3522: 932250
1,3522: Np225
0,3523: 932260
1,3523: Np226
0,3524: 932270
1,3524: Np227
0,3525: 932280
1,3525: Np228
0,3526: 932290
1,3526: Np229
0,3527: 932300
1,3527: Np230
0,3528: 932310
1,3528: Np231
0,3529: 932320
1,3529: Np232
0,3530: 932330
1,3530: Np233
0,3531: 932340
1,3531: Np234
0,3532: 932350
1,3532: Np235
0,3533: 932360
1,3533: Np236
0,3534: 932361
1,3534: Np236m1
0,3535: 932370
1,3535: Np237
0,3536: 932380
1,3536: Np238
0,3537: 932390
1,3537: Np239
0,3538: 932400
1,3538: Np240
0,3539: 932401
1,3539: Np240m1
0,3540: 932410
1,3540: Np241
0,3541: 932420
1,3541: Np242
0,3542: 932421
1,3542: Np242m1
0,3543: 932430
1,3543: Np243
0,3544: 932440
1,3544: Np244
0,3545: 942280
1,3545: Pu228
0,3546: 942290
1

In [7]:
from pycaret.anomaly import *

exp_ano = setup(nuc_data, 
                normalize = True, 
                normalize_method='robust',
                ignore_features = unnecessary_columns,
                numeric_features = numeric_columns,
                session_id = 123)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(3821, 654)"
2,Missing Values,False
3,Numeric Features,652
4,Categorical Features,0
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(3821, 652)"
9,CPU Jobs,-1


# Create a Model

In [31]:
iforest = create_model('iforest', n_jobs=-1, fraction=0.005, n_estimators=256)

In [22]:
iforest = create_model('iforest', n_jobs=-1, n_estimators=256)

In [32]:
iforest

IForest(behaviour='new', bootstrap=False, contamination=0.005,
    max_features=1.0, max_samples='auto', n_estimators=256, n_jobs=-1,
    random_state=123, verbose=0)

# Assign a Model

In [33]:
iforest_results = assign_model(iforest)

In [34]:
iforest_results.head()

Unnamed: 0,nuc_ix,name,mu_day_UO2Flux_CRAM_1ton_100steps_first_step,mu_day_UO2Flux_CRAM_1ton_100steps_last_step,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_1,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_2,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_3,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_4,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_5,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_6,...,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_42,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_43,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_44,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_45,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_46,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_47,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_48,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_49,Anomaly,Anomaly_Score
0,10010,H1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.437195
1,10020,H2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.437195
2,10030,H3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.437195
3,10040,H4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.437195
4,10050,H5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-0.437195


In [35]:
iforest_results[iforest_results.Anomaly == 1]

Unnamed: 0,nuc_ix,name,mu_day_UO2Flux_CRAM_1ton_100steps_first_step,mu_day_UO2Flux_CRAM_1ton_100steps_last_step,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_1,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_2,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_3,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_4,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_5,mu_day_UO2Flux_CRAM_1ton_100steps_middle_step_6,...,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_42,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_43,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_44,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_45,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_46,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_47,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_48,origin_year_UO2Flux_CRAM_1ton_50steps_middle_step_49,Anomaly,Anomaly_Score
1043,400940,Zr94,0,1,0.004894373873,0.009877568701,0.0148585005,0.01983825625,0.02481761983,0.02979715683,...,11.21862845,11.42091261,11.62240784,11.8231719,12.02325748,12.22271254,12.42158076,12.61990184,1,0.001118
1045,400960,Zr96,0,1,0.004982738235,0.009960423999,0.01493637385,0.01991187069,0.0248877903,0.02986476685,...,12.235105,12.4625765,12.68910406,12.91474517,13.13955216,13.36357265,13.58684984,13.80942296,1,0.005589
1127,420970,Mo97,0,0,0.001579656272,0.005116897465,0.009408832972,0.01397880626,0.01865222081,0.02336493696,...,12.54206897,12.7786382,13.01411139,13.24854432,13.48198783,13.71448815,13.94608729,14.17682338,1,0.004196
1128,420980,Mo98,0,0,0.004535726552,0.009070853526,0.01360539083,0.01814084092,0.02267823949,0.02721833757,...,13.41409182,13.68757743,13.96044816,14.23275401,14.50454027,14.77584796,15.04671411,15.31717208,1,0.006436
1130,421000,Mo100,0,1,0.004987680677,0.009971527322,0.01495519383,0.01994038101,0.02492826231,0.02991968798,...,15.19630024,15.50347924,15.80969216,16.1149953,16.4194397,16.72307157,17.02593262,17.32806041,1,0.017096
1208,441010,Ru101,0,0,0.004025410432,0.008173423398,0.01232113252,0.01647002759,0.02062120375,0.02477544935,...,12.13625132,12.35712199,12.57625122,12.79369753,13.00951462,13.22375174,13.436454,13.64766269,1,0.002184
1209,441020,Ru102,0,0,0.003477599034,0.006991616547,0.01050678699,0.01402467186,0.01754637328,0.02107268785,...,14.92078752,15.27945666,15.63866103,15.99842798,16.35878142,16.71974205,17.08132761,17.4435531,1,0.008682
1738,541320,Xe132,0,0,0.0002851572014,0.001169649057,0.002555890872,0.004345476059,0.006460577653,0.008838676779,...,16.2733426,16.67003596,17.06630689,17.46215666,17.85758602,18.25259514,18.64718367,19.04135069,1,0.01116
1742,541340,Xe134,0,1,0.00550872156,0.01170284235,0.01790321794,0.02411181056,0.03032968026,0.03655722092,...,18.29867219,18.66598523,19.03220322,19.39739826,19.76163572,20.12497476,20.48746882,20.84916606,1,0.019826
1746,541360,Xe136,0,1,0.006850681919,0.01530065561,0.02392396522,0.03256246927,0.0412066461,0.04985728453,...,27.58255943,28.15187723,28.7196363,29.28593345,29.85085633,30.41448411,30.97688816,31.53813263,1,0.025771


In [36]:
result_list = list(iforest_results[iforest_results.Anomaly == 1]['name'])
result_list

['Zr94',
 'Zr96',
 'Mo97',
 'Mo98',
 'Mo100',
 'Ru101',
 'Ru102',
 'Xe132',
 'Xe134',
 'Xe136',
 'Cs137',
 'Ba138',
 'La139',
 'Ce140',
 'Ce142',
 'Nd144',
 'U235',
 'U236',
 'U238',
 'Pu239']

# Plot a Model

In [37]:
plot_model(iforest)

In [38]:
plot_model(iforest, plot = 'umap')

# Saving the Model

In [29]:
save_model(iforest,'nuc_all_steps_isotope_iforest_flat_0_005_model')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['nuc_ix', 'name'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=['001_first_step',
                                                           '001_last_step',
                                                           '001_middle_step_1',
                                                           '001_middle_step_2',
                                                           '001_middle_step_3',
                                                           '001_middle_step_4',
                                                           '001_middle_step_5',
                                                           '001_middle_step_6',
                                       

# Predict on Unseen Data

In [None]:
predictions = predict_model(iforest, data=nuc_data)

In [None]:
predictions.head()

In [None]:
predictions[predictions.Anomaly == 1]

# Loading the Saved Model

In [None]:
from pycaret.anomaly import *
saved_iforest = load_model('nuc_all_steps_isotope_model')

In [None]:
new_prediction = predict_model(saved_iforest, data=nuc_data)

In [None]:
new_prediction.head()

In [None]:
new_prediction[new_prediction.Label == 1]