In [1]:
# !pip install tqdm

In [2]:
# !pip install pycaret

In [3]:
# !pip install pycaret[all]

In [4]:
import pandas as pd
import numpy as np
import pickle

from tqdm import tqdm
tqdm.pandas()

import os
from pprint import pprint
import re
from collections import defaultdict

In [5]:
# Record the wall-clock time at which the run starts; later cells print further
# timestamps so the duration of the model comparison can be read from the outputs.
from datetime import datetime

print("Begining Time:\n", datetime.now())

Begining Time:
 2024-02-10 13:11:00.273332


In [6]:
# Root folder of the input data, relative to the notebook's working directory.
DATA_ADDRESS = "./data"
# Folder in which the trained models and the experiment snapshot are saved at the end.
MODEL_DIR = "./model/test_source"
# Show what is available in the data folder.
os.listdir(DATA_ADDRESS)

['.gitkeep',
 'correlation_df_mean_no_clean.csv',
 'correlation_df_std_no_clean.csv',
 'df_5s.pkl',
 'df_5s_source.pkl',
 'df_merged.pkl',
 'df_merged_pivot.xlsx',
 'df_processed_descriptive_statistics.pickle',
 'df_processed_descriptive_statistics.pkl',
 'df_processed_descriptive_statistics_mel40_mfcc20.pkl',
 'df_processed_simple.pkl',
 'df_processed_stat_no_trim_cleaned_fft2048_mel128_mfcc20.pkl',
 'df_processed_stat_no_trim_cleaned_fft2048_mel40_mfcc17.pkl',
 'df_processed_stat_no_trim_cleaned_fft512_mel64_mfcc17.pkl',
 'df_processed_stat_no_trim_no_clean_fft2048_mel128_mfcc20.pkl',
 'df_processed_stat_no_trim_no_clean_fft512_mel128_mfcc17.pkl',
 'preprocessed',
 'spearman_correlation_df_mean_cleaned.csv',
 'spearman_correlation_df_mean_no_clean.csv',
 'spearman_correlation_df_std_cleaned.csv',
 'spearman_correlation_df_std_no_clean.csv',
 'VOiCES_devkit']

In [7]:
# Folder containing the preprocessed training chunks (.pkl files) read by load_train.
TRAIN_DIR = os.path.join(DATA_ADDRESS,'preprocessed','train')
os.listdir(TRAIN_DIR)

['mel_len15_fft2048_mels128_mfcc17_0-499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1000-1499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10000-10499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10500-10999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11000-11499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11500-11999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12000-12499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12500-12799.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1500-1999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2000-2499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2500-2999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3000-3499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3500-3999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4000-4499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4500-4999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_500-999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5000-5499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5500-5999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_6000-6499.pkl',
 'mel_len15_fft2048_mels1

## Load data

In [8]:
def load_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)
def load_train(train_files, train_dir=None):
    """Load the training chunks listed in ``train_files`` and stack them.

    Parameters
    ----------
    train_files : dict
        Mapping whose values are dicts containing a ``'file_name'`` entry.
        Chunks are loaded in the mapping's iteration order, so pass an
        already-sorted mapping to get the rows in chunk order.
    train_dir : str, optional
        Directory holding the chunk files. Defaults to the module-level
        ``TRAIN_DIR`` so existing one-argument calls behave as before.

    Returns
    -------
    numpy.ndarray
        All loaded chunks concatenated along axis 0.

    Raises
    ------
    ValueError
        If ``train_files`` is empty (there is nothing to concatenate).
    """
    if train_dir is None:
        train_dir = TRAIN_DIR
    # Only the values are needed; the keys merely define the ordering.
    chunks = [
        load_pickle(os.path.join(train_dir, entry['file_name']))
        for entry in tqdm(train_files.values())
    ]
    if not chunks:
        # np.concatenate would fail with a less helpful message on an empty list.
        raise ValueError("load_train: no training files were provided")
    return np.concatenate(chunks, axis=0)

### X_train

In [9]:

# Only chunk files whose name starts with this prefix are used as training
# features (the filtering happens in the next cell).
file_prefix = 'mfcc_len5_fft2048_mels128_mfcc17_'

# List all files in the directory
files = os.listdir(TRAIN_DIR)
pprint(files)



['mel_len15_fft2048_mels128_mfcc17_0-499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1000-1499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10000-10499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10500-10999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11000-11499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11500-11999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12000-12499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12500-12799.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1500-1999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2000-2499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2500-2999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3000-3499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3500-3999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4000-4499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4500-4999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_500-999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5000-5499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5500-5999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_6000-6499.pkl',
 'mel_len15_fft2048_mels1

In [10]:
# Keep only the chunk files whose name begins with `file_prefix`
train_files_mfcc = [name for name in files if name.startswith(file_prefix)]
pprint(train_files_mfcc)

['mfcc_len5_fft2048_mels128_mfcc17_0-499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_1000-1499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_10000-10499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_10500-10999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_11000-11499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_11500-11999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_12000-12499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_12500-12999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_13000-13499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_13500-13999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_14000-14499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_14500-14999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_1500-1999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_15000-15499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_15500-15999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_16000-16499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_16500-16999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_17000-17499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_17500-17999.pkl',
 'm

In [11]:
# Regular expression capturing the "<begin>-<end>" index range at the end of a chunk file name
pattern = re.compile(r'_(\d+)-(\d+)\.pkl$')
# Width of the index range covered by each chunk file (0-499, 500-999, ...)
n_interval = 500

# Index the chunk files by their position in the sequence (begin // n_interval)
dir_files = defaultdict(dict)
for file_name in train_files_mfcc:
    match = pattern.search(file_name)
    if match is None:
        # Not a "<begin>-<end>.pkl" chunk: skip it instead of crashing on match.group()
        print(f"Skipping file without an index range in its name: {file_name}")
        continue
    n1, n2 = int(match.group(1)), int(match.group(2))
    # Integer division keeps the chunk index an int (0, 1, 2, ...) instead of a float key
    file_num = n1 // n_interval
    if file_num in dir_files:
        # Two files mapping to the same index would silently drop one chunk
        raise ValueError(
            f"Chunk index {file_num} is claimed by both "
            f"{dir_files[file_num]['file_name']!r} and {file_name!r}"
        )
    dir_files[file_num] = {
        'begin':n1,
        'end':n2,
        'file_name' : file_name,
    }
# Re-key in ascending chunk order so that load_train reads the chunks sequentially
sorted_dir_files = {k: dir_files[k] for k in sorted(dir_files)}
pprint(sorted_dir_files)

{0.0: {'begin': 0,
       'end': 499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_0-499.pkl'},
 1.0: {'begin': 500,
       'end': 999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_500-999.pkl'},
 2.0: {'begin': 1000,
       'end': 1499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_1000-1499.pkl'},
 3.0: {'begin': 1500,
       'end': 1999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_1500-1999.pkl'},
 4.0: {'begin': 2000,
       'end': 2499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_2000-2499.pkl'},
 5.0: {'begin': 2500,
       'end': 2999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_2500-2999.pkl'},
 6.0: {'begin': 3000,
       'end': 3499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_3000-3499.pkl'},
 7.0: {'begin': 3500,
       'end': 3999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_3500-3999.pkl'},
 8.0: {'begin': 4000,
       'end': 4499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_4000-4499.pk

In [12]:
# Load every listed chunk, in the order of sorted_dir_files, and stack them along axis 0.
X = load_train(sorted_dir_files)
X.shape

100%|██████████| 100/100 [00:00<00:00, 404.64it/s]


(49664, 17, 216)

### y_train

In [13]:
# Load the metadata table; its 'category' and 'speaker' columns are used below to build the labels.
df_raw = pd.read_pickle(os.path.join(DATA_ADDRESS,'df_5s.pkl'))
df_raw

Unnamed: 0,origin_folder,speaker,distractor,room,category,filename
0,distant-16k/speech/test/rm3/musi/sp0192,0192,musi,rm3,test,5seconds-16k/speech/test/rm3/musi/sp0192/Lab41...
1,distant-16k/speech/test/rm3/musi/sp0192,0192,musi,rm3,test,5seconds-16k/speech/test/rm3/musi/sp0192/Lab41...
2,distant-16k/speech/test/rm3/musi/sp0192,0192,musi,rm3,test,5seconds-16k/speech/test/rm3/musi/sp0192/Lab41...
3,distant-16k/speech/test/rm3/musi/sp0192,0192,musi,rm3,test,5seconds-16k/speech/test/rm3/musi/sp0192/Lab41...
4,distant-16k/speech/test/rm3/musi/sp0192,0192,musi,rm3,test,5seconds-16k/speech/test/rm3/musi/sp0192/Lab41...
...,...,...,...,...,...,...
74459,distant-16k/speech/test/rm2/babb/sp2137,2137,babb,rm2,test,5seconds-16k/speech/test/rm2/babb/sp2137/Lab41...
74460,distant-16k/speech/test/rm2/babb/sp2137,2137,babb,rm2,test,5seconds-16k/speech/test/rm2/babb/sp2137/Lab41...
74461,distant-16k/speech/test/rm2/babb/sp2137,2137,babb,rm2,test,5seconds-16k/speech/test/rm2/babb/sp2137/Lab41...
74462,distant-16k/speech/test/rm2/babb/sp2137,2137,babb,rm2,test,5seconds-16k/speech/test/rm2/babb/sp2137/Lab41...


In [14]:
# Distinct speaker IDs among the rows whose category is 'train'.
df_raw[(df_raw['category'] == 'train')]['speaker'].unique()

array(['1121', '6848', '7095', '5789', '1050', '6788', '0032', '4331',
       '8152', '3235', '4116', '3994', '0208', '0492', '5338', '8222',
       '1851', '7445', '7540', '2289', '2060', '0240', '1867', '0083',
       '5740', '4848', '1259', '3521', '2764', '1116', '0224', '2074',
       '6319', '0403', '8108', '1963', '3368', '0480', '0250', '5802',
       '1182', '0652', '5157', '0296', '7276', '0472', '4160', '0868',
       '0887', '5678', '0196', '6395', '4441', '5583', '0254', '0188',
       '7704', '4057', '2803', '4859', '1066', '8425', '7867', '6147',
       '0637', '0198', '2269', '4590', '8266', '0949', '2149', '7264',
       '5935', '7868', '2162', '3645', '0204', '7000', '5968', '7688',
       '2110', '5635', '1961', '0209', '1737', '1160', '4586', '1607',
       '4967', '8057', '0248', '8635', '0205', '0948', '0770', '6099',
       '1383', '7517', '4535', '0288', '5400', '1472', '1392', '4839',
       '5456', '1052', '8051', '0174', '3835', '0307', '4110', '6696',
      

In [15]:
# Pick one speaker ID (position 9 in the unique list) to inspect in the next cell.
df_raw[(df_raw['category'] == 'train')]['speaker'].unique()[9]

'3235'

In [16]:
# All 'train' rows that belong to speaker '3235'.
df_raw[(df_raw['category'] == 'train')& (df_raw['speaker'] == '3235')]

Unnamed: 0,origin_folder,speaker,distractor,room,category,filename
280,distant-16k/speech/train/rm3/musi/sp3235,3235,musi,rm3,train,5seconds-16k/speech/train/rm3/musi/sp3235/Lab4...
281,distant-16k/speech/train/rm3/musi/sp3235,3235,musi,rm3,train,5seconds-16k/speech/train/rm3/musi/sp3235/Lab4...
282,distant-16k/speech/train/rm3/musi/sp3235,3235,musi,rm3,train,5seconds-16k/speech/train/rm3/musi/sp3235/Lab4...
283,distant-16k/speech/train/rm3/musi/sp3235,3235,musi,rm3,train,5seconds-16k/speech/train/rm3/musi/sp3235/Lab4...
284,distant-16k/speech/train/rm3/musi/sp3235,3235,musi,rm3,train,5seconds-16k/speech/train/rm3/musi/sp3235/Lab4...
...,...,...,...,...,...,...
72831,distant-16k/speech/train/rm4/none/sp3235,3235,none,rm4,train,5seconds-16k/speech/train/rm4/none/sp3235/Lab4...
72832,distant-16k/speech/train/rm4/none/sp3235,3235,none,rm4,train,5seconds-16k/speech/train/rm4/none/sp3235/Lab4...
72833,distant-16k/speech/train/rm4/none/sp3235,3235,none,rm4,train,5seconds-16k/speech/train/rm4/none/sp3235/Lab4...
72834,distant-16k/speech/train/rm4/none/sp3235,3235,none,rm4,train,5seconds-16k/speech/train/rm4/none/sp3235/Lab4...


In [17]:
# All rows whose category is 'train'.
df_raw[df_raw['category'] == 'train']

Unnamed: 0,origin_folder,speaker,distractor,room,category,filename
30,distant-16k/speech/train/rm4/musi/sp1121,1121,musi,rm4,train,5seconds-16k/speech/train/rm4/musi/sp1121/Lab4...
31,distant-16k/speech/train/rm4/musi/sp1121,1121,musi,rm4,train,5seconds-16k/speech/train/rm4/musi/sp1121/Lab4...
32,distant-16k/speech/train/rm4/musi/sp1121,1121,musi,rm4,train,5seconds-16k/speech/train/rm4/musi/sp1121/Lab4...
33,distant-16k/speech/train/rm4/musi/sp1121,1121,musi,rm4,train,5seconds-16k/speech/train/rm4/musi/sp1121/Lab4...
34,distant-16k/speech/train/rm4/musi/sp1121,1121,musi,rm4,train,5seconds-16k/speech/train/rm4/musi/sp1121/Lab4...
...,...,...,...,...,...,...
74443,distant-16k/speech/train/rm3/tele/sp2764,2764,tele,rm3,train,5seconds-16k/speech/train/rm3/tele/sp2764/Lab4...
74444,distant-16k/speech/train/rm3/tele/sp2764,2764,tele,rm3,train,5seconds-16k/speech/train/rm3/tele/sp2764/Lab4...
74445,distant-16k/speech/train/rm3/tele/sp2764,2764,tele,rm3,train,5seconds-16k/speech/train/rm3/tele/sp2764/Lab4...
74446,distant-16k/speech/train/rm3/tele/sp2764,2764,tele,rm3,train,5seconds-16k/speech/train/rm3/tele/sp2764/Lab4...


In [18]:
# Labels: the speaker ID of every 'train' row, in frame order, converted to float32.
# NOTE(review): assumes these rows are in the same order as the rows of X — confirm
# against the preprocessing step that wrote the chunk files.
y = np.array(df_raw.loc[df_raw['category'] == 'train', 'speaker']).astype('float32')
print(y.shape)
y

(49664,)


array([1121., 1121., 1121., ..., 2764., 2764., 2764.], dtype=float32)

In [19]:
# Keep only the first 1000 samples (and their labels) — presumably to keep the
# experiment below small; the later shapes and class counts refer to this subset.
X, y = X[:1000], y[:1000]

In [20]:
# Class balance of the current labels: number of distinct speakers, then samples per speaker.
unique, counts = np.unique(y, return_counts=True)
print(unique.size)
{label: n_samples for label, n_samples in zip(unique, counts)}


55


{32.0: 32,
 83.0: 16,
 196.0: 16,
 208.0: 28,
 224.0: 16,
 240.0: 16,
 250.0: 16,
 254.0: 16,
 296.0: 16,
 403.0: 16,
 472.0: 14,
 480.0: 16,
 492.0: 32,
 652.0: 20,
 868.0: 16,
 887.0: 16,
 1050.0: 14,
 1116.0: 16,
 1121.0: 16,
 1182.0: 16,
 1259.0: 24,
 1851.0: 16,
 1867.0: 16,
 1963.0: 32,
 2060.0: 16,
 2074.0: 16,
 2289.0: 32,
 2764.0: 16,
 3235.0: 16,
 3368.0: 28,
 3521.0: 16,
 3994.0: 16,
 4116.0: 16,
 4160.0: 16,
 4331.0: 16,
 4441.0: 14,
 4848.0: 16,
 5157.0: 16,
 5338.0: 20,
 5583.0: 16,
 5678.0: 14,
 5740.0: 32,
 5789.0: 16,
 5802.0: 16,
 6319.0: 12,
 6395.0: 16,
 6788.0: 32,
 6848.0: 16,
 7095.0: 14,
 7276.0: 16,
 7445.0: 16,
 7540.0: 16,
 8108.0: 14,
 8152.0: 16,
 8222.0: 16}

## Modeling

### pycaret

In [21]:
# Flatten each sample's feature matrix into a single feature vector (one row per sample)
X = X.reshape(len(X), -1)
X.shape

(1000, 3672)

In [22]:
# Features as a DataFrame: one row per sample, one column per flattened feature
X_df = pd.DataFrame(data=X)

# Labels as a Series named 'target'
y_series = pd.Series(data=y, name='target')

# Put the features and the label column side by side in one frame
data = pd.concat([X_df, y_series], axis='columns')

# Overall shape, then a preview of the first rows
print(data.shape)
print(data.head())

(1000, 3673)
           0          1          2          3          4          5  \
0 -80.977600 -80.419434 -79.915077 -79.821449 -79.740036 -79.881989   
1 -75.017464 -71.060478 -65.008240 -60.050758 -56.970104 -56.826725   
2 -83.417419 -83.293800 -83.408394 -83.713020 -83.887047 -83.888329   
3 -71.465614 -66.708954 -63.589741 -61.603786 -59.245041 -58.487385   
4 -55.037834 -53.793888 -54.851269 -57.155281 -59.371613 -61.002224   

           6          7          8          9  ...      3663      3664  \
0 -80.015327 -80.039238 -80.408401 -80.696159  ...  0.789637  0.596668   
1 -58.076405 -61.482307 -66.518311 -72.484703  ... -0.306891 -0.246717   
2 -83.877029 -83.771271 -83.767853 -83.794540  ... -0.791701  0.350824   
3 -57.167061 -53.935825 -51.339558 -49.302170  ...  0.023778  0.023779   
4 -61.387573 -63.297401 -64.819847 -66.793404  ... -1.076139 -0.573394   

       3665      3666      3667      3668      3669      3670      3671  \
0  0.574223 -0.277681 -1.604426 -2.58122

In [23]:
# Display the combined feature/target frame.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3663,3664,3665,3666,3667,3668,3669,3670,3671,target
0,-80.977600,-80.419434,-79.915077,-79.821449,-79.740036,-79.881989,-80.015327,-80.039238,-80.408401,-80.696159,...,0.789637,0.596668,0.574223,-0.277681,-1.604426,-2.581222,-2.807574,-1.728784,-0.533316,1121.0
1,-75.017464,-71.060478,-65.008240,-60.050758,-56.970104,-56.826725,-58.076405,-61.482307,-66.518311,-72.484703,...,-0.306891,-0.246717,-0.213964,0.116363,0.144263,0.032544,0.068943,0.166217,0.220161,1121.0
2,-83.417419,-83.293800,-83.408394,-83.713020,-83.887047,-83.888329,-83.877029,-83.771271,-83.767853,-83.794540,...,-0.791701,0.350824,0.193292,0.363183,0.191189,0.263955,-0.255743,-0.354151,-0.447814,1121.0
3,-71.465614,-66.708954,-63.589741,-61.603786,-59.245041,-58.487385,-57.167061,-53.935825,-51.339558,-49.302170,...,0.023778,0.023779,0.023779,0.023779,0.023779,0.023779,0.023779,0.023779,0.023779,1121.0
4,-55.037834,-53.793888,-54.851269,-57.155281,-59.371613,-61.002224,-61.387573,-63.297401,-64.819847,-66.793404,...,-1.076139,-0.573394,-0.198749,-0.536650,-0.596209,-0.691602,-1.221938,-1.176664,-1.081235,1121.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-74.594246,-74.317101,-71.101364,-69.452629,-70.257950,-70.016006,-66.025665,-62.365822,-60.433369,-60.344501,...,0.023778,0.023779,0.023779,0.023779,0.023779,0.023779,0.023779,0.023779,0.023779,1259.0
996,-82.490181,-81.890045,-81.866890,-81.974632,-82.603989,-83.235832,-83.405220,-83.286507,-83.094582,-83.006981,...,-1.069545,-1.047240,-1.074312,-0.726048,-0.739538,-0.749214,-0.619689,-0.620974,-0.459634,1259.0
997,-82.516220,-82.120750,-82.185616,-82.417892,-82.490334,-82.691315,-83.212234,-83.725624,-83.926682,-83.907372,...,0.332243,0.020480,0.281064,-0.106530,0.226446,0.457334,0.220347,0.316990,0.570863,1259.0
998,-83.437042,-83.340546,-83.571854,-83.952591,-83.587822,-83.017494,-82.381248,-82.303284,-82.389580,-82.361732,...,0.455865,-0.181465,-0.323267,-0.321774,-0.443202,-0.622815,-0.495281,0.000792,0.597754,1259.0


In [24]:
# Import only the PyCaret classification functions this notebook calls, rather
# than a wildcard import that hides where the names come from.
from pycaret.classification import (
    compare_models,
    evaluate_model,
    save_experiment,
    save_model,
    setup,
)
# The experiment itself is initialised in the next cell via setup().

In [25]:
# Configure the PyCaret classification experiment on `data`: predict the 'target'
# column with a fixed session seed, GPU use requested, system logging on, and
# 5 cross-validation folds.
clf_setup = setup(
    data=data,
    target='target',
    session_id=42,
    use_gpu=True,
    system_log=True,
    fold=5,
)


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

Unnamed: 0,Description,Value
0,Session id,42
1,Target,target
2,Target type,Multiclass
3,Target mapping,"32.0: 0, 83.0: 1, 196.0: 2, 208.0: 3, 224.0: 4, 240.0: 5, 250.0: 6, 254.0: 7, 296.0: 8, 403.0: 9, 472.0: 10, 480.0: 11, 492.0: 12, 652.0: 13, 868.0: 14, 887.0: 15, 1050.0: 16, 1116.0: 17, 1121.0: 18, 1182.0: 19, 1259.0: 20, 1851.0: 21, 1867.0: 22, 1963.0: 23, 2060.0: 24, 2074.0: 25, 2289.0: 26, 2764.0: 27, 3235.0: 28, 3368.0: 29, 3521.0: 30, 3994.0: 31, 4116.0: 32, 4160.0: 33, 4331.0: 34, 4441.0: 35, 4848.0: 36, 5157.0: 37, 5338.0: 38, 5583.0: 39, 5678.0: 40, 5740.0: 41, 5789.0: 42, 5802.0: 43, 6319.0: 44, 6395.0: 45, 6788.0: 46, 6848.0: 47, 7095.0: 48, 7276.0: 49, 7445.0: 50, 7540.0: 51, 8108.0: 52, 8152.0: 53, 8222.0: 54"
4,Original data shape,"(1000, 3673)"
5,Transformed data shape,"(1000, 3673)"
6,Transformed train set shape,"(700, 3673)"
7,Transformed test set shape,"(300, 3673)"
8,Numeric features,3672
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

In [26]:
# Timestamp taken right before the model comparison starts.
print("Model Start Time:\n", datetime.now())

Model Start Time:
 2024-02-10 13:11:10.061026


In [27]:
# This function trains and evaluates different models using cross-validation and ranks them
# n_select=10 asks for the 10 top-ranked models, so `best_model` is a list here
# (it is iterated and indexed in the cells below), not a single estimator.
best_model = compare_models(n_select = 10)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.6886,0.9255,0.6886,0.703,0.6644,0.6819,0.6836,21.624
et,Extra Trees Classifier,0.6571,0.8851,0.6571,0.6738,0.6237,0.6491,0.6521,0.416
catboost,CatBoost Classifier,0.6571,0.906,0.6571,0.6716,0.6273,0.6491,0.6523,244.232
ridge,Ridge Classifier,0.6271,0.0,0.6271,0.6454,0.597,0.6188,0.6217,0.188
rf,Random Forest Classifier,0.5929,0.8751,0.5929,0.5674,0.5452,0.5831,0.5867,1.376
lda,Linear Discriminant Analysis,0.5929,0.9103,0.5929,0.6132,0.5649,0.5839,0.5866,1.126
nb,Naive Bayes,0.4943,0.8037,0.4943,0.6294,0.5115,0.4828,0.4953,0.59
lightgbm,Light Gradient Boosting Machine,0.4843,0.8765,0.4843,0.468,0.4501,0.4731,0.4746,175.002
xgboost,Extreme Gradient Boosting,0.4186,0.8542,0.4186,0.3908,0.3837,0.406,0.4076,29.154
svm,SVM - Linear Kernel,0.3171,0.0,0.3171,0.2957,0.2614,0.3027,0.3143,0.39


In [28]:
# Timestamp taken right after the model comparison finished.
print("Model End Time:\n", datetime.now())

Model End Time:
 2024-02-10 18:26:24.535880


In [29]:
# Print each selected model, in the order returned by compare_models.
for model in best_model:
    print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=42, verbose=0, warm_start=False)
<catboost.core.CatBoostClassifier object at 0x00000177C556B5B0>
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, ran

In [30]:
# NOTE(review): this cell repeats the previous cell verbatim and can be removed.
for model in best_model:
    print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=42, verbose=0, warm_start=False)
<catboost.core.CatBoostClassifier object at 0x00000177C556B5B0>
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, ran

In [31]:
# Number of models returned by compare_models (n_select=10 was requested).
len(best_model)

10

In [32]:
# Open the interactive evaluation widget for the first model in the list.
evaluate_model(best_model[0])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [33]:
# Open the interactive evaluation widget for the second model in the list.
evaluate_model(best_model[1])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [34]:
# Open the interactive evaluation widget for the third model in the list.
evaluate_model(best_model[2])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [35]:
# Save every selected model under MODEL_DIR as test_source_<position in list>.
# Create the folder first: on a fresh checkout it may not exist, and failing here
# would lose the result of the long-running compare_models step.
os.makedirs(MODEL_DIR, exist_ok=True)
for i,model in enumerate(best_model):
    path = os.path.join(MODEL_DIR,f'test_source_{i}')
    save_model(model,path)


Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


In [36]:
# https://pycaret.gitbook.io/docs/get-started/functions/deploy#save_experiment
# Save the experiment under MODEL_DIR; the name carries the current Unix timestamp
# (whole seconds) so repeated runs do not overwrite each other.
path = os.path.join(MODEL_DIR,f'experiment_{int(datetime.now().timestamp())}')
save_experiment(path)

In [37]:
# Final timestamp marking the end of the notebook run.
print("Done:\n", datetime.now())

Done:
 2024-02-10 18:26:28.258652
