In [1]:
# !pip install tqdm

In [2]:
# !pip install pycaret

In [3]:
# !pip install pycaret[all]

In [4]:
import pandas as pd
import numpy as np
import pickle

from tqdm import tqdm
tqdm.pandas()

import os
from pprint import pprint
import re
from collections import defaultdict

In [5]:
DATA_ADDRESS = "./data"
os.listdir(DATA_ADDRESS)

['.gitkeep',
 'correlation_df_mean_no_clean.csv',
 'correlation_df_std_no_clean.csv',
 'df_5s.pkl',
 'df_5s_source.pkl',
 'df_merged.pkl',
 'df_merged_pivot.xlsx',
 'df_processed_descriptive_statistics.pickle',
 'df_processed_descriptive_statistics.pkl',
 'df_processed_descriptive_statistics_mel40_mfcc20.pkl',
 'df_processed_simple.pkl',
 'df_processed_stat_no_trim_cleaned_fft2048_mel128_mfcc20.pkl',
 'df_processed_stat_no_trim_cleaned_fft2048_mel40_mfcc17.pkl',
 'df_processed_stat_no_trim_cleaned_fft512_mel64_mfcc17.pkl',
 'df_processed_stat_no_trim_no_clean_fft2048_mel128_mfcc20.pkl',
 'df_processed_stat_no_trim_no_clean_fft512_mel128_mfcc17.pkl',
 'preprocessed',
 'spearman_correlation_df_mean_cleaned.csv',
 'spearman_correlation_df_mean_no_clean.csv',
 'spearman_correlation_df_std_cleaned.csv',
 'spearman_correlation_df_std_no_clean.csv',
 'VOiCES_devkit']

In [6]:
TRAIN_DIR = os.path.join(DATA_ADDRESS,'preprocessed','train')
os.listdir(TRAIN_DIR)

['mel_len15_fft2048_mels128_mfcc17_0-499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1000-1499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10000-10499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10500-10999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11000-11499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11500-11999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12000-12499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12500-12799.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1500-1999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2000-2499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2500-2999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3000-3499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3500-3999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4000-4499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4500-4999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_500-999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5000-5499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5500-5999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_6000-6499.pkl',
 'mel_len15_fft2048_mels1

## Load data

In [7]:
def load_pickle(filename):
    with open(filename, 'rb') as f:
        obj = pickle.load(f)
    return obj
def load_train(train_files):
    ls_X_trian = []
    for k,v in tqdm(train_files.items()):  
        train_file = v['file_name']
        file_dir = os.path.join(TRAIN_DIR, train_file)
        ls_X_trian.append(load_pickle(file_dir))
    return np.concatenate(ls_X_trian, axis=0)

### X_train

In [8]:

file_prefix = 'mfcc_len5_fft2048_mels128_mfcc17_'

# List all files in the directory
files = os.listdir(TRAIN_DIR)
pprint(files)



['mel_len15_fft2048_mels128_mfcc17_0-499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1000-1499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10000-10499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_10500-10999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11000-11499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_11500-11999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12000-12499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_12500-12799.pkl',
 'mel_len15_fft2048_mels128_mfcc17_1500-1999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2000-2499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_2500-2999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3000-3499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_3500-3999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4000-4499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_4500-4999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_500-999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5000-5499.pkl',
 'mel_len15_fft2048_mels128_mfcc17_5500-5999.pkl',
 'mel_len15_fft2048_mels128_mfcc17_6000-6499.pkl',
 'mel_len15_fft2048_mels1

In [9]:
# Filter files that start with the specified prefix
train_files_mfcc = [file for file in files if file.startswith(file_prefix)]
pprint(train_files_mfcc)

['mfcc_len5_fft2048_mels128_mfcc17_0-499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_1000-1499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_10000-10499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_10500-10999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_11000-11499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_11500-11999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_12000-12499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_12500-12999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_13000-13499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_13500-13999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_14000-14499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_14500-14999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_1500-1999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_15000-15499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_15500-15999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_16000-16499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_16500-16999.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_17000-17499.pkl',
 'mfcc_len5_fft2048_mels128_mfcc17_17500-17999.pkl',
 'm

In [10]:
# Regular expression to match two numbers at the end of the file name
pattern = re.compile(r'_(\d+)-(\d+)\.pkl$')
n_interval = 500

# Extract the numbers
dir_files = defaultdict(dict)
for file_name in train_files_mfcc:
    match = pattern.search(file_name)
    n1 = int(match.group(1))
    n2 = int(match.group(2))
    file_num = n1/n_interval
    dir_files[file_num] = {
        'begin':n1,
        'end':n2,
        'file_name' : file_name,
    }
sorted_dir_files = {k: dir_files[k] for k in sorted(dir_files)}
pprint(sorted_dir_files)

{0.0: {'begin': 0,
       'end': 499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_0-499.pkl'},
 1.0: {'begin': 500,
       'end': 999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_500-999.pkl'},
 2.0: {'begin': 1000,
       'end': 1499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_1000-1499.pkl'},
 3.0: {'begin': 1500,
       'end': 1999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_1500-1999.pkl'},
 4.0: {'begin': 2000,
       'end': 2499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_2000-2499.pkl'},
 5.0: {'begin': 2500,
       'end': 2999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_2500-2999.pkl'},
 6.0: {'begin': 3000,
       'end': 3499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_3000-3499.pkl'},
 7.0: {'begin': 3500,
       'end': 3999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_3500-3999.pkl'},
 8.0: {'begin': 4000,
       'end': 4499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_4000-4499.pk

In [11]:
partial_dict = {k: v for k, v in sorted_dir_files.items() if 0 <= k <= 10}
pprint(partial_dict)

{0.0: {'begin': 0,
       'end': 499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_0-499.pkl'},
 1.0: {'begin': 500,
       'end': 999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_500-999.pkl'},
 2.0: {'begin': 1000,
       'end': 1499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_1000-1499.pkl'},
 3.0: {'begin': 1500,
       'end': 1999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_1500-1999.pkl'},
 4.0: {'begin': 2000,
       'end': 2499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_2000-2499.pkl'},
 5.0: {'begin': 2500,
       'end': 2999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_2500-2999.pkl'},
 6.0: {'begin': 3000,
       'end': 3499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_3000-3499.pkl'},
 7.0: {'begin': 3500,
       'end': 3999,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_3500-3999.pkl'},
 8.0: {'begin': 4000,
       'end': 4499,
       'file_name': 'mfcc_len5_fft2048_mels128_mfcc17_4000-4499.pk

In [12]:
X = load_train(partial_dict)
X.shape

100%|██████████| 11/11 [00:00<00:00, 94.18it/s]


(5500, 17, 216)

### y_train

In [13]:
df_raw = pd.read_pickle(os.path.join(DATA_ADDRESS,'df_5s.pkl'))
df_raw

Unnamed: 0,origin_folder,speaker,distractor,room,category,filename
0,distant-16k/speech/test/rm2/musi/sp6643,6643,musi,rm2,test,5seconds-16k/speech/test/rm2/musi/sp6643/Lab41...
1,distant-16k/speech/test/rm2/musi/sp6643,6643,musi,rm2,test,5seconds-16k/speech/test/rm2/musi/sp6643/Lab41...
2,distant-16k/speech/test/rm2/musi/sp6643,6643,musi,rm2,test,5seconds-16k/speech/test/rm2/musi/sp6643/Lab41...
3,distant-16k/speech/test/rm2/musi/sp6643,6643,musi,rm2,test,5seconds-16k/speech/test/rm2/musi/sp6643/Lab41...
4,distant-16k/speech/test/rm2/musi/sp6643,6643,musi,rm2,test,5seconds-16k/speech/test/rm2/musi/sp6643/Lab41...
...,...,...,...,...,...,...
74459,distant-16k/speech/test/rm3/tele/sp0166,0166,tele,rm3,test,5seconds-16k/speech/test/rm3/tele/sp0166/Lab41...
74460,distant-16k/speech/test/rm3/tele/sp0166,0166,tele,rm3,test,5seconds-16k/speech/test/rm3/tele/sp0166/Lab41...
74461,distant-16k/speech/test/rm3/tele/sp0166,0166,tele,rm3,test,5seconds-16k/speech/test/rm3/tele/sp0166/Lab41...
74462,distant-16k/speech/test/rm3/tele/sp0166,0166,tele,rm3,test,5seconds-16k/speech/test/rm3/tele/sp0166/Lab41...


In [14]:
y = np.array(df_raw[df_raw['category']=='train']['speaker'][:5500])
print(y.shape)
y

(5500,)


array(['1961', '1961', '1961', ..., '6319', '6319', '6319'], dtype=object)

In [16]:
len(set(y))

166

## Modeling

### pycaret

In [15]:
# Flatten X
X = X.reshape(X.shape[0], -1)
X.shape

(5500, 3672)

In [16]:
# Convert the flattened_array to a pandas DataFrame
X_df = pd.DataFrame(X)

# Convert the 1D array y to a pandas Series
y_series = pd.Series(y, name='target')

# Concatenate the features and target into a single DataFrame
data = pd.concat([X_df, y_series], axis=1)

print(data.shape)
# Check the first few rows of the DataFrame
print(data.head())

(5500, 3673)
           0          1         2          3          4          5          6  \
0 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
1 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
2 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
3 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
4 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   

           7          8          9  ...      3663      3664      3665  \
0 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
1 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
2 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
3 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
4 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   

       3666      3667      3668      3669      3670      3671

In [17]:
from pycaret.classification import *
# Initialize the setup

In [18]:
clf_setup = setup(data=data, target='target', session_id=123, use_gpu = True)


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Multiclass
3,Target mapping,"0032: 0, 0083: 1, 0093: 2, 0112: 3, 0122: 4, 0150: 5, 0159: 6, 0174: 7, 0188: 8, 0196: 9, 0198: 10, 0205: 11, 0208: 12, 0209: 13, 0224: 14, 0226: 15, 0242: 16, 0250: 17, 0254: 18, 0296: 19, 0472: 20, 0479: 21, 0480: 22, 0492: 23, 0510: 24, 0597: 25, 0636: 26, 0637: 27, 0652: 28, 0770: 29, 0868: 30, 0882: 31, 0887: 32, 0948: 33, 0949: 34, 1052: 35, 1066: 36, 1112: 37, 1116: 38, 1121: 39, 1182: 40, 1212: 41, 1235: 42, 1246: 43, 1259: 44, 1271: 45, 1272: 46, 1335: 47, 1392: 48, 1417: 49, 1472: 50, 1536: 51, 1607: 52, 1737: 53, 1841: 54, 1851: 55, 1874: 56, 1926: 57, 1961: 58, 1963: 59, 1970: 60, 2012: 61, 2060: 62, 2074: 63, 2149: 64, 2156: 65, 2162: 66, 2269: 67, 2285: 68, 2289: 69, 2294: 70, 2481: 71, 2573: 72, 2758: 73, 2764: 74, 2803: 75, 2911: 76, 3235: 77, 3368: 78, 3446: 79, 3483: 80, 3521: 81, 3645: 82, 3835: 83, 3923: 84, 3972: 85, 3994: 86, 4010: 87, 4014: 88, 4057: 89, 4116: 90, 4145: 91, 4160: 92, 4331: 93, 4427: 94, 4438: 95, 4441: 96, 4535: 97, 4586: 98, 4590: 99, 4744: 100, 4839: 101, 4848: 102, 4859: 103, 4957: 104, 5126: 105, 5154: 106, 5157: 107, 5189: 108, 5400: 109, 5401: 110, 5635: 111, 5678: 112, 5717: 113, 5740: 114, 5789: 115, 5802: 116, 5868: 117, 5935: 118, 5968: 119, 6099: 120, 6147: 121, 6241: 122, 6319: 123, 6385: 124, 6395: 125, 6454: 126, 6519: 127, 6544: 128, 6574: 129, 6696: 130, 6788: 131, 6895: 132, 6965: 133, 7000: 134, 7095: 135, 7247: 136, 7264: 137, 7276: 138, 7278: 139, 7445: 140, 7498: 141, 7517: 142, 7688: 143, 7704: 144, 7850: 145, 7867: 146, 7868: 147, 7881: 148, 7932: 149, 7976: 150, 7981: 151, 7995: 152, 8051: 153, 8057: 154, 8108: 155, 8118: 156, 8152: 157, 8222: 158, 8225: 159, 8266: 160, 8425: 161, 8575: 162, 8605: 163, 8677: 164, 8713: 165"
4,Original data shape,"(5500, 3673)"
5,Transformed data shape,"(5500, 3673)"
6,Transformed train set shape,"(3849, 3673)"
7,Transformed test set shape,"(1651, 3673)"
8,Numeric features,3672
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

In [19]:
# This function trains and evaluates different models using cross-validation and ranks them
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,1.0,0.7,1.0,1.0,1.0,1.0,1.0,103.552
knn,K Neighbors Classifier,1.0,0.7,1.0,1.0,1.0,1.0,1.0,0.87
dt,Decision Tree Classifier,1.0,0.7,1.0,1.0,1.0,1.0,1.0,10.313
ridge,Ridge Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,3.073
rf,Random Forest Classifier,1.0,0.7,1.0,1.0,1.0,1.0,1.0,1.658
gbc,Gradient Boosting Classifier,1.0,0.7,1.0,1.0,1.0,1.0,1.0,3169.914
et,Extra Trees Classifier,1.0,0.7,1.0,1.0,1.0,1.0,1.0,0.675
xgboost,Extreme Gradient Boosting,1.0,0.7,1.0,1.0,1.0,1.0,1.0,51.152
catboost,CatBoost Classifier,1.0,0.7,1.0,1.0,1.0,1.0,1.0,517.285
lda,Linear Discriminant Analysis,0.9984,0.7,0.9984,0.9993,0.9986,0.9984,0.9984,11.233


In [20]:
print("Done")

Done


In [22]:
X = load_train(sorted_dir_files)
print(X.shape)
X = X.reshape(X.shape[0], -1)
print(X.shape)
y = np.array(df_raw[df_raw['category']=='train']['speaker'])
print(y.shape)
# Convert the flattened_array to a pandas DataFrame
X_df = pd.DataFrame(X)

# Convert the 1D array y to a pandas Series
y_series = pd.Series(y, name='target')

# Concatenate the features and target into a single DataFrame
data = pd.concat([X_df, y_series], axis=1)

print(data.shape)
# Check the first few rows of the DataFrame
print(data.head())
clf_setup = setup(data=data, target='target', session_id=456, use_gpu = True)


100%|██████████| 100/100 [00:00<00:00, 461.68it/s]


(49664, 17, 216)
(49664, 3672)
(49664,)
(49664, 3673)
           0          1         2          3          4          5          6  \
0 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
1 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
2 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
3 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   
4 -54.393974 -52.948799 -54.70047 -54.836155 -55.810272 -56.069416 -56.024067   

           7          8          9  ...      3663      3664      3665  \
0 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
1 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
2 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
3 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   
4 -56.485073 -56.541992 -56.983311  ...  0.023779  0.023779  0.023779   

       3666      366

Unnamed: 0,Description,Value
0,Session id,456
1,Target,target
2,Target type,Multiclass
3,Target mapping,"0032: 0, 0083: 1, 0093: 2, 0112: 3, 0122: 4, 0150: 5, 0159: 6, 0174: 7, 0188: 8, 0196: 9, 0198: 10, 0204: 11, 0205: 12, 0208: 13, 0209: 14, 0224: 15, 0226: 16, 0240: 17, 0242: 18, 0248: 19, 0250: 20, 0254: 21, 0288: 22, 0296: 23, 0307: 24, 0403: 25, 0459: 26, 0472: 27, 0479: 28, 0480: 29, 0492: 30, 0510: 31, 0597: 32, 0636: 33, 0637: 34, 0652: 35, 0770: 36, 0868: 37, 0882: 38, 0887: 39, 0948: 40, 0949: 41, 1050: 42, 1052: 43, 1066: 44, 1112: 45, 1116: 46, 1121: 47, 1160: 48, 1182: 49, 1212: 50, 1235: 51, 1246: 52, 1259: 53, 1271: 54, 1272: 55, 1335: 56, 1383: 57, 1392: 58, 1417: 59, 1425: 60, 1472: 61, 1536: 62, 1607: 63, 1737: 64, 1841: 65, 1851: 66, 1867: 67, 1874: 68, 1926: 69, 1961: 70, 1963: 71, 1970: 72, 2012: 73, 2060: 74, 2074: 75, 2093: 76, 2110: 77, 2149: 78, 2156: 79, 2162: 80, 2269: 81, 2285: 82, 2289: 83, 2294: 84, 2412: 85, 2481: 86, 2532: 87, 2573: 88, 2673: 89, 2691: 90, 2758: 91, 2764: 92, 2803: 93, 2911: 94, 3235: 95, 3368: 96, 3446: 97, 3483: 98, 3521: 99, 3549: 100, 3645: 101, 3835: 102, 3923: 103, 3972: 104, 3989: 105, 3994: 106, 4010: 107, 4014: 108, 4057: 109, 4064: 110, 4110: 111, 4116: 112, 4145: 113, 4160: 114, 4331: 115, 4427: 116, 4438: 117, 4441: 118, 4535: 119, 4586: 120, 4590: 121, 4744: 122, 4839: 123, 4848: 124, 4859: 125, 4957: 126, 4967: 127, 5126: 128, 5154: 129, 5157: 130, 5189: 131, 5319: 132, 5338: 133, 5386: 134, 5400: 135, 5401: 136, 5456: 137, 5583: 138, 5635: 139, 5678: 140, 5717: 141, 5740: 142, 5789: 143, 5802: 144, 5868: 145, 5935: 146, 5968: 147, 6099: 148, 6147: 149, 6241: 150, 6319: 151, 6385: 152, 6395: 153, 6415: 154, 6454: 155, 6519: 156, 6544: 157, 6574: 158, 6696: 159, 6788: 160, 6848: 161, 6895: 162, 6965: 163, 7000: 164, 7095: 165, 7148: 166, 7247: 167, 7264: 168, 7276: 169, 7278: 170, 7445: 171, 7498: 172, 7517: 173, 7540: 174, 7688: 175, 7704: 176, 7850: 177, 7867: 178, 7868: 179, 7881: 180, 7910: 181, 7932: 182, 7976: 183, 7981: 184, 7995: 185, 8051: 186, 8057: 187, 8108: 188, 8118: 189, 8152: 190, 8222: 191, 8225: 192, 8266: 193, 8425: 194, 8575: 195, 8605: 196, 8635: 197, 8677: 198, 8713: 199"
4,Original data shape,"(49664, 3673)"
5,Transformed data shape,"(49664, 3673)"
6,Transformed train set shape,"(34764, 3673)"
7,Transformed test set shape,"(14900, 3673)"
8,Numeric features,3672
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

In [23]:
# This function trains and evaluates different models using cross-validation and ranks them
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,1.0,675.012
knn,K Neighbors Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.826
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,26.578
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.177
qda,Quadratic Discriminant Analysis,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.932
ridge,Ridge Classifier,0.9991,0.0,0.9991,0.9992,0.9991,0.9991,0.9991,27.662
svm,SVM - Linear Kernel,0.9789,0.0,0.9789,0.9792,0.9756,0.9788,0.9789,47.151
nb,Naive Bayes,0.9604,0.9831,0.9604,0.9637,0.9529,0.9602,0.9604,34.117
ada,Ada Boost Classifier,0.0269,0.7553,0.0269,0.0173,0.0174,0.022,0.0305,124.276


Processing:   0%|          | 0/69 [00:00<?, ?it/s]