In [1]:
import os
import pandas as pd
from xgboost import XGBClassifier
from method import analyze_features
from data_preprocess import *
from cross_validate import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Project root containing the `law/` and `tz/` dataset folders.
# The historical absolute path stays the default; set XGBOOST_DIPLOMA_ROOT to
# run the notebook on another machine without editing this cell.
# os.path.join(..., '') keeps a trailing separator on overridden values too
# (analyze_features may concatenate paths by hand — TODO confirm).
root = os.path.join(
    os.environ.get('XGBOOST_DIPLOMA_ROOT', '/home/kirb/work/ISP-projects/xgboost_diploma/'),
    '',
)

In [3]:
# Unpack the five feature-name groups that analyze_features returns for the
# 'law' dataset. The meaning of the trailing 5 is defined in method.py —
# TODO confirm (not visible from this notebook).
(
    law_nan_features,
    law_zero_features,
    law_100_features,
    law_1000_features,
    law_10000_features,
) = analyze_features(root, 'law', 5)

100%|█████████▉| 1499/1500 [00:00<00:00, 11864.66it/s]
100%|█████████▉| 1499/1500 [00:00<00:00, 10568.71it/s]
100%|██████████| 1500/1500 [02:27<00:00, 10.19it/s]
100%|██████████| 1500/1500 [00:36<00:00, 41.04it/s]


In [5]:
# Display the first ("nan") group returned by analyze_features for 'law'.
law_nan_features

['current_regexp',
 'current_regexp_next_3',
 'current_regexp_prev_3',
 'endswith_colon_next_1',
 'endswith_comma',
 'endswith_comma_prev_1',
 'endswith_comma_prev_2',
 'roman_regexp_next_1',
 'roman_regexp_next_2',
 'roman_regexp_next_3',
 'start_regexp_0_subitem_next_1',
 'start_regexp_0_subitem_next_2',
 'start_regexp_0_subitem_next_3',
 'start_regexp_0_subitem_prev_1',
 'start_regexp_0_subitem_prev_2',
 'start_regexp_0_subitem_prev_3',
 'start_regexp_1_item',
 'start_regexp_1_item_next_1',
 'start_regexp_1_item_prev_1',
 'start_regexp_1_item_prev_3',
 'start_regexp_num_matches_named',
 'start_regexp_num_matches_named_next_1',
 'start_regexp_num_matches_named_next_2',
 'start_regexp_num_matches_named_next_3',
 'start_regexp_num_matches_named_prev_2',
 'start_regexp_num_matches_subitem',
 'start_regexp_num_matches_subitem_next_1',
 'start_regexp_num_matches_subitem_next_2',
 'start_regexp_num_matches_subitem_next_3',
 'start_regexp_num_matches_subitem_prev_1',
 'start_regexp_num_matc

In [6]:
# Display the second ("zero") group returned by analyze_features for 'law'.
law_zero_features

['current_regexp_next_2',
 'regexp_application_begin_next_1',
 'regexp_application_begin_next_2',
 'startswith_quote_next_2']

In [7]:
# Display the third ("100") group returned by analyze_features for 'law'.
law_100_features

['current_regexp_prev_1',
 'endswith_colon_prev_1',
 'prev_is_space',
 'regexp_application_begin_next_3',
 'subitem_regexp_len_next_3',
 'subitem_regexp_len_prev_2']

In [4]:
# Unpack the five feature-name groups that analyze_features returns for the
# 'tz' dataset. The meaning of the trailing 5 is defined in method.py —
# TODO confirm (not visible from this notebook).
(
    tz_nan_features,
    tz_zero_features,
    tz_100_features,
    tz_1000_features,
    tz_10000_features,
) = analyze_features(root, 'tz', 5)

100%|█████████▉| 1499/1500 [00:00<00:00, 17283.05it/s]
100%|█████████▉| 1499/1500 [00:00<00:00, 31127.99it/s]
100%|██████████| 1500/1500 [00:15<00:00, 94.25it/s] 
100%|██████████| 1500/1500 [00:03<00:00, 489.96it/s]


In [8]:
# Display the first ("nan") group returned by analyze_features for 'tz'.
tz_nan_features

['day_month_regexp',
 'day_month_regexp_next_1',
 'day_month_regexp_next_3',
 'day_month_regexp_prev_1',
 'day_month_regexp_prev_3',
 'is_toc_line_next_1',
 'is_toc_line_next_2',
 'is_toc_line_next_3',
 'is_toc_line_prev_1',
 'is_toc_line_prev_2',
 'is_toc_line_prev_3',
 'is_tz_line_next_3',
 'is_upper_next_3',
 'start_regexp_0_next_1',
 'start_regexp_0_next_2',
 'start_regexp_0_next_3',
 'start_regexp_0_prev_1',
 'start_regexp_0_prev_2',
 'start_regexp_1_next_1',
 'start_regexp_1_next_2',
 'start_regexp_1_next_3',
 'start_regexp_1_prev_1',
 'start_regexp_1_prev_2',
 'start_regexp_1_prev_3',
 'start_regexp_3',
 'start_regexp_3_next_1',
 'start_regexp_3_next_2',
 'start_regexp_3_next_3',
 'start_regexp_3_prev_1',
 'start_regexp_3_prev_2',
 'start_regexp_3_prev_3',
 'year_regexp_next_2']

In [9]:
# Display the second ("zero") group returned by analyze_features for 'tz'.
tz_zero_features

['start_regexp_1']

In [10]:
# Display the third ("100") group returned by analyze_features for 'tz'.
tz_100_features

['dot_number_regexp_len_next_1', 'named_item_regexp_prev_3']

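__TESTS:__ each test drops the listed combination of feature groups (as returned by `analyze_features`) from both the law and tz datasets before training and evaluation.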

1: nan\
2: nan + 100\
3: nan + 1000\
4: nan + 10000\
5: nan + zero\
6: nan + zero + 100\
7: nan + zero + 1000\
8: nan + zero + 10000\
9: zero\
10: zero + 100\
11: zero + 1000\
12: zero + 10000\
13: 100\
14: 1000\
15: 10000

In [71]:
def _load_without_features(dataset_type, features_to_drop):
    """Load one prepared dataset and remove the requested feature columns.

    Reads ``<root>/<dataset_type>/dataset_prepared/dataset.csv`` (first column
    used as the index) and returns a new frame without ``features_to_drop``.
    Names repeated in ``features_to_drop`` are dropped once; a name that is not
    a column of the CSV still raises ``KeyError``.
    """
    dataset = pd.read_csv(os.path.join(root, f'{dataset_type}/dataset_prepared/dataset.csv'), index_col=0)
    # Callers concatenate feature groups (e.g. nan + zero); de-duplicate while
    # keeping order so an overlapping name does not fail on a second drop.
    unique_features = list(dict.fromkeys(features_to_drop))
    return dataset.drop(columns=unique_features)


def _make_model():
    """Build the XGBoost classifier configuration shared by both datasets."""
    # NOTE(review): "gpu_hist" needs a CUDA-enabled XGBoost build and is
    # deprecated in XGBoost 2.x in favour of tree_method="hist" + device="cuda".
    return XGBClassifier(learning_rate=0.8,
                         n_estimators=300,
                         booster="gbtree",
                         tree_method="gpu_hist",
                         max_depth=5,
                         random_state=42,
                         verbosity=0)


def _print_metrics(title, metrics):
    """Print the accuracy values and every entry of ``metrics['other_test_metrics']``."""
    print(title)
    print('best_train_accuracy: {}\n'.format(metrics['best_train_accuracy']))
    print('test_accuracy: {}'.format(metrics['test_accuracy']))
    for metric, values_by_average in metrics['other_test_metrics'].items():
        print(f'\n{metric}:')
        for average, value in values_by_average.items():
            print(f'{average}: {value}')


def test_func(law_features_to_drop, tz_features_to_drop):
    """Train and evaluate the law and tz models after dropping feature columns.

    Parameters
    ----------
    law_features_to_drop : iterable of str
        Column names removed from the law dataset before preprocessing.
    tz_features_to_drop : iterable of str
        Column names removed from the tz dataset before preprocessing.

    Returns
    -------
    None
        The metrics returned by ``my_cross_validate`` are printed, not returned.

    Raises
    ------
    KeyError
        If a requested feature is not a column of the corresponding CSV.
    """
    law = _load_without_features('law', law_features_to_drop)
    tz = _load_without_features('tz', tz_features_to_drop)

    law_prepared = data_preprocessing(law, type='law')
    tz_prepared = data_preprocessing(tz, type='tz')

    law_dfs_of_every_doc = split_data_by_docs(law_prepared)
    tz_dfs_of_every_doc = split_data_by_docs(tz_prepared)

    # NOTE(review): test_size=0.8 would normally mean an 80% hold-out; confirm
    # this matches the semantics of my_train_test_split.
    law_train_val, law_test = my_train_test_split(law_dfs_of_every_doc, test_size=0.8)
    tz_train_val, tz_test = my_train_test_split(tz_dfs_of_every_doc, test_size=0.8)

    law_model = _make_model()
    tz_model = _make_model()

    # Only the fourth element of the result (the metrics dict) is used here.
    _, _, _, law_metrics = my_cross_validate(law_model, law_train_val, law_test)
    _, _, _, tz_metrics = my_cross_validate(tz_model, tz_train_val, tz_test)

    # NOTE(review): the "baseline" wording is kept verbatim so printed output
    # stays comparable with earlier runs, even when features were dropped.
    _print_metrics('baseline law results:\n', law_metrics)
    _print_metrics('baseline tz results:\n', tz_metrics)

__TEST 1:__

In [72]:
# Drop only the "nan" feature group from each dataset.
test_func(
    law_features_to_drop=law_nan_features,
    tz_features_to_drop=tz_nan_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9881831610044313

test_accuracy: 0.9829529037850332

f1:
None: [0.84805654 0.98398398 0.98905814 0.88372093 0.89      ]
micro: 0.9829529037850332
macro: 0.9189639186697244
weighted: 0.9830514584945006

precision:
None: [0.82758621 0.9894313  0.98799829 0.9047619  0.86407767]
micro: 0.9829529037850332
macro: 0.9147710741006586
weighted: 0.9832150676479011

recall:
None: [0.86956522 0.97859632 0.99012027 0.86363636 0.91752577]
micro: 0.9829529037850332
macro: 0.9238887891426089
weighted: 0.9829529037850332

roc_auc:
macro: 0.9978144312951764
weighted: 0.9977834151840235
baseline tz results:

best_train_accuracy: 0.9690721649484536

test_accuracy: 0.9565217391304348

f1:
None: [0.93975904 0.78740157 0.9815818  0.92063492 0.98533724]
micro: 0.9565217391304348
macro: 0.922942914693523
weighted: 0.9564135501605822

precision:
None:

__TEST 2:__

In [73]:
# Drop the "nan" and "100" feature groups from each dataset.
test_func(
    law_features_to_drop=law_nan_features + law_100_features,
    tz_features_to_drop=tz_nan_features + tz_100_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9876907927129492

test_accuracy: 0.9815082346142733

f1:
None: [0.81403509 0.98374594 0.98808629 0.88372093 0.89      ]
micro: 0.9815082346142733
macro: 0.9119176496614789
weighted: 0.9816504006403208

precision:
None: [0.78911565 0.98844221 0.98755632 0.9047619  0.86407767]
micro: 0.9815082346142733
macro: 0.9067907500730419
weighted: 0.9818637497092075

recall:
None: [0.84057971 0.97909408 0.98861684 0.86363636 0.91752577]
micro: 0.9815082346142733
macro: 0.9178905524240385
weighted: 0.9815082346142733

roc_auc:
macro: 0.9977712780274235
weighted: 0.9976016845112655
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9554865424430642

f1:
None: [0.94230769 0.8        0.97937025 0.91338583 0.97959184]
micro: 0.9554865424430642
macro: 0.9229311211085192
weighted: 0.9552715369926565

precision:
None

__TEST 3:__

In [74]:
# Drop the "nan" and "1000" feature groups from each dataset.
test_func(
    law_features_to_drop=law_nan_features + law_1000_features,
    tz_features_to_drop=tz_nan_features + tz_1000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9896602658788775

test_accuracy: 0.9815082346142733

f1:
None: [0.81403509 0.98374594 0.98808629 0.88372093 0.89      ]
micro: 0.9815082346142733
macro: 0.9119176496614789
weighted: 0.9816504006403208

precision:
None: [0.78911565 0.98844221 0.98755632 0.9047619  0.86407767]
micro: 0.9815082346142733
macro: 0.9067907500730419
weighted: 0.9818637497092075

recall:
None: [0.84057971 0.97909408 0.98861684 0.86363636 0.91752577]
micro: 0.9815082346142733
macro: 0.9178905524240385
weighted: 0.9815082346142733

roc_auc:
macro: 0.9977712780274235
weighted: 0.9976016845112655
baseline tz results:

best_train_accuracy: 0.9711340206185567

test_accuracy: 0.9575569358178054

f1:
None: [0.93975904 0.78740157 0.98264642 0.93650794 0.98245614]
micro: 0.9575569358178054
macro: 0.9257542217261673
weighted: 0.9574374384526222

precision:
None

__TEST 4:__

In [75]:
# Drop the "nan" and "10000" feature groups from each dataset.
test_func(
    law_features_to_drop=law_nan_features + law_10000_features,
    tz_features_to_drop=tz_nan_features + tz_10000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9876907927129492

test_accuracy: 0.9813637676971974

f1:
None: [0.83032491 0.98323743 0.98756965 0.9047619  0.87755102]
micro: 0.9813637676971974
macro: 0.9166889831591915
weighted: 0.9813724855383053

precision:
None: [0.82733813 0.98843058 0.98545766 0.95       0.86868687]
micro: 0.9813637676971974
macro: 0.9239826475601232
weighted: 0.9814191241352678

recall:
None: [0.83333333 0.97809856 0.98969072 0.86363636 0.88659794]
micro: 0.9813637676971974
macro: 0.9102713826518561
weighted: 0.9813637676971974

roc_auc:
macro: 0.9966972664044551
weighted: 0.9974438934849834
baseline tz results:

best_train_accuracy: 0.9711340206185567

test_accuracy: 0.9575569358178054

f1:
None: [0.93975904 0.78740157 0.98264642 0.93650794 0.98245614]
micro: 0.9575569358178054
macro: 0.9257542217261673
weighted: 0.9574374384526222

precision:
None

__TEST 5:__

In [84]:
# Drop the "nan" and "zero" feature groups from each dataset.
test_func(
    law_features_to_drop=law_nan_features + law_zero_features,
    tz_features_to_drop=tz_nan_features + tz_zero_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9891678975873953

test_accuracy: 0.9802080323605894

f1:
None: [0.84805654 0.98250875 0.98668385 0.88372093 0.84313725]
micro: 0.9802080323605894
macro: 0.9088214633322858
weighted: 0.9803695530591976

precision:
None: [0.82758621 0.98645258 0.98668385 0.9047619  0.80373832]
micro: 0.9802080323605894
macro: 0.9018445724513743
weighted: 0.9806208489776974

recall:
None: [0.86956522 0.97859632 0.98668385 0.86363636 0.88659794]
micro: 0.9802080323605894
macro: 0.9170159369089319
weighted: 0.9802080323605894

roc_auc:
macro: 0.9979035129499838
weighted: 0.9974151333817914
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9596273291925466

f1:
None: [0.94285714 0.816      0.98043478 0.92913386 0.98823529]
micro: 0.9596273291925466
macro: 0.9313322155702405
weighted: 0.9594824806841004

precision:
None

__TEST 6:__

In [85]:
# Drop the "nan", "zero" and "100" feature groups from each dataset.
test_func(
    law_features_to_drop=law_nan_features + law_zero_features + law_100_features,
    tz_features_to_drop=tz_nan_features + tz_zero_features + tz_100_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9886755292959133

test_accuracy: 0.9813637676971974

f1:
None: [0.82685512 0.98249125 0.98819742 0.88372093 0.88      ]
micro: 0.9813637676971974
macro: 0.9122529448845971
weighted: 0.9814764504895857

precision:
None: [0.80689655 0.98743087 0.98734991 0.9047619  0.85436893]
micro: 0.9813637676971974
macro: 0.908161634509079
weighted: 0.9816498252719217

recall:
None: [0.84782609 0.9776008  0.98904639 0.86363636 0.90721649]
micro: 0.9813637676971974
macro: 0.9170652267213901
weighted: 0.9813637676971974

roc_auc:
macro: 0.996614351137679
weighted: 0.9973484740942772
baseline tz results:

best_train_accuracy: 0.9661538461538461

test_accuracy: 0.9627329192546584

f1:
None: [0.95192308 0.84126984 0.9815418  0.921875   0.98533724]
micro: 0.9627329192546584
macro: 0.9363893927966771
weighted: 0.9626746891891028

precision:
None: 

__TEST 7:__

In [86]:
# Drop the "nan", "zero" and "1000" feature groups from each dataset.
test_func(
    law_features_to_drop=law_nan_features + law_zero_features + law_1000_features,
    tz_features_to_drop=tz_nan_features + tz_zero_features + tz_1000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9896602658788775

test_accuracy: 0.9810748338630454

f1:
None: [0.83985765 0.98203593 0.98764636 0.9047619  0.87254902]
micro: 0.9810748338630454
macro: 0.9173701734986064
weighted: 0.9811953224306453

precision:
None: [0.82517483 0.98449225 0.98796475 0.95       0.8317757 ]
micro: 0.9810748338630454
macro: 0.9158815052309335
weighted: 0.9814020782415164

recall:
None: [0.85507246 0.97959184 0.98732818 0.86363636 0.91752577]
micro: 0.9810748338630454
macro: 0.9206309232058416
weighted: 0.9810748338630454

roc_auc:
macro: 0.9975966424357467
weighted: 0.9974036207051297
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9554865424430642

f1:
None: [0.94230769 0.784      0.98047722 0.91338583 0.98245614]
micro: 0.9554865424430642
macro: 0.920525376571511
weighted: 0.9552420226655028

precision:
None:

__TEST 8:__

In [87]:
# Drop the "nan", "zero" and "10000" feature groups from each dataset.
test_func(
    law_features_to_drop=law_nan_features + law_zero_features + law_10000_features,
    tz_features_to_drop=tz_nan_features + tz_zero_features + tz_10000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9901526341703595

test_accuracy: 0.9803524992776654

f1:
None: [0.80286738 0.98327926 0.98754028 0.88372093 0.85853659]
micro: 0.9803524992776654
macro: 0.9031888879346595
weighted: 0.9804841352769019

precision:
None: [0.79432624 0.98598599 0.98775247 0.9047619  0.81481481]
micro: 0.9803524992776654
macro: 0.8975282835380304
weighted: 0.9806963552450486

recall:
None: [0.8115942  0.98058736 0.98732818 0.86363636 0.90721649]
micro: 0.9803524992776654
macro: 0.9100725193936821
weighted: 0.9803524992776654

roc_auc:
macro: 0.9978091130107508
weighted: 0.9975231735964939
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9554865424430642

f1:
None: [0.94230769 0.784      0.98047722 0.91338583 0.98245614]
micro: 0.9554865424430642
macro: 0.920525376571511
weighted: 0.9552420226655028

precision:
None:

__TEST 9:__

In [88]:
# Drop only the "zero" feature group from each dataset.
test_func(
    law_features_to_drop=law_zero_features,
    tz_features_to_drop=tz_zero_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9901526341703595

test_accuracy: 0.9806414331118174

f1:
None: [0.82105263 0.98327926 0.98774721 0.85714286 0.85436893]
micro: 0.9806414331118174
macro: 0.9007181775112774
weighted: 0.9808429958367418

precision:
None: [0.79591837 0.98598599 0.98859725 0.9        0.80733945]
micro: 0.9806414331118174
macro: 0.8955682098003152
weighted: 0.981176435295326

recall:
None: [0.84782609 0.98058736 0.98689863 0.81818182 0.90721649]
micro: 0.9806414331118174
macro: 0.9081420764614464
weighted: 0.9806414331118174

roc_auc:
macro: 0.9976276551336133
weighted: 0.9973793180295351
baseline tz results:

best_train_accuracy: 0.9752577319587629

test_accuracy: 0.9648033126293996

f1:
None: [0.95192308 0.83464567 0.98371336 0.9375     0.98823529]
micro: 0.9648033126293996
macro: 0.9392034790761844
weighted: 0.9647842449793214

precision:
None:

__TEST 10:__

In [89]:
# Drop the "zero" and "100" feature groups from each dataset.
test_func(
    law_features_to_drop=law_zero_features + law_100_features,
    tz_features_to_drop=tz_zero_features + tz_100_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9890267175572519

test_accuracy: 0.9806414331118174

f1:
None: [0.82926829 0.98354935 0.987094   0.85714286 0.86829268]
micro: 0.9806414331118174
macro: 0.9050694366812362
weighted: 0.9808409222100677

precision:
None: [0.79865772 0.98502247 0.98858251 0.9        0.82407407]
micro: 0.9806414331118174
macro: 0.8992673532070563
weighted: 0.9811759954043945

recall:
None: [0.86231884 0.98208064 0.98560997 0.81818182 0.91752577]
micro: 0.9806414331118174
macro: 0.9131434069452092
weighted: 0.9806414331118174

roc_auc:
macro: 0.9977933460713337
weighted: 0.997522774753772
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9575569358178054

f1:
None: [0.93975904 0.80620155 0.98047722 0.92063492 0.98823529]
micro: 0.9575569358178054
macro: 0.9270616049424149
weighted: 0.9576378310431654

precision:
None:

__TEST 11:__

In [90]:
# Drop the "zero" and "1000" feature groups from each dataset.
test_func(
    law_features_to_drop=law_zero_features + law_1000_features,
    tz_features_to_drop=tz_zero_features + tz_1000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9890267175572519

test_accuracy: 0.9806414331118174

f1:
None: [0.82926829 0.98354935 0.987094   0.85714286 0.86829268]
micro: 0.9806414331118174
macro: 0.9050694366812362
weighted: 0.9808409222100677

precision:
None: [0.79865772 0.98502247 0.98858251 0.9        0.82407407]
micro: 0.9806414331118174
macro: 0.8992673532070563
weighted: 0.9811759954043945

recall:
None: [0.86231884 0.98208064 0.98560997 0.81818182 0.91752577]
micro: 0.9806414331118174
macro: 0.9131434069452092
weighted: 0.9806414331118174

roc_auc:
macro: 0.9977933460713337
weighted: 0.997522774753772
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9575569358178054

f1:
None: [0.94964029 0.816      0.97826087 0.90625    0.98245614]
micro: 0.9575569358178054
macro: 0.9265214595371758
weighted: 0.9574305689464944

precision:
None:

__TEST 12:__

In [91]:
# Drop the "zero" and "10000" feature groups from each dataset.
test_func(
    law_features_to_drop=law_zero_features + law_10000_features,
    tz_features_to_drop=tz_zero_features + tz_10000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9881831610044313

test_accuracy: 0.9804969661947414

f1:
None: [0.82068966 0.98350825 0.98731728 0.88372093 0.85853659]
micro: 0.9804969661947414
macro: 0.9067545399073934
weighted: 0.9807559111178807

precision:
None: [0.78289474 0.9874561  0.98816695 0.9047619  0.81481481]
micro: 0.9804969661947414
macro: 0.8956189012568808
weighted: 0.9811739245587449

recall:
None: [0.86231884 0.97959184 0.98646907 0.86363636 0.90721649]
micro: 0.9804969661947414
macro: 0.9198465215922154
weighted: 0.9804969661947414

roc_auc:
macro: 0.9977678115292508
weighted: 0.9976149852657478
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9575569358178054

f1:
None: [0.94964029 0.816      0.97826087 0.90625    0.98245614]
micro: 0.9575569358178054
macro: 0.9265214595371758
weighted: 0.9574305689464944

precision:
None

__TEST 13:__

In [92]:
# Drop only the "100" feature group from each dataset.
test_func(
    law_features_to_drop=law_100_features,
    tz_features_to_drop=tz_100_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9867060561299852

test_accuracy: 0.9815082346142733

f1:
None: [0.8409894  0.98172716 0.9883135  0.9047619  0.87309645]
micro: 0.9815082346142733
macro: 0.9177776816299517
weighted: 0.9815846815120995

precision:
None: [0.82068966 0.98741188 0.98672661 0.95       0.86      ]
micro: 0.9815082346142733
macro: 0.9209656298717516
weighted: 0.9817227313727137

recall:
None: [0.86231884 0.97610752 0.9899055  0.86363636 0.88659794]
micro: 0.9815082346142733
macro: 0.9157132313638787
weighted: 0.9815082346142733

roc_auc:
macro: 0.9978059218560491
weighted: 0.9975318895245651
baseline tz results:

best_train_accuracy: 0.9670103092783505

test_accuracy: 0.9544513457556936

f1:
None: [0.9375     0.78740157 0.98047722 0.92063492 0.97947214]
micro: 0.9544513457556936
macro: 0.9210971719255732
weighted: 0.9543704294814184

precision:
None

__TEST 14:__

In [93]:
# Drop only the "1000" feature group from each dataset.
test_func(
    law_features_to_drop=law_1000_features,
    tz_features_to_drop=tz_1000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9867060561299852

test_accuracy: 0.9825195030338053

f1:
None: [0.84507042 0.98323743 0.98905814 0.9047619  0.87437186]
micro: 0.9825195030338053
macro: 0.9192999513388518
weighted: 0.9826231223419499

precision:
None: [0.82191781 0.98843058 0.98799829 0.95       0.85294118]
micro: 0.9825195030338053
macro: 0.920257570732025
weighted: 0.982799337048645

recall:
None: [0.86956522 0.97809856 0.99012027 0.86363636 0.89690722]
micro: 0.9825195030338053
macro: 0.9196655257864744
weighted: 0.9825195030338053

roc_auc:
macro: 0.9976859745481483
weighted: 0.997513044128292
baseline tz results:

best_train_accuracy: 0.9649484536082474

test_accuracy: 0.9554865424430642

f1:
None: [0.93779904 0.80645161 0.97937025 0.92063492 0.97959184]
micro: 0.9554865424430642
macro: 0.9247695326127194
weighted: 0.9551887619957843

precision:
None: [

__TEST 15:__

In [94]:
# Drop only the "10000" feature group from each dataset.
test_func(
    law_features_to_drop=law_10000_features,
    tz_features_to_drop=tz_10000_features,
)

law data successfully preprocessed
tz data successfully preprocessed
starting cross validate
starting cross validate
baseline law results:

best_train_accuracy: 0.9891678975873953

test_accuracy: 0.9833863045362612

f1:
None: [0.84397163 0.98574644 0.98916658 0.9047619  0.87878788]
micro: 0.9833863045362612
macro: 0.9204868857293199
weighted: 0.9834642366695038

precision:
None: [0.82638889 0.99045226 0.98800086 0.95       0.86138614]
micro: 0.9833863045362612
macro: 0.9232456291781841
weighted: 0.9835953056413085

recall:
None: [0.86231884 0.98108512 0.99033505 0.86363636 0.89690722]
micro: 0.9833863045362612
macro: 0.9188565178461859
weighted: 0.9833863045362612

roc_auc:
macro: 0.9974390913509753
weighted: 0.9975764657210031
baseline tz results:

best_train_accuracy: 0.9649484536082474

test_accuracy: 0.9554865424430642

f1:
None: [0.93779904 0.80645161 0.97937025 0.92063492 0.97959184]
micro: 0.9554865424430642
macro: 0.9247695326127194
weighted: 0.9551887619957843

precision:
None