In [50]:
import torch
import pandas as pd
import numpy as np
import sklearn
import random
import os

from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import f1_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

from datetime import datetime
import wandb

import pickle
from joblib import dump, load

In [51]:
# Render floats in pandas output with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

In [52]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mbeomgon-yu[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Hyperparameters for the wandb sweep

In [53]:
# Number of sweep trials to run.
test_num = 15

# Optimisation target. Random search ignores it, but a Bayesian search
# requires it, so it is always attached to the sweep.
metric = {
    'name': 'f1_score',
    'goal': 'maximize',
}

# Search space: every entry is sampled independently for each trial.
parameters_dict = {
    # Integer seed drawn from [0, 30].
    'seed': {
        'distribution': 'int_uniform',
        'min': 0,
        'max': 30,
    },
    # Continuous uniform draw passed to EllipticEnvelope(support_fraction=...).
    'support_fraction': {
        'distribution': 'uniform',
        'min': 0.99,
        'max': 0.999,
    },
    # Continuous uniform draw passed to EllipticEnvelope(contamination=...).
    'val_ratio': {
        'distribution': 'uniform',
        'min': 0.0002,
        'max': 0.003,
    },
    # Number of lowest-scoring validation rows flagged as anomalies.
    'topk': {
        'distribution': 'int_uniform',
        'min': 28,
        'max': 32,
    },
    # The model is fitted on 1/denom of the shuffled training rows.
    'denom': {
        'distribution': 'int_uniform',
        'min': 2,
        'max': 6,
    },
}

# Random search over the space above.
sweep_config = {
    'method': 'random',
    'metric': metric,
    'parameters': parameters_dict,
}

In [54]:
import pprint

# Show the final sweep configuration before registering it.
pprint.pprint(sweep_config)
# Register the sweep with wandb; the returned id is what wandb.agent is given later.
sweep_id = wandb.sweep(sweep_config, project="EllipseEnvelope Sweep")

{'method': 'random',
 'metric': {'goal': 'maximize', 'name': 'f1_score'},
 'parameters': {'denom': {'distribution': 'int_uniform', 'max': 6, 'min': 2},
                'seed': {'distribution': 'int_uniform', 'max': 30, 'min': 0},
                'support_fraction': {'distribution': 'uniform',
                                     'max': 0.999,
                                     'min': 0.99},
                'topk': {'distribution': 'int_uniform', 'max': 32, 'min': 28},
                'val_ratio': {'distribution': 'uniform',
                              'max': 0.003,
                              'min': 0.0002}}}
Create sweep with ID: aplf5edd
Sweep URL: https://wandb.ai/beomgon-yu/EllipseEnvelope%20Sweep/sweeps/aplf5edd


In [55]:
# with wandb.init(config=None) :
#     config = wandb.config
#     config.support_fraction

## Load data

In [56]:
# Read the train / validation / test splits from CSV (paths are relative to the notebook).
train = pd.read_csv('../dataset/train.csv')
val = pd.read_csv('../dataset/val.csv')
test = pd.read_csv('../dataset/test.csv')
# Quick sanity check of the training frame's (rows, columns).
print(train.shape)

(113842, 31)


In [57]:
# Look the counts up by label instead of unpacking value_counts(): the unpack
# relies on the frequency ordering (majority class first) and breaks if a
# class is missing. Labels: 0 = normal, 1 = abnormal.
val_class_counts = val['Class'].value_counts()
val_normal = val_class_counts.get(0, 0)
val_abnormal = val_class_counts.get(1, 0)
# Ratio of abnormal to normal rows in the validation split.
val_ratio = val_abnormal / val_normal
print(val_normal, val_abnormal, val_ratio)


28432 30 0.0010551491277433877


In [58]:
def seed_everything(seed) :
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for reproducible behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable name was previously misspelled ('PYHONHASHSEED'). Note that
    # it only affects interpreters started after this point (e.g. worker
    # processes), not the hash seed of the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device; this is a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [59]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
# Feature columns are everything except the identifier and the label.
col = [name for name in val.columns if name not in ('ID', 'Class')]

# Standardise each feature with statistics estimated on the training split
# only, then apply the same transform to all three splits.
for feature in col:
    scaler = StandardScaler().fit(train[feature].values.reshape(-1, 1))
    for frame in (train, val, test):
        frame[feature] = scaler.transform(frame[feature].values.reshape(-1, 1))

In [60]:
def get_pred(model, x, k, thres) :
    """Turn anomaly scores into binary predictions (1 = anomaly).

    Two modes are supported:

    * top-k (``k`` is not None): the ``k`` rows with the lowest
      ``model.score_samples`` value are flagged, and the highest score among
      them is returned as the decision threshold.
    * threshold (``k`` is None): every row whose score is strictly below
      ``thres`` is flagged.

    Args:
        model: Fitted estimator exposing ``score_samples(x)``.
        x: Samples to score; must support ``len()``.
        k: Number of lowest-scoring rows to flag, or None for threshold mode.
        thres: Score threshold used when ``k`` is None.

    Returns:
        ``(pred, prob, threshold)`` where ``pred`` and ``prob`` are Python
        lists, and ``threshold`` is a 0-dim tensor in top-k mode and ``None``
        in threshold mode.

    Raises:
        ValueError: If both ``k`` and ``thres`` are None.
    """
    prob = model.score_samples(x)
    prob = torch.tensor(prob, dtype=torch.float)
    if k is not None :
        # sorted=True (the default, made explicit) returns the k smallest
        # scores in ascending order, so the last one is the cut-off score.
        topk_indices = torch.topk(prob, k=k, largest=False, sorted=True).indices

        pred = torch.zeros(len(x), dtype=torch.int8)
        pred[topk_indices] = 1

        return pred.tolist(), prob.tolist(), prob[topk_indices][-1]

    if thres is None :
        raise ValueError("either k or thres must be provided")

    # Strict comparison: a row scoring exactly `thres` is not flagged.
    pred = torch.where(prob < thres, 1, 0)

    # The original returned the undefined name `_` here, which only resolved
    # through IPython's last-output variable; return an explicit placeholder.
    return pred.tolist(), prob.tolist(), None
    
    

In [61]:
# Separate model inputs from the identifier and label columns.
# errors='ignore' keeps this cell re-runnable: `train` is rebound to its own
# result, so a second execution would otherwise raise KeyError on 'ID'.
train = train.drop(columns=['ID'], errors='ignore')
val_x = val.drop(columns=['ID', 'Class'])
val_y = val['Class']
test_x = test.drop(columns=['ID'])

In [62]:
# Load the sample submission file and display its (rows, columns) shape.
submit = pd.read_csv('../dataset/sample_submission.csv')
submit.shape

(142503, 2)

In [63]:
def main() :
    """Run one sweep trial.

    Fits an EllipticEnvelope on a random 1/denom subset of the training rows,
    flags the ``topk`` lowest-scoring validation rows as anomalies, logs the
    macro F1 to wandb and, when the score exceeds 0.8, writes the test
    predictions, the validation predictions and the fitted model to disk
    under ``results/``, ``val_results/`` and ``models/``.

    Reads the notebook-level ``train``, ``val_x``, ``val_y`` and ``test_x``.
    """
    with wandb.init(config=None) as run :
        print(run.name)

        config = wandb.config
        seed_everything(config.seed)
        # Shuffle, then keep the first 1/denom of the rows. No random_state is
        # passed, so the shuffle follows NumPy's global generator seeded above.
        train_x = train.sample(frac=1)[:len(train)//config.denom]

        model = EllipticEnvelope(support_fraction=config.support_fraction,
                                 contamination=config.val_ratio, random_state=config.seed)
        model.fit(train_x)

        # Top-k mode: also yields the score threshold reused on the test set.
        val_pred, _, prob_thres = get_pred(model, val_x, config.topk, None)
        val_score = f1_score(val_y, val_pred, average='macro')

        wandb.log({'f1_score' : val_score})

        print(classification_report(val_y, val_pred))
        print(prob_thres)

        # labels=[0, 1] guarantees a 2x2 matrix, so the 4-way unpack cannot
        # fail when only one class shows up in labels/predictions.
        tn, fp, fn, tp = confusion_matrix(val_y, val_pred, labels=[0, 1]).ravel()
        print('tp : ', tp, ', fp : ', fp, ', tn : ', tn, ', fn : ', fn)

        # Positional indices of the misclassified validation rows.
        wrong_answer = np.where(np.array(val_y) != np.array(val_pred))[0]
        print(wrong_answer)

        # Only sufficiently good trials contribute to the later ensemble.
        if val_score > 0.8 :
            # Threshold mode: apply the validation cut-off to the test scores.
            test_pred, _, _ = get_pred(model, test_x, None, prob_thres)

            # The output folders may not exist on a fresh checkout.
            for out_dir in ('results', 'val_results', 'models') :
                os.makedirs(out_dir, exist_ok=True)

            # save test result
            submit = pd.read_csv('../dataset/sample_submission.csv')
            submit['Class'] = test_pred
            submit.to_csv('results/'+run.name+'.csv', index=False)

            # save validation predictions next to the first column of val.csv
            val_df = pd.read_csv('../dataset/val.csv')
            val_df = val_df.drop(columns=val_df.columns[1:])
            val_df['Class'] = val_pred
            val_df.to_csv('val_results/'+run.name+'.csv',index=False)

            # save model
            dump(model, 'models/'+run.name+'.joblib')

In [64]:
# Start a sweep agent that calls main() for test_num trials of the sweep identified by sweep_id.
wandb.agent(sweep_id, main, count=test_num)

[34m[1mwandb[0m: Agent Starting Run: 5s8zdsua with config:
[34m[1mwandb[0m: 	denom: 3
[34m[1mwandb[0m: 	seed: 21
[34m[1mwandb[0m: 	support_fraction: 0.9912575459610292
[34m[1mwandb[0m: 	topk: 30
[34m[1mwandb[0m: 	val_ratio: 0.001922639629882452


rural-sweep-1
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.67      0.67      0.67        30

    accuracy                           1.00     28462
   macro avg       0.83      0.83      0.83     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-6.9687e+08)
tp :  20 , fp :  10 , tn :  28422 , fn :  10
[   71  1047  1210  3197  4039  4917  9326 12377 13706 14221 15306 17534
 19226 20225 24110 25504 26010 27905 27998 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.83316


[34m[1mwandb[0m: Agent Starting Run: 4dzl1h3d with config:
[34m[1mwandb[0m: 	denom: 6
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	support_fraction: 0.9965421492082428
[34m[1mwandb[0m: 	topk: 29
[34m[1mwandb[0m: 	val_ratio: 0.0015278281284054117


balmy-sweep-2
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-3433857.5000)
tp :  25 , fp :  4 , tn :  28428 , fn :  5
[   71  1047  1210  4039  7000  9326 14221 15306 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.92365


[34m[1mwandb[0m: Agent Starting Run: vj4lnsti with config:
[34m[1mwandb[0m: 	denom: 4
[34m[1mwandb[0m: 	seed: 4
[34m[1mwandb[0m: 	support_fraction: 0.992708114780334
[34m[1mwandb[0m: 	topk: 31
[34m[1mwandb[0m: 	val_ratio: 0.0005074612167550485


deep-sweep-3
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.81      0.83      0.82        30

    accuracy                           1.00     28462
   macro avg       0.90      0.92      0.91     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-296238.1875)
tp :  25 , fp :  6 , tn :  28426 , fn :  5
[   71  1047  1210  4039  4917  7000  9326 14221 15306 19113 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.90974


[34m[1mwandb[0m: Agent Starting Run: tjvzji33 with config:
[34m[1mwandb[0m: 	denom: 6
[34m[1mwandb[0m: 	seed: 30
[34m[1mwandb[0m: 	support_fraction: 0.9923108223615674
[34m[1mwandb[0m: 	topk: 29
[34m[1mwandb[0m: 	val_ratio: 0.0023991042711351573


gallant-sweep-4
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.66      0.63      0.64        30

    accuracy                           1.00     28462
   macro avg       0.83      0.82      0.82     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-6.5448e+08)
tp :  19 , fp :  10 , tn :  28422 , fn :  11
[   71  1047  1210  3197  4039  4917  9326 12377 12797 13706 14221 15306
 17534 19226 20225 24110 25504 26010 27905 27998 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.82185


[34m[1mwandb[0m: Agent Starting Run: 842pf8fq with config:
[34m[1mwandb[0m: 	denom: 6
[34m[1mwandb[0m: 	seed: 19
[34m[1mwandb[0m: 	support_fraction: 0.9906566177818672
[34m[1mwandb[0m: 	topk: 31
[34m[1mwandb[0m: 	val_ratio: 0.00042953560241946006


comic-sweep-5
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.65      0.67      0.66        30

    accuracy                           1.00     28462
   macro avg       0.82      0.83      0.83     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-5.0510e+08)
tp :  20 , fp :  11 , tn :  28421 , fn :  10
[   71  1047  1210  3197  4039  4917  9326 10880 12377 13706 14221 15306
 17534 19226 20225 24110 25504 26010 27905 27998 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.82768


[34m[1mwandb[0m: Agent Starting Run: enrpj427 with config:
[34m[1mwandb[0m: 	denom: 4
[34m[1mwandb[0m: 	seed: 19
[34m[1mwandb[0m: 	support_fraction: 0.9905442664094508
[34m[1mwandb[0m: 	topk: 32
[34m[1mwandb[0m: 	val_ratio: 0.0006280294157332209


comic-sweep-6
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.62      0.67      0.65        30

    accuracy                           1.00     28462
   macro avg       0.81      0.83      0.82     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-5.4412e+08)
tp :  20 , fp :  12 , tn :  28420 , fn :  10
[   71  1047  1210  3197  4039  4917  9326 10880 12377 13706 14221 15306
 17534 19226 20225 21892 24110 25504 26010 27905 27998 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.82239


[34m[1mwandb[0m: Agent Starting Run: hlyl0fxf with config:
[34m[1mwandb[0m: 	denom: 2
[34m[1mwandb[0m: 	seed: 0
[34m[1mwandb[0m: 	support_fraction: 0.993215046664082
[34m[1mwandb[0m: 	topk: 31
[34m[1mwandb[0m: 	val_ratio: 0.002214136300605513


peach-sweep-7
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.65      0.67      0.66        30

    accuracy                           1.00     28462
   macro avg       0.82      0.83      0.83     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-4.7915e+08)
tp :  20 , fp :  11 , tn :  28421 , fn :  10
[   71  1047  1210  3197  4039  4917  9326 10880 12377 13706 14221 15306
 17534 19226 20225 24110 25504 26010 27905 27998 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.82768


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6l3dowzz with config:
[34m[1mwandb[0m: 	denom: 5
[34m[1mwandb[0m: 	seed: 14
[34m[1mwandb[0m: 	support_fraction: 0.9928283444493836
[34m[1mwandb[0m: 	topk: 29
[34m[1mwandb[0m: 	val_ratio: 0.0009829863553392532


icy-sweep-8
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-3553285.5000)
tp :  25 , fp :  4 , tn :  28428 , fn :  5
[   71  1047  1210  4039  7000  9326 14221 15306 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.92365


[34m[1mwandb[0m: Agent Starting Run: vquemxgj with config:
[34m[1mwandb[0m: 	denom: 5
[34m[1mwandb[0m: 	seed: 10
[34m[1mwandb[0m: 	support_fraction: 0.9949982329219128
[34m[1mwandb[0m: 	topk: 31
[34m[1mwandb[0m: 	val_ratio: 0.000531835824977183


exalted-sweep-9
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.81      0.83      0.82        30

    accuracy                           1.00     28462
   macro avg       0.90      0.92      0.91     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-359485.5625)
tp :  25 , fp :  6 , tn :  28426 , fn :  5
[   71  1047  1210  4039  4917  7000  9326 14221 15306 19113 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.90974


[34m[1mwandb[0m: Agent Starting Run: b1tjm7gf with config:
[34m[1mwandb[0m: 	denom: 6
[34m[1mwandb[0m: 	seed: 7
[34m[1mwandb[0m: 	support_fraction: 0.9940514039419256
[34m[1mwandb[0m: 	topk: 28
[34m[1mwandb[0m: 	val_ratio: 0.002905903004059433


firm-sweep-10
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.80      0.83        30

    accuracy                           1.00     28462
   macro avg       0.93      0.90      0.91     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-4903978.5000)
tp :  24 , fp :  4 , tn :  28428 , fn :  6
[   71  1047  1210  4039  7000  9326 14221 15306 24110 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.91371


[34m[1mwandb[0m: Agent Starting Run: yh4rxgov with config:
[34m[1mwandb[0m: 	denom: 6
[34m[1mwandb[0m: 	seed: 19
[34m[1mwandb[0m: 	support_fraction: 0.9968564590482146
[34m[1mwandb[0m: 	topk: 31
[34m[1mwandb[0m: 	val_ratio: 0.0013545610277499326


rich-sweep-11
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.81      0.83      0.82        30

    accuracy                           1.00     28462
   macro avg       0.90      0.92      0.91     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-230100.8125)
tp :  25 , fp :  6 , tn :  28426 , fn :  5
[   71  1047  1210  4039  7000  9326 14221 15306 19113 21318 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.90974


[34m[1mwandb[0m: Agent Starting Run: 20b174x0 with config:
[34m[1mwandb[0m: 	denom: 2
[34m[1mwandb[0m: 	seed: 15
[34m[1mwandb[0m: 	support_fraction: 0.9958442844399756
[34m[1mwandb[0m: 	topk: 29
[34m[1mwandb[0m: 	val_ratio: 0.0012306276747549307


serene-sweep-12
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-3455145.)
tp :  25 , fp :  4 , tn :  28428 , fn :  5
[   71  1047  1210  4039  7000  9326 14221 15306 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.92365


[34m[1mwandb[0m: Agent Starting Run: w4r7ct2u with config:
[34m[1mwandb[0m: 	denom: 2
[34m[1mwandb[0m: 	seed: 11
[34m[1mwandb[0m: 	support_fraction: 0.9937079657282778
[34m[1mwandb[0m: 	topk: 32
[34m[1mwandb[0m: 	val_ratio: 0.0016222483337669418


sparkling-sweep-13
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.78      0.83      0.81        30

    accuracy                           1.00     28462
   macro avg       0.89      0.92      0.90     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-256941.3906)
tp :  25 , fp :  7 , tn :  28425 , fn :  5
[   71  1047  1210  4039  4917  7000  9326 14221 15306 19113 21318 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.90312


[34m[1mwandb[0m: Agent Starting Run: k8z13xsw with config:
[34m[1mwandb[0m: 	denom: 5
[34m[1mwandb[0m: 	seed: 30
[34m[1mwandb[0m: 	support_fraction: 0.9980600417791208
[34m[1mwandb[0m: 	topk: 29
[34m[1mwandb[0m: 	val_ratio: 0.001287504923908067


clean-sweep-14
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-3302970.5000)
tp :  25 , fp :  4 , tn :  28428 , fn :  5
[   71  1047  1210  4039  7000  9326 14221 15306 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.92365


[34m[1mwandb[0m: Agent Starting Run: t7xoevtt with config:
[34m[1mwandb[0m: 	denom: 4
[34m[1mwandb[0m: 	seed: 3
[34m[1mwandb[0m: 	support_fraction: 0.9977779146562
[34m[1mwandb[0m: 	topk: 30
[34m[1mwandb[0m: 	val_ratio: 0.0011201471512946365


different-sweep-15
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.83      0.83      0.83        30

    accuracy                           1.00     28462
   macro avg       0.92      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462

tensor(-3439032.7500)
tp :  25 , fp :  5 , tn :  28427 , fn :  5
[   71  1047  1210  4039  7000  9326 14221 15306 19113 28146]


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1_score,▁

0,1
f1_score,0.91658


In [65]:
import os
import glob

In [66]:
# Reload the raw validation file, keep the ground-truth labels aside, and
# reduce the frame to its first column before attaching one prediction
# column per saved run.
val = pd.read_csv('../dataset/val.csv')
val_y = val['Class'].to_numpy()
val = val.iloc[:, :1].copy()
csv_list = glob.glob('val_results/*.csv')

for run_idx, result_path in enumerate(csv_list):
    val[f'Class{run_idx}'] = pd.read_csv(result_path)['Class'].to_numpy()

In [67]:
# Preview the per-run prediction columns.
val.head()

Unnamed: 0,ID,Class0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,69,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [68]:
# Number of runs that flagged each row. Only the per-run 'Class<N>' columns
# are summed, so re-running the cell does not fold 'Count' (or a later
# 'Class' column) back into the total, and the sum is vectorised instead of
# a Python-level row-wise apply.
val['Count'] = val.filter(regex=r'^Class\d+$').sum(axis=1)
val.head()

Unnamed: 0,ID,Class0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14,Count
0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,69,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [69]:
# Distribution of the per-row vote counts across the saved runs.
val.Count.value_counts()

0     28422
15       22
5         7
10        5
8         1
3         1
14        1
2         1
1         1
9         1
Name: Count, dtype: int64

In [70]:
def set_label(x, n_models=None) :
    """Convert a vote count into a final binary label.

    Args:
        x: Number of models that flagged the row as an anomaly.
        n_models: Number of models that voted. Defaults to the notebook-level
            ``test_num``; pass the real number of loaded result files when it
            differs (runs below the save threshold write no file).

    Returns:
        1 when ``x`` is non-zero and either equals ``n_models - 1`` or is
        greater than ``n_models // 2``; otherwise 0.
    """
    if n_models is None :
        n_models = test_num

    if x == 0 :
        return 0
    # The `n_models - 1` clause is kept from the original rule; it only
    # changes the outcome when n_models == 2.
    if x == n_models - 1 or x > n_models // 2 :
        return 1
    return 0

In [71]:
# Majority-vote label for every validation row.
val['Class'] = val['Count'].map(set_label)
pred_y = val['Class'].to_numpy()

In [72]:
# Macro-averaged F1 of the ensembled labels against the validation ground truth.
f1_score(val_y, pred_y, average='macro')

0.9165787375726882

In [73]:
# os.listdir('results/')
# Start from the sample submission and attach one prediction column per
# saved run.
submit = pd.read_csv('../dataset/sample_submission.csv')
csv_list = glob.glob('results/*.csv')

for run_idx, result_path in enumerate(csv_list):
    submit[f'Class{run_idx}'] = pd.read_csv(result_path)['Class'].to_numpy()
    

In [74]:
# Remove the placeholder label column that came with the sample submission.
submit = submit.drop(columns=['Class'])
submit.head()

Unnamed: 0,ID,Class0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,AAAA0x1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AAAA0x2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AAAA0x5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AAAA0x7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AAAA0xc,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [75]:
# Number of runs that flagged each test row. Only the per-run 'Class<N>'
# columns are summed, so re-running the cell does not fold 'Count' (or the
# final 'Class' column) back into the total, and the sum is vectorised
# instead of a Python-level row-wise apply.
submit['Count'] = submit.filter(regex=r'^Class\d+$').sum(axis=1)
submit.head()

Unnamed: 0,ID,Class0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14,Count
0,AAAA0x1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AAAA0x2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AAAA0x5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AAAA0x7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AAAA0xc,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [76]:
# Distribution of the per-row vote counts on the test set.
submit.Count.value_counts()

0     142123
15       266
5         38
10        33
1          9
8          8
4          8
14         7
3          5
13         4
9          1
2          1
Name: Count, dtype: int64

In [77]:
# Majority-vote label for every test row.
submit['Class'] = submit['Count'].map(set_label)
submit.head()

Unnamed: 0,ID,Class0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14,Count,Class
0,AAAA0x1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,AAAA0x2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AAAA0x5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,AAAA0x7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,AAAA0xc,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [78]:
# Drop everything between the first and the last column (the per-run votes
# and the vote count), leaving only the two outer columns.
columns = submit.columns[1:-1]
submit = submit.drop(columns=columns)
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [79]:
# Write the ensembled submission next to the notebook, without the index column.
submit.to_csv('./submit_EllipticEnvelope_ensseble.csv', index=False)

In [80]:
# Final label distribution of the submission.
submit.Class.value_counts()

0    142184
1       319
Name: Class, dtype: int64