In [None]:
import pandas as pd, numpy as np, os
from sklearn.metrics import roc_auc_score,log_loss
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

In [None]:

# Create the output folders used below. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate exists() test.
for out_dir in ("../ensemble/", "../sub/"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
def make_oof(exp_names,n_splits=5):
    """Write one out-of-fold (OOF) prediction CSV per experiment.

    For each name in ``exp_names`` the validation results of folds
    ``0 .. n_splits-1`` are read from
    ``../result/<exp_name>/<fold>/val_rgb_results.npz``, concatenated in fold
    order, turned into class probabilities with a softmax, and saved to
    ``../ensemble/<exp_name>.csv`` with the columns
    ``Image_id, target, blast, brown, healthy``.

    An experiment whose CSV already exists is skipped, so the function can be
    re-run cheaply; delete the CSV to force a rebuild.

    Args:
        exp_names: iterable of experiment (result folder) names.
        n_splits: number of folds to read per experiment.

    Returns:
        None. The result is the CSV files written to ``../ensemble/``.
    """
    for exp_name in exp_names:
        out_path = f"../ensemble/{exp_name}.csv"
        if os.path.exists(out_path):
            continue

        logits = []
        labels = []
        indexes = []
        for fold in range(n_splits):
            results_path = f"../result/{exp_name}/{fold}/val_rgb_results.npz"
            # Use the archive as a context manager so its file handle is closed;
            # indexing an NpzFile returns fully loaded arrays.
            with np.load(results_path) as results:
                logits.append(results["pred_logit"])
                labels.append(results["label"])
                indexes.append(results["original_index"])

        logits = torch.tensor(np.concatenate(logits, axis=0), dtype=torch.float64)
        # Explicit class axis: calling softmax without `dim` is deprecated and
        # warns. dim=1 is what the implicit call resolved to for 2-D logits.
        preds = F.softmax(logits, dim=1).numpy()

        labels = np.concatenate(labels, axis=0)
        indexes = np.concatenate(indexes, axis=0)
        # column_stack turns the 1-D id/label vectors into single columns.
        table = np.column_stack([indexes, labels, preds])
        df = pd.DataFrame(table,
                          columns=["Image_id","target","blast","brown","healthy"])
        df.to_csv(out_path, index=False)
    

In [None]:

# Experiments (result folder names) that take part in the ensemble.
exp_names = [
    "beit_224aug",
    "deit_384",
    "swin_base_384v2",
    "swin_large_192v2",
    "swin_base_256v2cv",
    "vit_r50",
    "cnvxt_384",
    "vit_384_cv",
    "vit_224_cv",
]
# Build any OOF CSV that does not exist yet.
make_oof(exp_names)

In [None]:
# Folder holding the per-model OOF CSVs, and a listing of its contents.
PATH = '../ensemble/'
FILES = os.listdir(PATH)

# Sorted names of the files in PATH whose name contains "oof".
oof_candidates = [name for name in FILES if 'oof' in name]
OOF = np.sort(oof_candidates)


In [None]:
# Use the experiment list itself as the OOF model names. This replaces the
# value obtained from the directory scan in the previous cell.
OOF=exp_names

In [None]:
# Load every model's OOF table from PATH/<name>.csv, in the order given by OOF.
OOF_CSV = []
for name in OOF:
    OOF_CSV.append(pd.read_csv(PATH + name + ".csv"))

print('We have %i oof files...' % len(OOF))
print()
print(OOF)


In [None]:
# Stack the OOF probabilities into x with shape (rows, models, classes).
# Columns 0 and 1 of each table are the image id and the label; the remaining
# columns are the per-class probabilities.
n_rows = len(OOF_CSV[0])
n_classes = OOF_CSV[0].shape[1] - 2
ref_ids = OOF_CSV[0].iloc[:, 0].values

x = np.zeros((n_rows, len(OOF), n_classes))
for k in range(len(OOF)):
    # Models are combined row by row and scored against the labels of the
    # first table only, so every table must list the same images in the same
    # order. Fail loudly instead of blending misaligned predictions.
    if not np.array_equal(OOF_CSV[k].iloc[:, 0].values, ref_ids):
        raise ValueError(
            'OOF rows of %s are not aligned with %s (image ids differ)'
            % (OOF[k], OOF[0])
        )
    x[:, k, :] = OOF_CSV[k].iloc[:, 2:].values

# Ground-truth labels, taken from the first OOF table.
TRUE = OOF_CSV[0].target.values

In [None]:
# Sanity check: x was allocated as (rows, models, class columns).
x.shape

In [None]:
# Log loss of every single model on the OOF predictions.
# NOTE: `all` shadows the Python builtin; the name is kept because the next
# cell reads it.
all = []
for model_idx in range(x.shape[1]):
    model_loss = log_loss(TRUE, x[:, model_idx])
    all.append(model_loss)
    print('%s = %.4f' % (OOF[model_idx], model_loss))

# m: indices of the models in the ensemble, seeded with the best single model.
# w: blend weight used when each later model was added (len(w) == len(m) - 1).
m = [np.argmin(all)]
w = []

In [None]:
old = np.min(all)    # best single-model log loss: starting point of the search
RES = 400            # weights tried per candidate model: j/RES for j = 0..RES-1
PATIENCE = 10        # non-improving weights tolerated before giving up on a model
TOL = 0.000          # a model is only added if it lowers the loss by more than this
DUPLICATES = False   # if True, a model already in the ensemble may be added again


def _blend(preds, members, weights):
    """Rebuild the ensemble: start from members[0], then fold in each later
    member i with weights[i-1]: blend = weight*model + (1-weight)*blend."""
    blended = preds[:, members[0]]
    for weight, member in zip(weights, members[1:]):
        blended = weight * preds[:, member] + (1 - weight) * blended
    return blended


print('Ensemble LogLoss= %.4f by beginning with model %i'%(old,m[0]))
print()

for kk in range(len(OOF)):

    # BUILD CURRENT ENSEMBLE
    md = _blend(x, m, w)

    # FIND MODEL TO ADD
    # Start from +inf rather than a fixed sentinel so any finite loss counts.
    mx = np.inf; mx_k = 0; mx_w = 0
    print('Searching for best model to add... ')

    # TRY ADDING EACH MODEL
    for k in range(x.shape[1]):
        print(k,', ',end='')
        if not DUPLICATES and (k in m): continue

        # EVALUATE ADDING MODEL K WITH WEIGHTS W
        # ct counts every non-improving weight (it is never reset); the scan
        # for this model stops once it exceeds PATIENCE.
        bst_j = 0; bst = np.inf; ct = 0
        for j in range(RES):
            tmp = j/RES*x[:,k] + (1-j/RES)*md
            loss = log_loss(TRUE,tmp)
            if loss<bst:
                bst = loss
                bst_j = j/RES
            else: ct += 1
            if ct>PATIENCE: break
        if bst<mx:
            mx = bst
            mx_k = k
            mx_w = bst_j

    # STOP IF decrease IS LESS THAN TOL
    # (also stops when no candidate was evaluated: mx is still +inf)
    dec = old-mx
    if dec<=TOL:
        print(); print('No decreasing. Stopping.')
        break

    # DISPLAY RESULTS
    print()
    print('Ensemble LogLoss = %.4f after adding model %i with weight %.3f. decrease of %.4f'%(mx,mx_k,mx_w,dec))
    print()

    old = mx; m.append(mx_k); w.append(mx_w)

# Rebuild once more so md always matches the final (m, w), including the case
# where the loop ran out of iterations right after adding a model.
md = _blend(x, m, w)

In [None]:
# Summary of the greedy search. `old` holds the final ensemble log loss
# (the search minimises log_loss), so it is labelled as LogLoss, not AUC.
print('We are using models',m)
print('with weights',w)
print('and achieve ensemble LogLoss = %.4f'%old)

In [None]:
# Save the blended OOF probabilities, reusing the id/label columns of the
# first OOF table. The blend must be written into the class-probability
# columns (everything after Image_id and target): `df.pred = md` only set a
# Python attribute because there is no 'pred' column, so the saved CSV still
# contained the first model's probabilities.
df = OOF_CSV[0].copy()
df.iloc[:, 2:] = md
df.to_csv('../ensemble/ensemble_oof.csv',index=False)

In [None]:
def make_sub(exp_names,n_splits=5):
    """Write one fold-averaged test submission CSV per experiment.

    For each name in ``exp_names`` the test logits of folds
    ``0 .. n_splits-1`` are read from
    ``../result/<exp_name>/<fold>/test_rgb_results.npz``, converted to
    probabilities with a softmax, and averaged over the folds. The averages
    replace every column after the first one of
    ``../input/SampleSubmission.csv`` and the table is saved to
    ``../sub/<exp_name>.csv``.

    An experiment whose CSV already exists is skipped; delete the CSV to
    force a rebuild.

    Args:
        exp_names: iterable of experiment (result folder) names.
        n_splits: number of folds to average per experiment.

    Returns:
        None. The result is the CSV files written to ``../sub/``.
    """
    for exp_name in exp_names:
        out_path = f"../sub/{exp_name}.csv"
        if os.path.exists(out_path):
            continue

        fold_probs = []
        for fold in range(n_splits):
            results_path = f"../result/{exp_name}/{fold}/test_rgb_results.npz"
            # Context manager closes the archive's file handle after reading.
            with np.load(results_path) as results:
                logits = torch.tensor(results["pred_logit"], dtype=torch.float64)
            # Explicit class axis: calling softmax without `dim` is deprecated
            # and warns. dim=1 is what the implicit call used for 2-D logits.
            fold_probs.append(F.softmax(logits, dim=1).numpy())

        # Put the folds on a new last axis and average over it.
        predictions = np.stack(fold_probs, axis=-1).mean(axis=-1)

        df_sub = pd.read_csv("../input/SampleSubmission.csv")
        # Replace the prediction columns as whole columns rather than writing
        # in place through iloc, so placeholder columns of integer dtype take
        # the float probabilities without a dtype-incompatibility warning.
        df_sub[df_sub.columns[1:]] = predictions
        df_sub.to_csv(out_path, index=False)
    

In [None]:
# Build the per-model submission CSVs in ../sub/ (existing files are skipped).
make_sub(exp_names)

In [None]:
# Sorted names from FILES (the listing taken in an earlier cell) that contain "sub".
sub_candidates = [name for name in FILES if 'sub' in name]
SUB = np.sort(sub_candidates)

In [None]:
# Use the experiment list itself as the submission names, in the same order as
# the OOF models. This replaces the value from the directory scan above.
SUB = exp_names

In [None]:

# Load every model's submission table from ../sub/<name>.csv, in SUB order.
SUB_CSV = []
for name in SUB:
    SUB_CSV.append(pd.read_csv("../sub/" + name + ".csv"))

print('We have %i submission files...' % len(SUB))
print()
print(SUB)

In [None]:
# Stack the test predictions into y with shape (rows, models, prediction columns).
# Column 0 of each submission is skipped; the rest are the predictions.
n_test_rows = len(SUB_CSV[0])
n_pred_cols = SUB_CSV[0].shape[1] - 1
y = np.zeros((n_test_rows, len(SUB), n_pred_cols))
for k in range(len(SUB)):
    sub_table = SUB_CSV[k]
    y[:, k] = sub_table.iloc[:, 1:].values

In [None]:
# Sanity check: y was allocated as (rows, models, prediction columns).
y.shape

In [None]:
# Apply the models (m) and weights (w) found on the OOF data to the test
# predictions, in the same order they were added to the ensemble.
md2 = y[:, m[0]]
for weight, k in zip(w, m[1:]):
    md2 = weight * y[:, k] + (1 - weight) * md2
    

In [None]:
# Sanity check: the blend has one row per test row and one column per prediction column of y.
md2.shape

In [None]:
# Write the final submission: keep the first column of the first submission
# table and replace all prediction columns with the blended test predictions.
# `df.target = md2` only set a Python attribute when the table has no 'target'
# column, leaving the first model's predictions in the saved file.
df = SUB_CSV[0].copy()
df.iloc[:, 1:] = md2
df.to_csv('../ensemble_last.csv',index=False)