In [1]:
import pandas as pd
import numpy as np

In [2]:
preds_t = np.load('../cache/sub_class_preds-33-1.npy')

In [3]:
name_label_dict = {
0:  'Nucleoplasm',
1:  'Nuclear membrane',
2:  'Nucleoli',   
3:  'Nucleoli fibrillar center',
4:  'Nuclear speckles',
5:  'Nuclear bodies',
6:  'Endoplasmic reticulum',   
7:  'Golgi apparatus',
8:  'Peroxisomes',
9:  'Endosomes',
10:  'Lysosomes',
11:  'Intermediate filaments',
12:  'Actin filaments',
13:  'Focal adhesion sites',   
14:  'Microtubules',
15:  'Microtubule ends',  
16:  'Cytokinetic bridge',   
17:  'Mitotic spindle',
18:  'Microtubule organizing center',  
19:  'Centrosome',
20:  'Lipid droplets',
21:  'Plasma membrane',   
22:  'Cell junctions', 
23:  'Mitochondria',
24:  'Aggresome',
25:  'Cytosol',
26:  'Cytoplasmic bodies',   
27:  'Rods & rings' }

In [7]:
PATH = './'
TRAIN = '../data/train/'
ED = '../data/external_data_1/'
TEST = '../data/test/'
LABELS = '../data/train.csv'
SAMPLE = '../data/sample_submission.csv'
ED_LABELS='../data/external_data.csv'

In [10]:
def save_pred(pred, th=0.5, fname='../submissions/sub33-mag.csv'):
    pred_list = []
    for line in pred:
        s = ' '.join(list([str(i) for i in np.nonzero(line>th)[0]]))
        pred_list.append(s)
        
    sample_df = pd.read_csv(SAMPLE)
    sample_list = list(sample_df.Id)
#     pred_dic = dict((key, value) for (key, value) 
#                 in zip(learner.data.test_ds.fnames,pred_list))
#     pred_list_cor = [pred_dic[id] for id in sample_list]
    df = pd.DataFrame({'Id':sample_list,'Predicted':pred_list})
    df.to_csv(fname, header=True, index=False)

In [11]:
th_t = np.array([0.565,0.39,0.55,0.345,0.33,0.39,0.33,0.45,0.38,0.39,
               0.34,0.42,0.31,0.38,0.49,0.50,0.38,0.43,0.46,0.40,
               0.39,0.505,0.37,0.47,0.41,0.545,0.32,0.1])
print('Fractions: ',(preds_t > th_t).mean(axis=0))
save_pred(preds_t,th_t)

Fractions:  [0.4220646  0.03999316 0.0896428  0.04289865 0.04674415 0.07366262
 0.03802769 0.0819518  0.00145274 0.00170911 0.00128183 0.04007862
 0.02734575 0.01880021 0.0483678  0.         0.0374295  0.02683302
 0.04110408 0.05768245 0.00880191 0.17262006 0.02674756 0.08408819
 0.01128012 0.32669629 0.02238934 0.        ]


In [12]:
lb_prob = [
 0.362397820,0.043841336,0.075268817,0.059322034,0.075268817,
 0.075268817,0.043841336,0.075268817,0.010000000,0.010000000,
 0.010000000,0.043841336,0.043841336,0.014198783,0.043841336,
 0.010000000,0.028806584,0.014198783,0.028806584,0.059322034,
 0.010000000,0.126126126,0.028806584,0.075268817,0.010000000,
 0.222493880,0.028806584,0.010000000]

In [13]:
def Count_soft(preds,th=0.5,d=50.0):
    preds = sigmoid_np(d*(preds - th))
    return preds.mean(axis=0)

def fit_test(x,y):
    params = 0.5*np.ones(len(name_label_dict))
    wd = 1e-5
    error = lambda p: np.concatenate((Count_soft(x,p) - y,
                                      wd*(p - 0.5)), axis=None)
    p, success = opt.leastsq(error, params)
    return p

In [17]:
from sklearn.metrics import f1_score
import scipy.optimize as opt

def sigmoid_np(x):
    return 1.0/(1.0 + np.exp(-x))

th_t = fit_test(preds_t,lb_prob)
th_t[th_t<0.1] = 0.1
print('Thresholds: ',th_t)
print('Fractions: ',(preds_t > th_t).mean(axis=0))
print('Fractions (th = 0.5): ',(preds_t > 0.5).mean(axis=0))

Thresholds:  [0.81591315 0.28877957 0.78879537 0.15943895 0.11398476 0.37912191
 0.26305797 0.54842994 0.10616127 0.10441183 0.1        0.34769215
 0.1268775  0.65420633 0.64510756 0.1        0.47928241 0.73931002
 0.6669014  0.3855292  0.3143061  0.78808739 0.33254019 0.60723809
 0.52003264 0.8394617  0.22158424 0.1       ]
Fractions:  [0.36258759 0.04383866 0.07554264 0.05742608 0.0694753  0.07511536
 0.04324047 0.0756281  0.00470005 0.00410186 0.00316185 0.04409503
 0.04101863 0.01427106 0.04383866 0.00059819 0.02845667 0.01410015
 0.02896941 0.05956247 0.00991284 0.12596137 0.02888395 0.07537173
 0.00991284 0.22235515 0.0280294  0.        ]
Fractions (th = 0.5):  [0.43667749 0.03802769 0.09280465 0.03554948 0.0405059  0.06332251
 0.02768757 0.07896086 0.00111092 0.00128183 0.00102547 0.03708768
 0.02196206 0.016322   0.04794052 0.         0.02700393 0.02307298
 0.03794223 0.0491369  0.00717826 0.17364553 0.02196206 0.08169544
 0.01008375 0.34412921 0.01623654 0.        ]


In [19]:
save_pred(preds_t,th_t,'../submissions/sub33-mag-f.csv')

In [22]:
# save_pred(preds_t,th,'../submissions/sub33-mag-v.csv')
save_pred(preds_t,0.5,'../submissions/sub33-mag-05.csv')

In [24]:
class_list = [8,9,10,15,20,24,27]
for i in class_list:
    th_t[i] = 0.35
save_pred(preds_t,th_t,'../submissions/sub33-mag-c.csv')

In [25]:
labels = pd.read_csv(LABELS).set_index('Id')
label_count = np.zeros(len(name_label_dict))
for label in labels['Target']:
    l = [int(i) for i in label.split()]
    label_count += np.eye(len(name_label_dict))[l].sum(axis=0)
label_fraction = label_count.astype(np.float)/len(labels)
label_count, label_fraction

(array([1.2885e+04, 1.2540e+03, 3.6210e+03, 1.5610e+03, 1.8580e+03,
        2.5130e+03, 1.0080e+03, 2.8220e+03, 5.3000e+01, 4.5000e+01,
        2.8000e+01, 1.0930e+03, 6.8800e+02, 5.3700e+02, 1.0660e+03,
        2.1000e+01, 5.3000e+02, 2.1000e+02, 9.0200e+02, 1.4820e+03,
        1.7200e+02, 3.7770e+03, 8.0200e+02, 2.9650e+03, 3.2200e+02,
        8.2280e+03, 3.2800e+02, 1.1000e+01]),
 array([4.14682029e-01, 4.03578785e-02, 1.16535788e-01, 5.02381565e-02,
        5.97966014e-02, 8.08766735e-02, 3.24407827e-02, 9.08213182e-02,
        1.70571576e-03, 1.44824923e-03, 9.01132853e-04, 3.51763646e-02,
        2.21421215e-02, 1.72824408e-02, 3.43074150e-02, 6.75849640e-04,
        1.70571576e-02, 6.75849640e-03, 2.90293512e-02, 4.76956746e-02,
        5.53553038e-03, 1.21556385e-01, 2.58110196e-02, 9.54235324e-02,
        1.03630278e-02, 2.64804325e-01, 1.05561277e-02, 3.54016478e-04]))

In [27]:
th_t = fit_test(preds_t,label_fraction)
th_t[th_t<0.05] = 0.05
print('Thresholds: ',th_t)
print('Fractions: ',(preds_t > th_t).mean(axis=0))
save_pred(preds_t,th_t,'../submissions/sub33-mag-t.csv')

Thresholds:  [0.59822428 0.38594474 0.29164818 0.22857615 0.1805336  0.33709688
 0.413633   0.36968473 0.3242188  0.45159205 0.54964007 0.5467044
 0.49187499 0.44022057 0.96584261 0.15993161 0.72001566 0.99472443
 0.66236925 0.52605875 0.62532512 0.81780978 0.38945425 0.35459486
 0.4900673  0.71981957 0.76618942 0.49715037]
Fractions:  [4.15228166e-01 4.02495300e-02 1.16646727e-01 4.94787216e-02
 5.91351906e-02 8.04990600e-02 3.22167151e-02 9.09246283e-02
 1.70910955e-03 1.45274312e-03 8.54554777e-04 3.51222013e-02
 2.21329687e-02 1.73474620e-02 3.43531020e-02 3.41821911e-04
 1.70910955e-02 6.83643822e-03 2.92257734e-02 4.74277901e-02
 5.64006153e-03 1.21432234e-01 2.57220988e-02 9.51974022e-02
 1.02546573e-02 2.64570159e-01 1.05110238e-02 0.00000000e+00]


In [2]:
import requests
import pandas as pd

In [2]:
colors = ['red','green','blue','yellow']
DIR = "../data/HPAv18/jpg/"
v18_url = 'http://v18.proteinatlas.org/images/'

In [3]:
imgList = pd.read_csv("../data/HPAv18RBGY_wodpl.csv")

In [4]:
len(imgList)

74606

In [33]:
for i in imgList['Id'][:5]: # [:5] means downloard only first 5 samples, if it works, please remove it
    img = i.split('_')
    for color in colors:
        img_path = img[0] + '/' + "_".join(img[1:]) + "_" + color + ".jpg"
        img_name = i + "_" + color + ".jpg"
        img_url = v18_url + img_path
        r = requests.get(img_url, allow_redirects=True)
        open(DIR + img_name, 'wb').write(r.content)

In [34]:
!du -sh {DIR}

3.0M	../data/HPAv18/jpg/


In [35]:
!ls -a {DIR}

.			    10580_1756_B1_1_green.jpg
..			    10580_1756_B1_1_red.jpg
10580_1610_C1_1_blue.jpg    10580_1756_B1_1_yellow.jpg
10580_1610_C1_1_green.jpg   10580_1756_B1_2_blue.jpg
10580_1610_C1_1_red.jpg     10580_1756_B1_2_green.jpg
10580_1610_C1_1_yellow.jpg  10580_1756_B1_2_red.jpg
10580_1610_C1_2_blue.jpg    10580_1756_B1_2_yellow.jpg
10580_1610_C1_2_green.jpg   10580_1758_B1_1_blue.jpg
10580_1610_C1_2_red.jpg     10580_1758_B1_1_green.jpg
10580_1610_C1_2_yellow.jpg  10580_1758_B1_1_red.jpg
10580_1756_B1_1_blue.jpg    10580_1758_B1_1_yellow.jpg


In [10]:
from tqdm import tqdm
for idx, i in tqdm(enumerate(imgList['Id'])): # [:5] means downloard only first 5 samples, if it works, please remove it
    if idx < 218:
        continue
    img = i.split('_')
    for color in colors:
        img_path = img[0] + '/' + "_".join(img[1:]) + "_" + color + ".jpg"
        img_name = i + "_" + color + ".jpg"
        img_url = v18_url + img_path
        r = requests.get(img_url, allow_redirects=True)
        open(DIR + img_name, 'wb').write(r.content)

543it [47:02,  5.20s/it]

KeyboardInterrupt: 

In [5]:
imgList.iloc[0]['Id']

'10580_1610_C1_1'

In [6]:
# !pip install joblib


In [11]:
from pathlib import Path


img_name = '10580_1610_C1_1'+'_red.jpg'
my_file = Path(DIR+img_name)
print(my_file)
if my_file.exists():
    print('exists')

../data/HPAv18/jpg/10580_1610_C1_1_red.jpg
exists


In [14]:
from joblib import Parallel, delayed
from pathlib import Path
import requests
import pandas as pd

def save_to_dir(i):
#     i = imgList.iloc[ii]['Id']
    img = i.split('_')
    for color in colors:
        img_path = img[0] + '/' + "_".join(img[1:]) + "_" + color + ".jpg"
        img_name = i + "_" + color + ".jpg"
        my_file = Path(DIR+img_name)
        if my_file.exists():
            continue
        img_url = v18_url + img_path
        r = requests.get(img_url, allow_redirects=True)
        open(DIR + img_name, 'wb').write(r.content)
num_cores = 8
Parallel(n_jobs=num_cores, prefer="threads")(delayed(save_to_dir)(i) for i in imgList['Id'])

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [16]:
74606 + 31072

105678

In [17]:
105678 * 4

422712