In [2]:
import numpy as np
import os
import sys
import pandas as pd
import zipfile
import argparse
import requests
from tqdm import tqdm
sys.path.append("../")
from utils import *
random_state = 0

# Choose a dataset

We currently have 2 options

1. CelebA

2. Standford Chest X-ray

In [3]:
dirpath = '../data'

# CelebA

Code adopted from: https://github.com/taki0112/StarGAN-Tensorflow

## Download CelebA dataset

## Support functions

In [4]:
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()

    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)

    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)

    save_response_content(response, destination)
def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None
def save_response_content(response, destination, chunk_size=32 * 1024):
    total_size = int(response.headers.get('content-length', 0))
    with open(destination, "wb") as f:
        for chunk in tqdm(response.iter_content(chunk_size), total=total_size,
                          unit='B', unit_scale=True, desc=destination):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

## Download dataset

In [5]:
celebA_dir = os.path.join(dirpath, 'CelebA')
if not os.path.exists(celebA_dir):
    os.makedirs(celebA_dir)

In [None]:
file_name, drive_id = "img_align_celeba.zip", "0B7EVK8r0v71pZjFTYXZWM3FlRnM"
txt_name, txt_drive_id = "list_attr_celeba.txt", "0B7EVK8r0v71pblRyaVFSWGxPY0U"

save_path = os.path.join(dirpath, file_name)
txt_save_path = os.path.join(celebA_dir, txt_name)

if os.path.exists(txt_save_path):
    print('[*] {} already exists'.format(txt_save_path))
else:
    download_file_from_google_drive(txt_drive_id, txt_save_path)

if os.path.exists(save_path):
    print('[*] {} already exists'.format(save_path))
else:
    download_file_from_google_drive(drive_id, save_path)

with zipfile.ZipFile(save_path) as zf:
    zf.extractall(celebA_dir)

# os.remove(save_path)
os.rename(os.path.join(celebA_dir, 'img_align_celeba'), os.path.join(celebA_dir, 'images'))

## Final paths

In [14]:
# The data is already saved at:
#image_dir = '/pghbio/dbmi/batmanlab/singla/MICCAI_2019/GAN_Interpretability/data/celebA/images'


In [30]:
celebA_dir = os.path.join('../data', 'CelebA')
image_dir = os.path.join(celebA_dir,'images')
txt_dir = os.path.join(celebA_dir,'list_attr_celeba.txt')

In [31]:
print('Image Dir: ', image_dir)
print('Label File: ',txt_dir)

('Image Dir: ', '../data/CelebA/images')
('Label File: ', '../data/CelebA/list_attr_celeba.txt')


In [20]:
# Read label file:
fp = open(txt_dir, 'r')
for i in range(5):
    print(fp.readline())

202599

5_o_Clock_Shadow Arched_Eyebrows Attractive Bags_Under_Eyes Bald Bangs Big_Lips Big_Nose Black_Hair Blond_Hair Blurry Brown_Hair Bushy_Eyebrows Chubby Double_Chin Eyeglasses Goatee Gray_Hair Heavy_Makeup High_Cheekbones Male Mouth_Slightly_Open Mustache Narrow_Eyes No_Beard Oval_Face Pale_Skin Pointy_Nose Receding_Hairline Rosy_Cheeks Sideburns Smiling Straight_Hair Wavy_Hair Wearing_Earrings Wearing_Hat Wearing_Lipstick Wearing_Necklace Wearing_Necktie Young 

000001.jpg -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1  1  1 -1  1 -1 -1  1 -1 -1  1 -1 -1 -1  1  1 -1  1 -1  1 -1 -1  1

000002.jpg -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1

000003.jpg -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1



## Divide dataset into train and test set

In [16]:
all_images = os.listdir(image_dir)
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(all_images, test_size=0.33, random_state=random_state)
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
print(X_train.shape, X_test.shape)
np.save(os.path.join(celebA_dir, 'train_ids.npy'), X_train)
np.save(os.path.join(celebA_dir, 'test_ids.npy'), X_test)

  from .murmurhash import murmurhash3_32
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .sparsefuncs_fast import csr_row_norms
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


((135741,), (66858,))


  from ._random import sample_without_replacement


## Read Label File

In [32]:
txt_dir

'../data/CelebA/list_attr_celeba.txt'

In [36]:
categories, file_names_dict = read_data_file(txt_dir,image_dir)
categories = np.asarray(categories).ravel()
print(categories)

['5_o_Clock_Shadow' 'Arched_Eyebrows' 'Attractive' 'Bags_Under_Eyes'
 'Bald' 'Bangs' 'Big_Lips' 'Big_Nose' 'Black_Hair' 'Blond_Hair' 'Blurry'
 'Brown_Hair' 'Bushy_Eyebrows' 'Chubby' 'Double_Chin' 'Eyeglasses'
 'Goatee' 'Gray_Hair' 'Heavy_Makeup' 'High_Cheekbones' 'Male'
 'Mouth_Slightly_Open' 'Mustache' 'Narrow_Eyes' 'No_Beard' 'Oval_Face'
 'Pale_Skin' 'Pointy_Nose' 'Receding_Hairline' 'Rosy_Cheeks' 'Sideburns'
 'Smiling' 'Straight_Hair' 'Wavy_Hair' 'Wearing_Earrings' 'Wearing_Hat'
 'Wearing_Lipstick' 'Wearing_Necklace' 'Wearing_Necktie' 'Young']


In [37]:
print("Number of images: ", len(file_names_dict.keys()))
print("Few image names:")
file_names_dict.keys()[0:5]

('Number of images: ', 202599)
Few image names:


['../data/CelebA/images/141713.jpg',
 '../data/CelebA/images/149435.jpg',
 '../data/CelebA/images/094192.jpg',
 '../data/CelebA/images/099478.jpg',
 '../data/CelebA/images/196329.jpg']

In [38]:
label = file_names_dict[file_names_dict.keys()[0]]
print(type(label))
label = np.asarray(label)
print(label.ravel())

<type 'list'>
[-1. -1. -1. -1. -1. -1. -1.  1.  1. -1. -1. -1. -1.  1. -1. -1. -1. -1.
 -1.  1.  1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1.  1. -1. -1. -1. -1.
 -1. -1. -1. -1.]


## Create Binary-Classification Data file

In [39]:
# Convert the dictionary: attr_list to a dataframe
df = pd.DataFrame(file_names_dict).T
df['Image_Path'] = df.index
print(df.shape)
df.head(2)

(202599, 41)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,Image_Path
../data/CelebA/images/000001.jpg,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,../data/CelebA/images/000001.jpg
../data/CelebA/images/000002.jpg,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,../data/CelebA/images/000002.jpg


In [17]:
train_ids = np.load(os.path.join(celebA_dir,'train_ids.npy'))
test_ids = np.load(os.path.join(celebA_dir,'test_ids.npy'))
df_train = df.loc[df['Image_Path'].isin(train_ids)]
df_train = df_train.replace(-1, 0)
print(df_train.shape)
df_test = df.loc[df['Image_Path'].isin(test_ids)]
df_test = df_test.replace(-1, 0)
print(df_test.shape)
df_test.head(2)

(135741, 41)
(66858, 41)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,Image_Path
000002.jpg,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,000002.jpg
000003.jpg,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,000003.jpg


In [18]:
# Target attribute for binary classification
attribute = 'Young'
index = np.where(np.asarray(categories) == attribute)
index = index[0][0]
print(index)

39


## Write the label file for target attribute binary classification

In [20]:
#Train File
df_temp = df_train[['Image_Path', index]]
file_name = attribute+'_binary_classification_train.txt'
df_temp.to_csv(os.path.join(celebA_dir, file_name ),sep = ' ', index = None, header = None)
print(df_temp.shape)
one_line = str(df_temp.shape[0]) + '\n'
second_line = attribute+ "\n"
with open(os.path.join(celebA_dir, file_name), 'r+') as fp:
    lines = fp.readlines()     # lines is list of line, each element '...\n'
    lines.insert(0, one_line)  # you can use any index if you know the line index
    lines.insert(1, second_line)
    fp.seek(0)                 # file pointer locates at the beginning to write the whole file again
    fp.writelines(lines) 
fp = open(os.path.join(celebA_dir, file_name), 'rw')
print(fp.readline())
print(fp.readline())
print(fp.readline())
print(fp.readline())
print(fp.readline())
print(fp.readline())
fp.close()

(135741, 2)
135741

Young

000001.jpg 1.0

000005.jpg 1.0

000006.jpg 1.0

000010.jpg 1.0



In [21]:
#Test File
df_temp = df_test[['Image_Path', index]]
file_name = attribute+'_binary_classification_test.txt'
df_temp.to_csv(os.path.join(celebA_dir, file_name ),sep = ' ', index = None, header = None)
print(df_temp.shape)
one_line = str(df_temp.shape[0]) + '\n'
second_line = attribute+ "\n"
with open(os.path.join(celebA_dir, file_name), 'r+') as fp:
    lines = fp.readlines()     # lines is list of line, each element '...\n'
    lines.insert(0, one_line)  # you can use any index if you know the line index
    lines.insert(1, second_line)
    fp.seek(0)                 # file pointer locates at the beginning to write the whole file again
    fp.writelines(lines) 

(66858, 2)


### Read saved files

In [22]:
image_dir = '/pghbio/dbmi/batmanlab/singla/MICCAI_2019/GAN_Interpretability/data/celebA/images'
file_name = attribute+'_binary_classification_test.txt'
categories, file_names_dict = read_data_file(os.path.join(celebA_dir, file_name),image_dir)
categories = np.asarray(categories).ravel()
print(categories)

['Young']


In [23]:
print("Number of images: ", len(file_names_dict.keys()))
print("Few image names:")
file_names_dict.keys()[0:5]

('Number of images: ', 66858)
Few image names:


['/pghbio/dbmi/batmanlab/singla/MICCAI_2019/GAN_Interpretability/data/celebA/images/202579.jpg',
 '/pghbio/dbmi/batmanlab/singla/MICCAI_2019/GAN_Interpretability/data/celebA/images/049289.jpg',
 '/pghbio/dbmi/batmanlab/singla/MICCAI_2019/GAN_Interpretability/data/celebA/images/037112.jpg',
 '/pghbio/dbmi/batmanlab/singla/MICCAI_2019/GAN_Interpretability/data/celebA/images/161036.jpg',
 '/pghbio/dbmi/batmanlab/singla/MICCAI_2019/GAN_Interpretability/data/celebA/images/191775.jpg']

In [24]:
label = file_names_dict[file_names_dict.keys()[0]]
print(type(label))
label = np.asarray(label)
print(label.ravel())

<type 'list'>
[1.]


# Chest X-Ray

# Create file for 