In [1]:
import numpy as np
import os
import sys
import pandas as pd
import zipfile
import argparse
import requests
from tqdm import tqdm
# Add the parent directory to the module search path before importing `utils`.
sys.path.append("../")
# NOTE(review): wildcard import hides where names come from; `read_data_file`
# (called further down) is presumably defined in `utils` -- confirm.
from utils import *
# Seed handed to train_test_split below so the train/test split is repeatable.
random_state = 0

# Choose a dataset

We currently have 2 options

1. CelebA

2. Stanford Chest X-ray

In [2]:
# Root data folder (relative to this notebook); the dataset-specific paths below are joined onto it.
dirpath = '../data'

# CelebA

Code adapted from: https://github.com/taki0112/StarGAN-Tensorflow

## Download CelebA dataset

## Support functions

In [5]:
def download_file_from_google_drive(id, destination):
    """Download a Google Drive file and store it at ``destination``.

    Args:
        id: Google Drive file id. The parameter name shadows the builtin
            ``id`` but is kept so existing callers keep working.
        destination: Local path the downloaded bytes are written to.

    The first request may come back with a ``download_warning`` cookie
    (see ``get_confirm_token``). When it does, the request is repeated with
    that cookie's value as the ``confirm`` parameter before the body is saved.
    """
    URL = "https://docs.google.com/uc?export=download"

    # Use the session as a context manager so its connections are released
    # once the streamed body has been fully written to disk.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)

        if token:
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        # Must happen inside the `with`: the body is streamed lazily.
        save_response_content(response, destination)
def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, or None.

    Args:
        response: Object exposing a ``cookies`` mapping with ``items()``.

    Returns:
        The value of the first cookie whose name starts with
        ``'download_warning'``; ``None`` when no such cookie is present.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)
def save_response_content(response, destination, chunk_size=32 * 1024):
    """Stream the body of ``response`` into the file ``destination``.

    Args:
        response: Streamed HTTP response providing ``headers`` and
            ``iter_content(chunk_size)``.
        destination: Path of the file to write (opened in binary mode).
        chunk_size: Number of bytes requested per chunk.

    The progress bar is measured in bytes, so it is advanced by the size of
    every chunk written. Wrapping the chunk iterator directly would advance
    it by one per chunk while ``total`` is a byte count.
    """
    total_size = int(response.headers.get('content-length', 0))
    # An unknown length (missing header -> 0) is passed as None so the bar
    # shows a running byte counter instead of a 0-byte total.
    with open(destination, "wb") as f, \
            tqdm(total=total_size or None, unit='B', unit_scale=True,
                 desc=destination) as progress:
        for chunk in response.iter_content(chunk_size):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                progress.update(len(chunk))

## Download dataset

In [6]:
# Folder that receives the CelebA label file and images; create it on first use.
celebA_dir = os.path.join(dirpath, 'CelebA')
celebA_dir_missing = not os.path.exists(celebA_dir)
if celebA_dir_missing:
    os.makedirs(celebA_dir)

In [None]:
# Google Drive ids of the image archive and of the attribute label file.
file_name, drive_id = "img_align_celeba.zip", "0B7EVK8r0v71pZjFTYXZWM3FlRnM"
txt_name, txt_drive_id = "list_attr_celeba.txt", "0B7EVK8r0v71pblRyaVFSWGxPY0U"

save_path = os.path.join(dirpath, file_name)
txt_save_path = os.path.join(celebA_dir, txt_name)
# The archive unpacks into 'img_align_celeba', which is then renamed to 'images'.
extracted_dir = os.path.join(celebA_dir, 'img_align_celeba')
renamed_dir = os.path.join(celebA_dir, 'images')

if os.path.exists(txt_save_path):
    print('[*] {} already exists'.format(txt_save_path))
else:
    download_file_from_google_drive(txt_drive_id, txt_save_path)

if os.path.exists(save_path):
    print('[*] {} already exists'.format(save_path))
else:
    download_file_from_google_drive(drive_id, save_path)

# Guard extraction and rename so the cell can be re-run: without the check a
# second run unpacks everything again and os.rename then fails because
# 'images' already exists.
if os.path.exists(renamed_dir):
    print('[*] {} already exists'.format(renamed_dir))
else:
    with zipfile.ZipFile(save_path) as zf:
        zf.extractall(celebA_dir)
    os.rename(extracted_dir, renamed_dir)

# The zip archive is kept on disk, so later runs skip the download above.

## Final paths

In [7]:
# Paths used by the rest of the notebook. They are built from `dirpath` (the
# same root the download cells use) rather than a second hard-coded
# '../data', so changing the data root in one place is enough.
celebA_dir = os.path.join(dirpath, 'CelebA')
image_dir = os.path.join(celebA_dir, 'images')
# NOTE: despite its name, `txt_dir` is the path of the label *file*.
txt_dir = os.path.join(celebA_dir, 'list_attr_celeba.txt')

In [8]:
# Build one formatted string per line: a multi-argument print(...) is shown
# as a tuple repr under Python 2, while this prints the same on Python 2 and 3.
print('Image Dir: {}'.format(image_dir))
print('Label File: {}'.format(txt_dir))

('Image Dir: ', '../data/CelebA/images')
('Label File: ', '../data/CelebA/list_attr_celeba.txt')


In [9]:
# Read label file:
fp = open(txt_dir, 'r')
for i in range(5):
    print(fp.readline())

202599

5_o_Clock_Shadow Arched_Eyebrows Attractive Bags_Under_Eyes Bald Bangs Big_Lips Big_Nose Black_Hair Blond_Hair Blurry Brown_Hair Bushy_Eyebrows Chubby Double_Chin Eyeglasses Goatee Gray_Hair Heavy_Makeup High_Cheekbones Male Mouth_Slightly_Open Mustache Narrow_Eyes No_Beard Oval_Face Pale_Skin Pointy_Nose Receding_Hairline Rosy_Cheeks Sideburns Smiling Straight_Hair Wavy_Hair Wearing_Earrings Wearing_Hat Wearing_Lipstick Wearing_Necklace Wearing_Necktie Young 

000001.jpg -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1  1  1 -1  1 -1 -1  1 -1 -1  1 -1 -1 -1  1  1 -1  1 -1  1 -1 -1  1

000002.jpg -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1

000003.jpg -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1



## Divide dataset into train and test set

In [None]:
from sklearn.model_selection import train_test_split

# os.listdir returns names in arbitrary, filesystem-dependent order. Sort
# them so that the seeded split below is identical on every machine.
all_images = sorted(os.listdir(image_dir))

# Hold out 33% of the image file names as the test set.
X_train, X_test = train_test_split(all_images, test_size=0.33, random_state=random_state)
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
print('train: {}, test: {}'.format(X_train.shape, X_test.shape))

# Persist the file-name splits next to the dataset.
np.save(os.path.join(celebA_dir, 'train_ids.npy'), X_train)
np.save(os.path.join(celebA_dir, 'test_ids.npy'), X_test)

## Read Label File

In [10]:
# Show the label-file path that is passed to read_data_file below.
txt_dir

'../data/CelebA/list_attr_celeba.txt'

In [11]:
# Parse the attribute file: the attribute names plus a dict keyed by image
# file name (see the cells below, which index it that way).
parsed_names, file_names_dict = read_data_file(txt_dir)
# Flatten the names into a 1-D array.
categories = np.ravel(np.asarray(parsed_names))
print(categories)

['5_o_Clock_Shadow' 'Arched_Eyebrows' 'Attractive' 'Bags_Under_Eyes'
 'Bald' 'Bangs' 'Big_Lips' 'Big_Nose' 'Black_Hair' 'Blond_Hair' 'Blurry'
 'Brown_Hair' 'Bushy_Eyebrows' 'Chubby' 'Double_Chin' 'Eyeglasses'
 'Goatee' 'Gray_Hair' 'Heavy_Makeup' 'High_Cheekbones' 'Male'
 'Mouth_Slightly_Open' 'Mustache' 'Narrow_Eyes' 'No_Beard' 'Oval_Face'
 'Pale_Skin' 'Pointy_Nose' 'Receding_Hairline' 'Rosy_Cheeks' 'Sideburns'
 'Smiling' 'Straight_Hair' 'Wavy_Hair' 'Wearing_Earrings' 'Wearing_Hat'
 'Wearing_Lipstick' 'Wearing_Necklace' 'Wearing_Necktie' 'Young']


In [12]:
# len() on the dict and list(...) before slicing work on both Python 2 and 3;
# dict.keys() is a non-subscriptable view on Python 3.
print("Number of images: {}".format(len(file_names_dict)))
print("Few image names:")
list(file_names_dict)[:5]

('Number of images: ', 202599)
Few image names:


['083541.jpg', '073252.jpg', '039856.jpg', '080233.jpg', '127099.jpg']

In [13]:
# Take the first key via the iterator: dict.keys()[0] only works on Python 2.
first_image_name = next(iter(file_names_dict))
label = file_names_dict[first_image_name]
print(type(label))
label = np.asarray(label)
print(label.ravel())

<type 'list'>
[-1.  1.  1.  1. -1. -1.  1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
  1.  1. -1.  1. -1. -1.  1. -1. -1. -1. -1. -1. -1.  1. -1.  1.  1. -1.
  1. -1. -1.  1.]


## Create Binary-Classification Data file

In [25]:
# Convert the dictionary: attr_list to a dataframe
df = pd.DataFrame(file_names_dict).T
df['Image_Path'] = df.index
print(df.shape)
df.head(2)

(202599, 41)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,Image_Path
000001.jpg,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,000001.jpg
000002.jpg,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,000002.jpg


In [27]:
# Target attribute for binary classification
attribute = ['Young']
index_main = []
for a in attribute:
    print(a)
    index = np.where(np.asarray(categories) == a)
    index = index[0][0]
    index_main.append(index)
print(index_main)

Young
[39]


## Write the label file for target attribute binary classification

In [29]:
#Train File
df_temp = df[['Image_Path']+ index_main]
file_name = ''.join(attribute)+'_binary_classification.txt'
df_temp.to_csv(os.path.join(celebA_dir, file_name ),sep = ' ', index = None, header = None)
print(df_temp.shape)
one_line = str(df_temp.shape[0]) + '\n'
second_line = ''.join(attribute)+ "\n"
with open(os.path.join(celebA_dir, file_name), 'r+') as fp:
    lines = fp.readlines()     # lines is list of line, each element '...\n'
    lines.insert(0, one_line)  # you can use any index if you know the line index
    lines.insert(1, second_line)
    fp.seek(0)                 # file pointer locates at the beginning to write the whole file again
    fp.writelines(lines) 
fp = open(os.path.join(celebA_dir, file_name), 'rw')
print(fp.readline())
print(fp.readline())
print(fp.readline())
print(fp.readline())
print(fp.readline())
print(fp.readline())
fp.close()

(202599, 2)
202599

Young

000001.jpg 1.0

000002.jpg 1.0

000003.jpg 1.0

000004.jpg 1.0



### Read saved files

In [32]:
# `attribute` is a list, so it must be joined into a string before the suffix
# is appended (list + str raises TypeError). This builds the same file name
# the writing cell uses.
file_name = ''.join(attribute) + '_binary_classification.txt'
# NOTE: this rebinds `categories` and `file_names_dict` to the contents of the
# single-attribute file; the cells below rely on that.
categories, file_names_dict = read_data_file(os.path.join(celebA_dir, file_name), image_dir)
categories = np.asarray(categories).ravel()
print(categories)

['Young']


In [None]:
# len() on the dict and list(...) before slicing work on both Python 2 and 3;
# dict.keys() is a non-subscriptable view on Python 3.
print("Number of images: {}".format(len(file_names_dict)))
print("Few image names:")
list(file_names_dict)[:5]

In [34]:
# Take the first key via the iterator: dict.keys()[0] only works on Python 2.
first_image_name = next(iter(file_names_dict))
label = file_names_dict[first_image_name]
print(type(label))
label = np.asarray(label)
print(label.ravel())

<type 'list'>
[1.]
