In [20]:
import numpy as np
import os
import sys
import pandas as pd
import zipfile
import argparse
import requests
from tqdm import tqdm
sys.path.append("../")
from utils import *
random_state = 0

# Choose a dataset

We currently have 2 options

1. CelebA

2. Standford Chest X-ray

In [21]:
dirpath = '../data'

# CelebA

Code adopted from: https://github.com/taki0112/StarGAN-Tensorflow

## Download CelebA dataset

## Support functions

In [22]:
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()

    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)

    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)

    save_response_content(response, destination)
def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None
def save_response_content(response, destination, chunk_size=32 * 1024):
    total_size = int(response.headers.get('content-length', 0))
    with open(destination, "wb") as f:
        for chunk in tqdm(response.iter_content(chunk_size), total=total_size,
                          unit='B', unit_scale=True, desc=destination):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

## Download dataset

In [23]:
celebA_dir = os.path.join(dirpath, 'CelebA')
if not os.path.exists(celebA_dir):
    os.makedirs(celebA_dir)

In [24]:
celebA_dir

'../data/CelebA'

In [25]:
file_name, drive_id = "img_align_celeba.zip", "0B7EVK8r0v71pZjFTYXZWM3FlRnM"
txt_name, txt_drive_id = "list_attr_celeba.txt", "0B7EVK8r0v71pblRyaVFSWGxPY0U"

save_path = os.path.join(dirpath, file_name)
txt_save_path = os.path.join(celebA_dir, txt_name)

if os.path.exists(txt_save_path):
    print('[*] {} already exists'.format(txt_save_path))
else:
    download_file_from_google_drive(txt_drive_id, txt_save_path)

if os.path.exists(save_path):
    print('[*] {} already exists'.format(save_path))
else:
    download_file_from_google_drive(drive_id, save_path)

unzipped = os.path.exists(os.path.join(celebA_dir, 'img_align_celeba'))
if unzipped:
    print ('[*] already unzipped at {}.'.format(os.path.join(celebA_dir, 'img_align_celeba')))
else:
    with zipfile.ZipFile(save_path) as zf:
        zf.extractall(celebA_dir)
# os.remove(save_path)
my_images_dir = os.path.join(celebA_dir, 'images')
if os.path.exists(my_images_dir):
    print('[*] {} already exists'.format(my_images_dir))
else:
    os.rename(os.path.join(celebA_dir, 'img_align_celeba'), my_images_dir)

[*] ../data/CelebA/list_attr_celeba.txt already exists
[*] ../data/img_align_celeba.zip already exists
[*] already unzipped at ../data/CelebA/img_align_celeba.
[*] ../data/CelebA/images already exists


## Final paths

In [26]:
celebA_dir = os.path.join('../data', 'CelebA')
image_dir = os.path.join(celebA_dir,'images')
txt_dir = os.path.join(celebA_dir,'list_attr_celeba.txt')

In [27]:
print('Image Dir: ', image_dir)
print('Label File: ',txt_dir)

Image Dir:  ../data/CelebA/images
Label File:  ../data/CelebA/list_attr_celeba.txt


In [28]:
# Read label file:
fp = open(txt_dir, 'r')
for i in range(5):
    print(fp.readline())

202599

5_o_Clock_Shadow Arched_Eyebrows Attractive Bags_Under_Eyes Bald Bangs Big_Lips Big_Nose Black_Hair Blond_Hair Blurry Brown_Hair Bushy_Eyebrows Chubby Double_Chin Eyeglasses Goatee Gray_Hair Heavy_Makeup High_Cheekbones Male Mouth_Slightly_Open Mustache Narrow_Eyes No_Beard Oval_Face Pale_Skin Pointy_Nose Receding_Hairline Rosy_Cheeks Sideburns Smiling Straight_Hair Wavy_Hair Wearing_Earrings Wearing_Hat Wearing_Lipstick Wearing_Necklace Wearing_Necktie Young 

000001.jpg -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1  1  1 -1  1 -1 -1  1 -1 -1  1 -1 -1 -1  1  1 -1  1 -1  1 -1 -1  1

000002.jpg -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1

000003.jpg -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1



## Divide dataset into train and test set

In [29]:
all_images = os.listdir(image_dir)
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(all_images, test_size=0.33, random_state=random_state)
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
print(X_train.shape, X_test.shape)
np.save(os.path.join(celebA_dir, 'train_ids.npy'), X_train)
np.save(os.path.join(celebA_dir, 'test_ids.npy'), X_test)

(135741,) (66858,)


## Read Label File

In [30]:
txt_dir

'../data/CelebA/list_attr_celeba.txt'

In [31]:
categories, file_names_dict = read_data_file(txt_dir)
categories = np.asarray(categories).ravel()
print(categories)

['5_o_Clock_Shadow' 'Arched_Eyebrows' 'Attractive' 'Bags_Under_Eyes'
 'Bald' 'Bangs' 'Big_Lips' 'Big_Nose' 'Black_Hair' 'Blond_Hair' 'Blurry'
 'Brown_Hair' 'Bushy_Eyebrows' 'Chubby' 'Double_Chin' 'Eyeglasses'
 'Goatee' 'Gray_Hair' 'Heavy_Makeup' 'High_Cheekbones' 'Male'
 'Mouth_Slightly_Open' 'Mustache' 'Narrow_Eyes' 'No_Beard' 'Oval_Face'
 'Pale_Skin' 'Pointy_Nose' 'Receding_Hairline' 'Rosy_Cheeks' 'Sideburns'
 'Smiling' 'Straight_Hair' 'Wavy_Hair' 'Wearing_Earrings' 'Wearing_Hat'
 'Wearing_Lipstick' 'Wearing_Necklace' 'Wearing_Necktie' 'Young']


In [32]:
print("Number of images: ", len(file_names_dict.keys()))
print("Few image names:")
list(file_names_dict.keys())[0:5]

Number of images:  202599
Few image names:


['040892.jpg', '058624.jpg', '175595.jpg', '123543.jpg', '025808.jpg']

In [33]:
label = file_names_dict[list(file_names_dict.keys())[0]]
print(type(label))
label = np.asarray(label)
print(label.ravel())

<class 'list'>
[-1. -1. -1.  1. -1. -1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1.  1. -1. -1. -1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1.  1.]


## Create Binary-Classification Data file

In [34]:
# Convert the dictionary: attr_list to a dataframe
df = pd.DataFrame(file_names_dict).T
df['Image_Path'] = df.index
print(df.shape)
df.head(2)

(202599, 41)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,Image_Path
000001.jpg,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,000001.jpg
000002.jpg,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,000002.jpg


In [35]:
def find_attribute_index(attribute):
    # attribute: Target attribute for binary classification
    index_main = []
    for a in attribute:
        print(a)
        index = np.where(np.asarray(categories) == a)
        index = index[0][0]
        index_main.append(index)
    print(index_main)
    return index_main

## Write the label file for target attribute binary classification

In [36]:
def write_attribute_label_file(attribute):
    index_main = find_attribute_index(attribute)
    #Train File
    df_temp = df[['Image_Path']+ index_main]
    file_name = ''.join(attribute)+'_binary_classification.txt'
    df_temp.to_csv(os.path.join(celebA_dir, file_name ),sep = ' ', index = None, header = None)
    print(df_temp.shape)
    one_line = str(df_temp.shape[0]) + '\n'
    second_line = ''.join(attribute)+ "\n"
    with open(os.path.join(celebA_dir, file_name), 'r+') as fp:
        lines = fp.readlines()     # lines is list of line, each element '...\n'
        lines.insert(0, one_line)  # you can use any index if you know the line index
        lines.insert(1, second_line)
        fp.seek(0)                 # file pointer locates at the beginning to write the whole file again
        fp.writelines(lines) 

In [37]:
attributes = [['Smiling'], ['Young'], ['No_Beard'], ['Heavy_Makeup'], ['Black_Hair'], ['Bangs']]
for attribute in attributes:
     write_attribute_label_file(attribute)

Smiling
[31]
(202599, 2)
Young
[39]
(202599, 2)
No_Beard
[24]
(202599, 2)
Heavy_Makeup
[18]
(202599, 2)
Black_Hair
[8]
(202599, 2)
Bangs
[5]
(202599, 2)


### Read saved files

In [38]:
def read_saved_files(attribute):
    file_name = ''.join(attribute)+'_binary_classification.txt'
    categories, file_names_dict = read_data_file(os.path.join(celebA_dir, file_name), image_dir)
    categories = np.asarray(categories).ravel()
    print(categories)
    
    print("Number of images: ", len(file_names_dict.keys()))
    print("Few image names:")
    list(file_names_dict.keys())[0:5]
    
    label = file_names_dict[list(file_names_dict.keys())[0]]
    print(type(label))
    label = np.asarray(label)
    print(label.ravel())
    
for attribute in attributes:
     read_saved_files(attribute)

['Smiling']
Number of images:  202599
Few image names:
<class 'list'>
[-1.]
['Young']
Number of images:  202599
Few image names:
<class 'list'>
[1.]
['No_Beard']
Number of images:  202599
Few image names:
<class 'list'>
[1.]
['Heavy_Makeup']
Number of images:  202599
Few image names:
<class 'list'>
[1.]
['Black_Hair']
Number of images:  202599
Few image names:
<class 'list'>
[1.]
['Bangs']
Number of images:  202599
Few image names:
<class 'list'>
[-1.]
