In [40]:
import os
import gdown
import shutil
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.utils import image_dataset_from_directory

## Downloading Datasets

### Folder Images

### List Attribute

In [2]:
# Fetch the attribute list from Google Drive, but only when it is not already
# on disk, so that re-running the notebook does not download the file again.
list_attribute_url = 'https://drive.google.com/file/d/1Q0kgTKibawWFUX5zTLaVUP0NJs9dcVDt/view?usp=drive_link'
list_attribute_path = 'list_attribute.txt'

if not os.path.exists(list_attribute_path):
    # fuzzy=True lets gdown extract the file id from a ".../view?usp=..." share link.
    gdown.download(list_attribute_url, output=list_attribute_path, fuzzy=True)

list_attribute_path

Downloading...
From: https://drive.google.com/uc?id=1Q0kgTKibawWFUX5zTLaVUP0NJs9dcVDt
To: D:\nitip adit\Bootcamp\Project_1\gender-classification-based-on-face-recognition\list_attribute.txt
100%|█████████████████████████████████████████████████████████████████████████████| 26.7M/26.7M [00:07<00:00, 3.80MB/s]


'list_attribute.txt'

## Splitting Dataset

In [3]:
# Load only the 'Male' attribute from the whitespace-separated attribute file.
# The first line of the file is skipped (skiprows=1); the next line is the header.
# The separator is a raw string so that '\s' is not treated as a (invalid)
# string escape sequence by the Python parser.
attribute_list_df = (
    pd.read_csv('list_attribute.txt', sep=r'\s+', skiprows=1, usecols=['Male'])
    .replace(to_replace=-1, value=0)        # recode -1 as 0, leaving a 0/1 label
    .rename(columns={"Male": "Gender"})
)
# The row index holds the image file names; give it an explicit name.
attribute_list_df.index.name = 'filename'
attribute_list_df

Unnamed: 0_level_0,Gender
filename,Unnamed: 1_level_1
000001.jpg,0
000002.jpg,0
000003.jpg,1
000004.jpg,0
000005.jpg,0
...,...
202595.jpg,0
202596.jpg,1
202597.jpg,1
202598.jpg,0


In [4]:
# Turn the 'filename' index into a regular column.
# Guarded so that re-running this cell is a no-op instead of adding a
# spurious 'index' column from the already-reset integer index.
if 'filename' not in attribute_list_df.columns:
    attribute_list_df = attribute_list_df.reset_index(drop=False)

In [5]:
# Display the frame to check that 'filename' is now a regular column next to 'Gender'.
attribute_list_df

Unnamed: 0,filename,Gender
0,000001.jpg,0
1,000002.jpg,0
2,000003.jpg,1
3,000004.jpg,0
4,000005.jpg,0
...,...,...
202594,202595.jpg,0
202595,202596.jpg,1
202596,202597.jpg,1
202597,202598.jpg,0


In [6]:
# Flag every attribute row according to whether a file with the same name
# is present in the local 'Images' directory.
images_folder = os.listdir('Images')
is_present = attribute_list_df['filename'].isin(images_folder)
attribute_list_df['isExist'] = is_present
attribute_list_df

Unnamed: 0,filename,Gender,isExist
0,000001.jpg,0,False
1,000002.jpg,0,False
2,000003.jpg,1,False
3,000004.jpg,0,False
4,000005.jpg,0,False
...,...,...,...
202594,202595.jpg,0,False
202595,202596.jpg,1,False
202596,202597.jpg,1,False
202597,202598.jpg,0,False


In [7]:
# Remove, in place, every row whose image file was not found on disk.
missing_rows = attribute_list_df.loc[~attribute_list_df['isExist']].index
attribute_list_df.drop(index=missing_rows, inplace=True)
attribute_list_df

Unnamed: 0,filename,Gender,isExist
50,000051.jpg,1,True
51,000052.jpg,1,True
64,000065.jpg,1,True
165,000166.jpg,1,True
197,000198.jpg,0,True
...,...,...,...
202319,202320.jpg,0,True
202339,202340.jpg,0,True
202346,202347.jpg,0,True
202356,202357.jpg,0,True


In [9]:
# Renumber the remaining rows 0..n-1; the old index labels are discarded.
attribute_list_df = attribute_list_df.reset_index(drop=True)
attribute_list_df

Unnamed: 0,filename,Gender,isExist
0,000051.jpg,1,True
1,000052.jpg,1,True
2,000065.jpg,1,True
3,000166.jpg,1,True
4,000198.jpg,0,True
...,...,...,...
4995,202320.jpg,0,True
4996,202340.jpg,0,True
4997,202347.jpg,0,True
4998,202357.jpg,0,True


In [25]:
# Source directory of the images and the output layout of the split:
# Splitted/Train, Splitted/Validation and Splitted/Test.
images_path = 'Images'
splitted_folder = 'Splitted'
train_folder, validation_folder, test_folder = (
    os.path.join(splitted_folder, subset_name)
    for subset_name in ('Train', 'Validation', 'Test')
)

In [12]:
# Create the three subset directories; directories that already exist are left alone.
for subset_dir in (train_folder, validation_folder, test_folder):
    os.makedirs(subset_dir, exist_ok=True)

In [29]:
# Copy every listed image into <subset>/<Female|Male> below the split root.
#
# Out of every ten consecutive rows, the first eight go to the training
# folder, the ninth to validation and the tenth to test.
#
# The row *position* (enumerate) is used rather than the index label, so the
# split does not depend on the index having been reset beforehand. The chosen
# subset is kept in its own name (subset_folder) so that the notebook-level
# `splitted_folder` root path is not overwritten by this loop.
for position, (filename, gender_label) in enumerate(
    zip(attribute_list_df['filename'], attribute_list_df['Gender'])
):
    bucket = position % 10
    if bucket < 8:
        subset_folder = train_folder
    elif bucket == 8:
        subset_folder = validation_folder
    else:
        subset_folder = test_folder

    # Gender 0 is filed under 'Female', any other value under 'Male'.
    gender_name = 'Female' if gender_label == 0 else 'Male'
    gender_folder = os.path.join(subset_folder, gender_name)
    os.makedirs(gender_folder, exist_ok=True)

    source_path = os.path.join(images_path, filename)
    destination_path = os.path.join(gender_folder, filename)

    print(f'Copying file {source_path} to {gender_folder}')
    shutil.copy(source_path, destination_path)

print('Splitting Datasets completed')

Copying file Images\000051.jpg to Splitted\Train\Male
Copying file Images\000052.jpg to Splitted\Train\Male
Copying file Images\000065.jpg to Splitted\Train\Male
Copying file Images\000166.jpg to Splitted\Train\Male
Copying file Images\000198.jpg to Splitted\Train\Female
Copying file Images\000201.jpg to Splitted\Train\Female
Copying file Images\000240.jpg to Splitted\Train\Female
Copying file Images\000282.jpg to Splitted\Train\Male
Copying file Images\000352.jpg to Splitted\Validation\Male
Copying file Images\000409.jpg to Splitted\Test\Male
Copying file Images\000414.jpg to Splitted\Train\Male
Copying file Images\000439.jpg to Splitted\Train\Male
Copying file Images\000444.jpg to Splitted\Train\Male
Copying file Images\000474.jpg to Splitted\Train\Male
Copying file Images\000525.jpg to Splitted\Train\Female
Copying file Images\000545.jpg to Splitted\Train\Male
Copying file Images\000559.jpg to Splitted\Train\Male
Copying file Images\000572.jpg to Splitted\Train\Male
Copying file Ima

## Preprocessing

In [30]:
# Loader settings shared by the dataset cells below:
# target size passed as image_size, and the number of images per batch.
image_size, batch_size = (120, 120), 32

### Train Datasets

In [36]:
# Build the training dataset from the training folder.
#
# class_names is pinned instead of taken from os.listdir(), whose order is
# not guaranteed: with this list label 0 is always 'Female' and label 1 is
# always 'Male', the same encoding the split step used for the 'Gender' column.
# A constant seed makes the shuffling reproducible from run to run.
train_datasets = image_dataset_from_directory(
    directory=train_folder,
    seed=42,
    class_names=['Female', 'Male'],
    image_size=image_size,
    batch_size=batch_size
)

Found 4000 files belonging to 2 classes.


### Validation Datasets

In [37]:
# Build the validation dataset from the validation folder.
#
# class_names is pinned instead of taken from os.listdir(), whose order is
# not guaranteed: with this list label 0 is always 'Female' and label 1 is
# always 'Male', the same encoding the split step used for the 'Gender' column.
# A constant seed makes the shuffling reproducible from run to run.
validation_datasets = image_dataset_from_directory(
    directory=validation_folder,
    seed=42,
    class_names=['Female', 'Male'],
    image_size=image_size,
    batch_size=batch_size
)

Found 500 files belonging to 2 classes.


### Test Datasets

In [38]:
# Build the test dataset from the test folder.
#
# class_names is pinned instead of taken from os.listdir(), whose order is
# not guaranteed: with this list label 0 is always 'Female' and label 1 is
# always 'Male', the same encoding the split step used for the 'Gender' column.
# A constant seed makes the shuffling reproducible from run to run.
test_datasets = image_dataset_from_directory(
    directory=test_folder,
    seed=42,
    class_names=['Female', 'Male'],
    image_size=image_size,
    batch_size=batch_size
)

Found 500 files belonging to 2 classes.
