# Milestone 4 EDA  

Goals:
- load structured data 
- load image data

## Exploring structured data

In [None]:
import pandas as pd
import numpy as np

The training data comes as two tables. The "train" dataframe (loaded below as `data_table`) lists the filename of each photo along with its correct head count. The "bbox_train" dataframe (loaded below as `bbox_table`) has one row per individual face: the photo's filename, the image size (width, height), and the four limits (xmin, ymin, xmax, ymax) of the bounding box around that face.

In [None]:
# Load the table that pairs each photo filename with its head count.
data_table = pd.read_csv("data/structured data/headcount-table.csv")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data_table.info()

# Last expression of the cell: rich preview of the first rows.
data_table.head()

In [None]:
# Load the bounding-box table (one row per face, several rows per photo).
bbox_table = pd.read_csv("data/structured data/bbox_table.csv")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
bbox_table.info()

print('--------------------------------------')
# dropna=False keeps the count identical to len(Series.unique()).
print("Unique photos: ", bbox_table['Name'].nunique(dropna=False))
print('--------------------------------------')

# Last expression of the cell: rich preview of the first rows.
bbox_table.head()

### Plotting head count distribution

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Default figure size for the seaborn plots drawn from here on.
sns.set(rc={'figure.figsize': (11.7, 8.27)})

# Histogram of the head count per photo, with the x-axis limited to 0-25.
ax = sns.histplot(data=data_table["HeadCount"], color='orange')
ax.set(xlim=(0, 25))
ax.set_title("Head count distribution", fontdict={'fontsize': 18})

plt.show()

## Loading and displaying image data

In [None]:
import PIL
from PIL import Image, ImageDraw
import tensorflow as tf
from tensorflow import keras

In [None]:
# Re-display the first rows of the bounding-box table as a reminder of the
# columns (Name, width, height, xmin, ymin, xmax, ymax) used by the cells below.
bbox_table.head()

In [None]:
# Display a single training photo.
path = "data/train/10002.jpg"

# Filter the bounding-box table once and read the recorded image size from the
# first matching row (raises IndexError if the photo has no rows in the table).
photo_rows = bbox_table.loc[bbox_table["Name"] == '10002.jpg']
w, h = photo_rows['width'].iloc[0], photo_rows['height'].iloc[0]

# Open the file in a context manager so its handle is released after display.
with Image.open(path) as im:
    im.show()


In [None]:
# Display the bounding boxes of one photo on a blank canvas of the photo's size.
box_table = bbox_table.loc[bbox_table.Name=="10002.jpg"]
w,h = box_table.width.iloc[0],box_table.height.iloc[0]

# Plain ints keep Image.new independent of the pandas/numpy scalar type.
with Image.new("RGB",(int(w),int(h))) as img:
    # One drawing context serves every rectangle; no need to rebuild it per row.
    canvas = ImageDraw.Draw(img)

    # Iterate the four box limits as plain tuples, one tuple per face.
    box_limits = box_table[["xmin","ymin","xmax","ymax"]]
    for xmin, ymin, xmax, ymax in box_limits.itertuples(index=False, name=None):
        canvas.rectangle([xmin, ymin, xmax, ymax], outline='red')

    img.show()


In [None]:
# Draw the bounding boxes of one photo on top of the photo itself and display it.
path = "data/train/10002.jpg"
box_table = bbox_table.loc[bbox_table.Name=="10002.jpg"]
# Not used for drawing here; .iloc[0] raises IndexError if the photo has no rows.
w,h = box_table.width.iloc[0],box_table.height.iloc[0]

with Image.open(path) as img:
    # One drawing context serves every rectangle; no need to rebuild it per row.
    canvas = ImageDraw.Draw(img)

    # Iterate the four box limits as plain tuples, one tuple per face.
    box_limits = box_table[["xmin","ymin","xmax","ymax"]]
    for xmin, ymin, xmax, ymax in box_limits.itertuples(index=False, name=None):
        canvas.rectangle([xmin, ymin, xmax, ymax], outline='red')

    img.show()


In [None]:
# Formalize the above cell as a function

def show_photo(num:int):
    """Display a training photo with its face bounding boxes outlined in red.

    Parameters
    ----------
    num : int
        Numeric part of the photo's filename.  Its string form must be exactly
        five characters long (e.g. ``10002`` for ``data/train/10002.jpg``).

    Returns
    -------
    tuple or None
        ``(None, None, box_table)`` when the photo is shown, where
        ``box_table`` holds the rows of the module-level ``bbox_table`` that
        belong to this photo.  The two leading ``None`` values are kept only
        so the return shape stays the same for existing callers.  ``None`` is
        returned when ``num`` is rejected or the photo has no rows in
        ``bbox_table``.

    Notes
    -----
    Reads the module-level ``bbox_table`` dataframe.  Prints the image path on
    success and a short message when the number is rejected or unknown.  A
    file missing on disk still raises whatever ``Image.open`` raises.
    """
    # Guard clause: reject anything whose string form is not five characters.
    if len(str(num)) != 5:
        print("invaled number")
        return None

    file_name = f"{str(num)}.jpg"
    path = f"data/train/{file_name}"

    box_table = bbox_table.loc[bbox_table.Name == file_name]
    # Explicit emptiness check instead of relying on an IndexError raised by
    # an otherwise unused `.iloc[0]` lookup.
    if box_table.empty:
        print('photo not available in data set')
        return None

    with Image.open(path) as img:
        # One drawing context serves every rectangle.
        canvas = ImageDraw.Draw(img)
        box_limits = box_table[["xmin", "ymin", "xmax", "ymax"]]
        for xmin, ymin, xmax, ymax in box_limits.itertuples(index=False, name=None):
            canvas.rectangle([xmin, ymin, xmax, ymax], outline='red')
        # Show while the file is still open.
        img.show()

    print(path)
    return None, None, box_table

In [None]:
# Try the helper on one photo number.
show_photo(num=10001)

## Organizing the dataset subdirectory structure and establishing the train-test split

In [None]:
import os
from shutil import move
from sklearn.model_selection import train_test_split

Train test split

In [None]:
# Split the photo filenames 75/25 into train and test; random_state is fixed.
train, test = train_test_split(
    data_table["Name"],
    test_size=0.25,
    random_state=42,
)

In [None]:
# List the entries of data/parent/ and show how many there are.
pre_split = os.listdir("data/parent/")
len(pre_split)

In [None]:
# Show the photo filename column (`Name`; the truncated `Nam` raised AttributeError).
data_table.Name

In [None]:
# Put the filenames from the head-count table next to the directory listing of
# data/parent/ for a side-by-side check.
# NOTE(review): the two columns are paired purely by position, and os.listdir
# does not promise any particular order - confirm before reading rows as matches.
parent_dir = pd.Series(os.listdir("data/parent/"))
table = pd.concat(
    (data_table["Name"], parent_dir),
    axis="columns",
)

In [None]:
# Show the side-by-side table built in the previous cell.
table

In [None]:
# Directory paths for the parent image folder and the train / test folders.
# `parent_dir` previously held a Series and is reassigned again in a later cell;
# `train_dir` and `test_dir` are not referenced by the other visible cells.
parent_dir = "data/parent/"
train_dir = "data/train/"
test_dir = "data/test/"



In [None]:
# Each distinct head count is treated as one class.
classes = data_table["HeadCount"].unique()
classes

In [None]:
# Create one sub-directory per head-count class under the training image folder.
parent_dir = 'data/image_data/train/'
paths = [os.path.join(parent_dir, str(cat)) for cat in classes]

for path in paths:
    # makedirs(exist_ok=True) makes the cell safe to re-run and also creates
    # missing parent folders; os.mkdir raised FileExistsError on a second run.
    os.makedirs(path, exist_ok=True)

# Number of class directories that now exist.
ticker = len(paths)
print(ticker)

In [None]:
# Reminder of the Name / HeadCount columns that drive the file moves in the next cell.
data_table.head()

In [None]:
# Move every photo into the sub-directory named after its head count.
source_dir = 'data/image_data/train/'

for jpg, headcount in zip(data_table.Name, data_table.HeadCount):
    source = f'{source_dir}{jpg}'
    destination = f'{source_dir}{headcount}/{jpg}'

    # Already moved by an earlier run of this cell: skip so the cell can be
    # re-run.  A file missing from both places still raises from move().
    if os.path.exists(destination) and not os.path.exists(source):
        continue

    move(source, destination)
    

## Loading image data with Keras

In [None]:
# Head counts as a plain list; it is not passed to the call below.
head_count_labels = list(data_table.HeadCount)
# Training subset of the images under data/image_data/train.
# `directory`, `validation_split` and `seed` match the `test_ds` call; only
# `subset` differs - presumably they must stay identical so the two subsets do
# not overlap (confirm against the Keras documentation).
# NOTE(review): with labels='inferred' the integer label is presumably the index
# of the class directory in alphanumeric order (so "10" would sort before "2"),
# not the head count itself - confirm before using the labels as counts.
# NOTE(review): Keras documents image_size as (height, width) - confirm that
# (408, 612) is the intended orientation.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    directory='data/image_data/train',
    labels='inferred',
    label_mode='int',
    validation_split=0.25,
    subset='training',
    batch_size=32,
    image_size=(408, 612),
    seed=42)

In [None]:
# Validation subset of the images under data/image_data/train.
# `directory`, `validation_split` and `seed` match the `train_ds` call; only
# `subset` differs - presumably they must stay identical so the two subsets do
# not overlap (confirm against the Keras documentation).
# NOTE(review): with labels='inferred' the integer label is presumably the index
# of the class directory in alphanumeric order, not the head count itself -
# confirm before using the labels as counts.
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    directory='data/image_data/train',
    labels='inferred',
    label_mode='int',
    validation_split=0.25,
    subset='validation',
    batch_size=32,
    image_size=(408, 612),
    seed=42)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Preview up to nine images from the first training batch in a 3x3 grid.
fig = plt.figure(figsize=(10, 10))

# `image_ds` is not defined in the visible notebook (NameError on a fresh
# kernel); the training dataset built above is the one previewed here.
for images, labels in train_ds.take(1):
    # Guard against a first batch holding fewer than nine images.
    for i in range(min(9, int(images.shape[0]))):
        ax = fig.add_subplot(3, 3, i + 1)
        ax.imshow(images[i].numpy().astype("uint8"))
        # class_names holds the class directory names, indexed by integer label.
        ax.set_title(train_ds.class_names[int(labels[i])])
        ax.axis("off")