<a href="https://colab.research.google.com/github/benihime91/pytorch_retinanet/blob/master/nbs/01_preprocess_pascal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**setup:**

In [None]:
# Grab the Data
# Extract the Pascal VOC 2007 test and train/val archives stored on Google Drive
# into the current working directory (`-qq` keeps unzip silent).
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount step
# is visible in this notebook, so confirm before running on a fresh runtime.
! unzip -qq /content/drive/My\ Drive/Pascal\ 2007\ Data/pascal_voc_2007_test.zip
! unzip -qq /content/drive/My\ Drive/Pascal\ 2007\ Data/pascal_voc_2007_train_val.zip

In [None]:
# Clone the RetinaNet Repo:
# The checkout lands in ./pytorch_retinanet, which is what makes the
# `pytorch_retinanet.src.utils.general_utils` import in the next code cell resolvable.
! git clone https://github.com/benihime91/pytorch_retinanet.git

**standard imports:**

In [None]:
import os
import pickle
import re
import sys
import warnings
from typing import *

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from pytorch_retinanet.src.utils.general_utils import Visualizer, xml_to_csv, ifnone

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# chained assignment); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Never truncate long cell values (such as full image paths) when a DataFrame is displayed
pd.set_option("display.max_colwidth", None)
# Reload imported modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output
%matplotlib inline

**specify file locations:**

In [None]:
# Train/val portion of the dataset: XML annotation files and the images they describe
trn_annot_dir = "/content/pascal_voc_2007_train_val/Annotations"
trn_img_dir = "/content/pascal_voc_2007_train_val/Images"

# Test portion of the dataset, laid out the same way
tst_annot_dir = "/content/pascal_voc_2007_test/Annotations"
tst_img_dir = "/content/pascal_voc_2007_test/Images"

**create dataframes:**

In [None]:
# Create pandas DataFrames from the XML annotation files, one per dataset.
# NOTE(review): later cells read the columns `filename`, `class`, `xmin`, `ymin`,
# `xmax` and `ymax` from these frames; presumably `xml_to_csv` emits one row per
# annotated object. Confirm against the helper's implementation.
df_trn = xml_to_csv(trn_annot_dir) # train dataframe
df_tst = xml_to_csv(tst_annot_dir) # test dataframe

In [None]:
# Preview the first rows of the raw training annotations
df_trn.head()

In [None]:
# Preview the first five rows of the raw test annotations
df_tst.head(5)

**helper functions:**

In [None]:
def preprare_data(
    df: pd.DataFrame, img_dir: str, encoder: Optional["LabelEncoder"] = None
) -> Tuple[pd.DataFrame, "LabelEncoder"]:
    """Attach full image paths and integer class labels to an annotation frame.

    The frame is modified in place and also returned: `filename` is rewritten
    to `os.path.join(img_dir, filename)` and a `labels` column holding the
    encoded values of the `class` column is added.

    Args:
        df: annotations containing at least `filename` and `class` columns.
        img_dir: directory that holds the image files.
        encoder: label encoder to use. An encoder that is already fitted (it
            exposes `classes_`) is applied as-is so the class -> id mapping stays
            the same across datasets. `None` or an unfitted encoder is fitted on
            `df['class']`.

    Returns:
        `(df, encoder)`: the updated frame and the fitted encoder.

    Raises:
        Any error raised by `encoder.transform` propagates. scikit-learn raises
        `ValueError` when a fitted encoder meets a class it was not fitted on.
        The encoder is never silently re-fitted, since that would change the
        ids already assigned to other datasets.
    """
    if encoder is None:
        encoder = LabelEncoder()

    # modify filename to point to the image path
    df["filename"] = [os.path.join(img_dir, name) for name in df["filename"].values]

    # encode the class names as integer labels; fit only when the encoder
    # carries no mapping yet
    if hasattr(encoder, "classes_"):
        df["labels"] = encoder.transform(df["class"].values)
    else:
        df["labels"] = encoder.fit_transform(df["class"].values)
    return df, encoder
    
def create_label_dict(dataframe: pd.DataFrame, encoder: LabelEncoder) -> Dict[int, str]:
    """Map every encoded label found in `dataframe.labels` to its class name.

    The ids are the unique values of the `labels` column in ascending order;
    the matching names are recovered with `encoder.inverse_transform`.
    """
    label_ids = sorted(dataframe.labels.unique())
    class_names = list(encoder.inverse_transform(label_ids))
    return dict(zip(label_ids, class_names))

**Pre-process the dataframes:**

In [None]:
# No encoder is passed here, so `preprare_data` creates one and fits it on the
# training classes; keep the returned encoder for the test set.
df_trn, encoder = preprare_data(df_trn, img_dir=trn_img_dir)
df_trn.head(10)

In [None]:
# Hand over the encoder returned for the training data so the test classes are
# encoded with it; the encoder returned by this call is discarded (`_`).
df_tst, _ = preprare_data(df_tst, img_dir=tst_img_dir, encoder=encoder)
df_tst.head(10)

**Instantiate the label dictionary and save it:**

In [None]:
# Grab the label dictionary (encoded label id -> class name), built from the
# labels present in the training frame, and display it
label_dict = create_label_dict(df_trn, encoder)
label_dict

In [None]:
# save the label_dict dictionary
# `with` closes (and flushes) the file even if pickling raises, so no handle is leaked
with open("/content/drive/My Drive/Pascal 2007 Data/names.pkl", "wb") as f:
    pickle.dump(label_dict, f)

In [None]:
# crosscheck: read the pickle back and print it to confirm the round trip.
# The file is opened in a `with` block so the handle is closed deterministically.
# Only unpickle files you wrote yourself: `pickle.load` can execute arbitrary code.
with open("/content/drive/My Drive/Pascal 2007 Data/names.pkl", "rb") as f:
    label_dict = pickle.load(f)
print(label_dict)

**visualize some images from the dataset for sanity check:**

In [None]:
# Instantiate the visualizer with the id -> class-name dictionary; the
# `display_random_image` helper below draws through this global `viz` object
viz = Visualizer(class_names=label_dict)

In [None]:
def display_random_image(df: pd.DataFrame, visualizer=None) -> None:
    """Draw one randomly chosen image from `df` together with all of its boxes.

    A row is picked uniformly at random, so images with more annotated objects
    are proportionally more likely to be shown. Every row sharing the picked
    row's `filename` contributes one bounding box and one label.

    Args:
        df: annotations with `filename`, `xmin`, `ymin`, `xmax`, `ymax` and
            `labels` columns. Any index is accepted.
        visualizer: object exposing `draw_bboxes(fname, boxes=, classes=, figsize=)`.
            Defaults to the notebook-level `viz`.

    Raises:
        ValueError: if `df` has no rows.
    """
    if len(df) == 0:
        raise ValueError("cannot display an image from an empty DataFrame")
    drawer = viz if visualizer is None else visualizer

    # positional lookup: works whatever the frame's index looks like
    row = np.random.randint(0, len(df))
    fname = df["filename"].iloc[row]

    # gather every annotation that belongs to the chosen image
    annots = df.loc[df["filename"] == fname]
    boxes = annots[["xmin", "ymin", "xmax", "ymax"]].values
    labels = annots["labels"].values
    drawer.draw_bboxes(fname, boxes=boxes, classes=labels, figsize=(10, 10))

In [None]:
# Display some random Images from the Dataset for sanity check:
# two from the training frame, one from the test frame
display_random_image(df_trn)
display_random_image(df_trn)
display_random_image(df_tst)

**create splits:**

In [None]:
def create_splits(
    df: pd.DataFrame, split_sz: float = 0.3
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split an annotation frame into train and validation frames by image.

    The split is made over unique `filename` values, so all rows (boxes) of an
    image end up on the same side. It is reproducible: `train_test_split` is
    called with `shuffle=True` and a fixed `random_state=42`.

    Args:
        df: annotations with a `filename` column. It is left unmodified.
        split_sz: `test_size` forwarded to `train_test_split`, i.e. the share
            of images assigned to the validation frame.

    Returns:
        `(df_trn, df_val)`, each with the same columns as `df` and a fresh
        `0..n-1` index.
    """
    # Grab the unique image ids from the filename column
    unique_ids = list(df.filename.unique())
    # Split the unique image ids into train & valid groups
    train_ids, val_ids = train_test_split(
        unique_ids, shuffle=True, random_state=42, test_size=split_sz
    )

    # Vectorised membership masks replace the per-row chained assignment, which
    # is unreliable in pandas and silently does nothing under copy-on-write.
    in_train = df["filename"].isin(train_ids)
    in_val = df["filename"].isin(val_ids)

    df_trn = df.loc[in_train].reset_index(drop=True)
    df_val = df.loc[in_val].reset_index(drop=True)
    return df_trn, df_val

In [None]:
# Split the data into train & validation sets.
# NOTE: `df_trn` is rebound to the training portion, so re-running this cell
# splits the already-reduced frame again; re-run from the preprocessing
# cells to start over.
df_trn, df_val = create_splits(df_trn)

# Report the number of distinct images (not annotation rows) in each dataset
print('Num examples in train dataset', len(df_trn.filename.unique()))
print('Num examples in validation dataset', len(df_val.filename.unique()))
print('Num examples in test dataset', len(df_tst.filename.unique()))

In [None]:
# Preview the training frame returned by the split
df_trn.head()

In [None]:
# Preview the validation frame returned by the split
df_val.head()

In [None]:
# Preview the test frame (not affected by the train/validation split)
df_tst.head()

**save the dataframes as csv files:**

In [None]:
# Save the DataFrames: write each one to Google Drive as CSV, without the index column
for frame, csv_path in (
    (df_trn, '/content/drive/My Drive/Pascal 2007 Data/trn_data.csv'),
    (df_val, '/content/drive/My Drive/Pascal 2007 Data/val_data.csv'),
    (df_tst, '/content/drive/My Drive/Pascal 2007 Data/tst_data.csv'),
):
    frame.to_csv(csv_path, index=False)

**sanity check:**

In [None]:
# Two random images from the final training frame
display_random_image(df_trn)
display_random_image(df_trn)

In [None]:
# Two random images from the validation frame
display_random_image(df_val)
display_random_image(df_val)

In [None]:
# Two random images from the test frame
display_random_image(df_tst)
display_random_image(df_tst)