# Data Preparation

## Library imports

In [8]:
import zipfile
import pandas as pd
import os

## Functions to extract and convert the dataset

In [2]:
def unzip(filepath, extract_to):
    """Extract every member of the zip archive at ``filepath`` into ``extract_to``.

    When ``zipfile.is_zipfile`` does not recognise ``filepath`` as a zip
    archive, the message "filepath error" is printed and nothing is extracted;
    no exception is raised for that case.

    Args:
        filepath: Path of the archive to unpack.
        extract_to: Directory that receives the extracted members.

    Returns:
        None, whether or not anything was extracted.
    """
    if zipfile.is_zipfile(filepath):
        # The context manager closes the archive handle even if extraction fails.
        with zipfile.ZipFile(filepath, 'r') as archive:
            archive.extractall(extract_to)
    else:
        print("filepath error")

def convert_folder_to_autogluon_format(path):
    """Build an image/label table from a directory tree of files.

    Every file found anywhere under ``path`` becomes one row. No filtering by
    file extension is performed, so non-image files are included as well.

    Args:
        path: Root directory to walk recursively.

    Returns:
        pandas.DataFrame with two columns:
            ``image``: absolute path of the file (resolved against the current
                working directory, so the values are machine-specific).
            ``label_text``: name of the directory that directly contains the
                file.
        Rows are ordered by directory and then by file name. The frame is
        empty (but still has both columns) when no files are found.
    """
    columns = ["image", "label_text"]
    # Collect plain records and build the frame once; appending rows with
    # ``df.loc[len(df)] = ...`` makes each insertion progressively slower.
    records = []
    for root, dirs, files in os.walk(path):
        # Sorting ``dirs`` in place steers the order in which os.walk descends,
        # making the row order independent of the filesystem's listing order.
        dirs.sort()
        for file in sorted(files):
            image_path = os.path.abspath(os.path.join(root, file))
            records.append(
                {
                    "image": image_path,
                    # The label is the immediate parent folder of the file.
                    "label_text": os.path.basename(os.path.dirname(image_path)),
                }
            )
    return pd.DataFrame(records, columns=columns)

In [3]:
# Location of the raw archive and the directory that receives its contents.
filepath = "../../data/raw/ImageClassificationReducedClass.v1i.folder.zip"
extract_to = "../../data/intermediate"

# Unpack the archive. unzip() prints "filepath error" rather than raising when
# the path is not a zip file, so check the cell output before continuing.
unzip(filepath=filepath, extract_to=extract_to)

In [9]:
# Index the extracted training split: one row per file, labelled by the name
# of the folder that contains it.
train_filepath = "../../data/intermediate/train"
train_df = convert_folder_to_autogluon_format(path=train_filepath)
train_df

Unnamed: 0,image,label_text
0,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
1,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
2,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
3,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
4,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
...,...,...
2644,e:\Current_Workdir\palm-fruit-classification\d...,unripe
2645,e:\Current_Workdir\palm-fruit-classification\d...,unripe
2646,e:\Current_Workdir\palm-fruit-classification\d...,unripe
2647,e:\Current_Workdir\palm-fruit-classification\d...,unripe


In [10]:
# Index the held-out split the same way as the training split.
# NOTE: the frame is named ``test_df`` but it is read from the ``valid`` folder.
test_filepath = "../../data/intermediate/valid"
test_df = convert_folder_to_autogluon_format(path=test_filepath)
test_df

Unnamed: 0,image,label_text
0,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
1,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
2,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
3,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
4,e:\Current_Workdir\palm-fruit-classification\d...,empty_bunch
...,...,...
374,e:\Current_Workdir\palm-fruit-classification\d...,unripe
375,e:\Current_Workdir\palm-fruit-classification\d...,unripe
376,e:\Current_Workdir\palm-fruit-classification\d...,unripe
377,e:\Current_Workdir\palm-fruit-classification\d...,unripe


## Save data in AutoGluon format

In [11]:
# Write both frames as CSV without the index column.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True keeps re-running this cell harmless.
clean_dir = "../../data/clean"
os.makedirs(clean_dir, exist_ok=True)

train_df.to_csv(f"{clean_dir}/train_df.csv", index=False)
test_df.to_csv(f"{clean_dir}/test_df.csv", index=False)