# Data Preparation


In [12]:
# IMPORTS 
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Load in Data

In [13]:
labels = ['Alex', 'Kelly']

def create_labeled_df(main_directory="datasets"):
    file_paths = []
    target_variable = []

    for directory in os.listdir(main_directory):
        full_path = os.path.join(main_directory, directory)

        if directory not in labels:
            continue

        for path in os.listdir(full_path):
            image_path = os.path.join(full_path, path)
            if os.path.isfile(image_path):
                target_variable.append(directory)
                file_paths.append(image_path)

    df = pd.DataFrame({
        "Path": file_paths,
        "Target": target_variable
    })

    csv_filename = "labelled_images.csv"
    df.to_csv(csv_filename, index=False)
    return df

labelled_df = create_labeled_df() # call function
labelled_df.head()


Unnamed: 0,Path,Target
0,datasets/Alex/Alex-Image119.png,Alex
1,datasets/Alex/Alex-Image131.png,Alex
2,datasets/Alex/Alex-Image125.png,Alex
3,datasets/Alex/Alex-Image247.png,Alex
4,datasets/Alex/Alex-Image27.png,Alex


In [14]:
labels = ['TestSet01', 'TestSet02']

def create_unlabelled_df(main_directory="datasets"):
        file_paths = []

        for directory in os.listdir(main_directory):
            full_path = os.path.join(main_directory, directory)

            if directory not in labels:
                continue

            for path in os.listdir(full_path):
                image_path = os.path.join(full_path, path)
                if os.path.isfile(image_path):
                     file_paths.append(image_path)


        df = pd.DataFrame({
            "Path": file_paths
        })

        csv_filename = "unlabelled_images.csv"
        df.to_csv(csv_filename, index=False)
        return df

unlabelled_df = create_unlabelled_df()
unlabelled_df.head()

Unnamed: 0,Path
0,datasets/TestSet02/TestSet02-Image06.png
1,datasets/TestSet02/TestSet02-Image12.png
2,datasets/TestSet02/TestSet02-Image13.png
3,datasets/TestSet02/TestSet02-Image07.png
4,datasets/TestSet02/TestSet02-Image11.png


# Merge Images with Features for Training & Testing

In [15]:
# READ IN CSV'S
labelled_df = pd.read_csv('labelled_images.csv') # path & target
features = pd.read_csv('features.csv') # image name & features

# rename columns
column_mapping = {
    'image_name': 'Image_Name',
    'People(y/n)': 'Has_People',
    'People hm': 'People_Count',
    'Building(y/n)': 'Has_Building',
    'Building hm': 'Building_Count',
    'Animal': 'Animal_Type',
    'type of an': 'Animal_Subtype',
    'Landscape/Nature?': 'Is_Landscape_Nature',
    'Board Games': 'Has_BoardGames',
    'Road/Pathway': 'Has_Road_Pathway',
    'Light/Dark?': 'Is_Light_Dark',
    'Close/Far': 'Is_Close_Far',
    'Straight on/not': 'Is_Straight_On',
    'Horizon (0=none, 1=horizon)': 'Has_Horizon',
    'Orientation (0=horiz, 1=vert)': 'Orientation',
    'Sunrise/Sunset': 'Is_Sunrise_Sunset',
    'Tent/Campfire': 'Has_Tent_Campfire',
    'Food': 'Has_Food'
}
features = features.rename(columns=column_mapping)

features.head()

Unnamed: 0,Image_Name,Has_People,People_Count,Has_Building,Building_Count,Animal_Type,Animal_Subtype,Is_Landscape_Nature,Has_BoardGames,Has_Road_Pathway,Is_Light_Dark,Is_Close_Far,Is_Straight_On,Has_Horizon,Orientation,Is_Sunrise_Sunset,Has_Tent_Campfire,Has_Food
0,Alex-Image01.png,1.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Alex-Image02.png,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Alex-Image03.png,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Alex-Image04.png,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,Alex-Image05.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [25]:
# MERGE DF'S
labelled_df['Image_Name'] = labelled_df['Path'].apply(lambda x: x.split("/")[-1]) # extract img names from path
images_df = pd.merge(labelled_df, features, on='Image_Name', how='inner')
images_df.drop(columns=['Path', 'image_name'], inplace=True) # actually maybe keep img name so can ref later?

# SAVE FULL DATASET AS CSV
images_df.to_csv('images.csv')

images_df.head()

Unnamed: 0,Target,Image_Name,Has_People,People_Count,Has_Building,Building_Count,Animal_Type,Animal_Subtype,Is_Landscape_Nature,Has_BoardGames,Has_Road_Pathway,Is_Light_Dark,Is_Close_Far,Is_Straight_On,Has_Horizon,Orientation,Is_Sunrise_Sunset,Has_Tent_Campfire,Has_Food
0,Alex,Alex-Image119.png,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Alex,Alex-Image131.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alex,Alex-Image125.png,0.0,0.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,Alex,Alex-Image247.png,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,Alex,Alex-Image27.png,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
# images_df = pd.read_csv('images_csv')
# images_train, images_test = train_test_split(images_df, test_size=0.2, random_state=42)

# images_train.to_csv('images_train.csv', index=False)
# images_test.to_csv('images_test.csv', index=False)

# ACTUALLY JUST RUN train_test_split on labelled_images dataset