In [61]:
# Imports
import pandas as pd
import os
import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split




In [62]:
def process_csv(csv_file, output_csv):
    """
    Process the input CSV and save train/validation/test splits to disk.

    Steps:
      1. Read ``csv_file`` and drop the columns at positions 0, 3 and 4.
      2. Label encode the binary categorical columns.
      3. Split the rows 60% / 20% / 20% into train / validation / test
         sets (fixed ``random_state`` so the split is reproducible).
      4. Save the feature/target tensors to ``data/ProcessedData.pt`` and the
         matching file names to ``data/ProcessedDataFileNames.pt``. The file
         names of each split are also written to
         ``data/x_<split>_file_names.csv`` (original row index included).

    Args:
        csv_file: Path of the CSV file to read.
        output_csv: Currently unused; kept so existing callers keep working.

    Returns:
        The DataFrame after dropping columns and label encoding (before
        splitting).

    NOTE: the file names are saved as pandas Series, so loading
    ``data/ProcessedDataFileNames.pt`` may require
    ``torch.load(..., weights_only=False)`` on recent PyTorch versions.
    """

    # Load CSV into a DataFrame
    data = pd.read_csv(csv_file)

    # Drop specified columns by position (column indexing starts from 0)
    columns_to_drop = [0, 3, 4]
    processed_data = data.drop(data.columns[columns_to_drop], axis=1)

    # Define binary categorical columns for label encoding
    binary_categorical_columns = ['has_tiktok', 'remembers_disco', 'uses_skincare']

    # Label encode binary categorical columns (the encoder is re-fit per column)
    label_encoder = LabelEncoder()
    for col in binary_categorical_columns:
        processed_data[col] = label_encoder.fit_transform(processed_data[col])

    # Specify features and target variable
    features = ['filename', 'num_haircuts_life', 'has_tiktok', 'remembers_disco', 'uses_skincare', 'max_annual_earnings']
    target_variable = 'age'

    # Split the data into features and target variable
    x = processed_data[features]
    y = processed_data[target_variable]

    # Split the data into training (60%), validation (20%), and testing (20%) sets
    x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=0.6, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, train_size=0.5, random_state=0)

    splits = {
        'train': (x_train, y_train),
        'val': (x_val, y_val),
        'test': (x_test, y_test),
    }

    # All outputs go to the 'data' directory; make sure it exists.
    os.makedirs('data', exist_ok=True)

    file_names_dict = {}
    for split_name, (x_split, _) in splits.items():
        file_names = x_split['filename']

        # Keep the side CSV exactly as before (row index + 'filename' column).
        file_names.to_csv(f'data/x_{split_name}_file_names.csv')

        # Drop the shuffled original row labels directly instead of round
        # tripping through CSV, so entry i lines up with row i of the tensors.
        file_names_dict[f'x_{split_name}_file_names'] = file_names.reset_index(drop=True)

    # Convert the numeric features (file names removed) and targets to tensors
    data_dict = {
        f'x_{split_name}': torch.Tensor(x_split.drop(columns=['filename']).to_numpy())
        for split_name, (x_split, _) in splits.items()
    }
    data_dict.update({
        f'y_{split_name}': torch.Tensor(y_split.to_numpy())
        for split_name, (_, y_split) in splits.items()
    })

    # Save the data and file names to pt files
    torch.save(data_dict, 'data/ProcessedData.pt')
    torch.save(file_names_dict, 'data/ProcessedDataFileNames.pt')

    return processed_data

def pre_process_images():
    """
    Load the saved splits from ``data/ProcessedData.pt``.

    Placeholder: the loaded object is not used yet and nothing is returned.
    """
    processed_splits = torch.load('data/ProcessedData.pt')
    

if __name__ == "__main__":
    # Paths used by the preprocessing steps below.
    csv_file_path = 'data/UTKFaceAugmented.csv' # Input CSV file
    output_csv_file = 'data/ProcessedData.csv'  # Output CSV path handed to process_csv
    image_folder_path = 'data/images'           # Folder containing all images (not referenced in this block)

    # Run the CSV processing first: pre_process_images loads the
    # data/ProcessedData.pt file that process_csv writes.
    processed_data = process_csv(csv_file_path, output_csv_file)
    pre_process_images()

