In [18]:

import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from utils import preprocess_and_save_image
import torch



In [19]:
# Load the augmented UTKFace metadata table (image filename plus tabular attributes).
# os.path.join keeps the relative path valid on both Windows and POSIX systems;
# a hard-coded backslash is treated as part of the file name outside Windows.
df = pd.read_csv(os.path.join('data', 'UTKFaceAugmented.csv'))

# Split Dataframe to Input and Target Features 
## Framed as a Regression Task (Target = age)

In [20]:
# Quick look at the schema and the first rows before separating inputs from the target.
for preview in (df.columns, df.head()):
    print(preview)

# Regression target: the 'age' column.
target = df['age']
# All remaining columns except the target and 'age_range'
# (which appears to be a binned version of age, so it would leak the target).
features = df.drop(columns=['age', 'age_range'])

Index(['Unnamed: 0', 'filename', 'age', 'gender', 'race', 'age_range',
       'num_haircuts_life', 'has_tiktok', 'remembers_disco', 'uses_skincare',
       'max_annual_earnings'],
      dtype='object')
   Unnamed: 0                                filename  age  gender   race  \
0           0  100_0_0_20170112213500903.jpg.chip.jpg  100    male  white   
1           1  100_0_0_20170112215240346.jpg.chip.jpg  100    male  white   
2           2  100_1_0_20170110183726390.jpg.chip.jpg  100  female  white   
3           3  100_1_0_20170112213001988.jpg.chip.jpg  100  female  white   
4           4  100_1_0_20170112213303693.jpg.chip.jpg  100  female  white   

  age_range  num_haircuts_life has_tiktok remembers_disco uses_skincare  \
0   100-119                360         no              no            no   
1   100-119                627         no              no            no   
2   100-119                687         no             yes            no   
3   100-119                710     

# Data Preprocessing for Model 1 - Linear Regression Model #

In [21]:
# Tabular (non-image) columns considered as predictors for the linear regression model.
cm_features = ['gender', 'race', 'num_haircuts_life', 'has_tiktok', 'remembers_disco', 'uses_skincare', 'max_annual_earnings']
# Take an explicit copy: cm_df has columns assigned on it in later cells, and doing
# that on a slice of df raises pandas' SettingWithCopyWarning and leaves it unclear
# which frame is actually modified.
cm_df = df[cm_features].copy()

print(cm_df.head())

   gender   race  num_haircuts_life has_tiktok remembers_disco uses_skincare  \
0    male  white                360         no              no            no   
1    male  white                627         no              no            no   
2  female  white                687         no             yes            no   
3  female  white                710         no              no            no   
4  female  white                614         no              no            no   

   max_annual_earnings  
0         32890.160162  
1         29870.803247  
2         62930.622654  
3         31105.957009  
4         63977.673549  


#### One-Hot Encode Categorical Features

In [22]:
# Columns holding string categories rather than numbers.
categorical_features = ['gender', 'race', 'has_tiktok', 'remembers_disco', 'uses_skincare']

# List the distinct values of each categorical column to decide how to encode it.
print('Categorical Features')
for column_name in categorical_features:
    distinct_values = cm_df[column_name].unique()
    print(column_name, distinct_values, sep='\n')

Categorical Features
gender
['male' 'female']
race
['white' 'asian' 'black' 'indian' 'other']
has_tiktok
['no' 'yes']
remembers_disco
['no' 'yes']
uses_skincare
['no' 'yes']


#### Convert 2-state classification columns to binary ints

In [23]:
# Encode the yes/no columns as 0/1. Series.map turns any value that is not a key
# of the mapping into NaN, so only 'no' and 'yes' are expected here.
# NOTE(review): assigning through .loc[:, col] may leave the column with object
# dtype depending on the pandas version; the dtype-conversion cell further down
# casts object columns to int.
yes_no_to_int = {'no': 0, 'yes': 1}
for binary_column in ('has_tiktok', 'remembers_disco', 'uses_skincare'):
    cm_df.loc[:, binary_column] = cm_df[binary_column].map(yes_no_to_int)

#### Convert the rest of the multi-state categorical columns to one hot encoding 

In [24]:

# Expand the remaining string-valued columns into one indicator column per category.
multi_categorical_columns = ['race', 'gender']
cm_df = pd.get_dummies(cm_df, columns=multi_categorical_columns)
# Keep column names in snake_case by swapping any hyphen for an underscore.
cm_df.columns = [label.replace('-', '_') for label in cm_df.columns]


#### Ensure all feature datatypes are numeric now (booleans and objects cast to int)

In [25]:

# Cast every boolean (one-hot) or object (mapped yes/no) column to int so the whole
# frame is numeric. The loop variable is deliberately not called `features`: that
# name already holds the input-feature DataFrame built when the target was split
# off, and reusing it here would silently replace that frame with a column-name string.
for column_name in cm_df.columns:
    column_dtype = cm_df[column_name].dtype
    if (column_dtype == bool) or (column_dtype == object):
        cm_df[column_name] = cm_df[column_name].astype(int)

print('\nDataframe after conversion preprocessing and one hot encoding')
print(cm_df.dtypes)



Dataframe after conversion preprocessing and one hot encoding
num_haircuts_life        int64
has_tiktok               int32
remembers_disco          int32
uses_skincare            int32
max_annual_earnings    float64
race_asian               int32
race_black               int32
race_indian              int32
race_other               int32
race_white               int32
gender_female            int32
gender_male              int32
dtype: object


#### Check for missing values in the dataset, in case NaN values need to be dropped or interpolated

In [26]:
# Count the NaN entries in each column; all zeros means nothing to drop or impute.
missing_per_column = cm_df.isna().sum()
print(missing_per_column)

num_haircuts_life      0
has_tiktok             0
remembers_disco        0
uses_skincare          0
max_annual_earnings    0
race_asian             0
race_black             0
race_indian            0
race_other             0
race_white             0
gender_female          0
gender_male            0
dtype: int64


#### Initialize Bias Column (Y-intercept) for Linear Regression Model

In [27]:
# Append a constant column of ones so the model can learn an intercept term.
# NOTE(review): the one-hot groups (race_*, gender_*) were created without dropping
# a reference category, so each group sums to 1 and is collinear with this column;
# confirm the downstream regression tolerates that (e.g. gradient descent or
# regularisation rather than a plain normal-equation solve).
cm_df = cm_df.assign(bias=1)

### Divide Linear Regression Data into Train, Validation, Test Set data 

In [28]:
# No stratification: the target (age) is continuous, so this is a regression problem.
# First hold out 20% of the rows, then halve that hold-out into validation and test.
holdout_split = dict(test_size=0.2, random_state=42)
val_test_split = dict(test_size=0.5, random_state=42)

cm_x_train, cm_x_check, cm_y_train, cm_y_check = train_test_split(
    cm_df, target, **holdout_split
)
cm_x_val, cm_x_test, cm_y_val, cm_y_test = train_test_split(
    cm_x_check, cm_y_check, **val_test_split
)


#### Apply Standardization to Continuous Numerical Data (so that no single feature dominates the learning process by having a larger scale than the others)

In [29]:
# Standardise only the continuous columns; the 0/1 indicator columns are left as-is.
columns_to_scale = ['num_haircuts_life', 'max_annual_earnings']

# Fit on the training split only, so validation/test statistics never influence
# the scaling parameters.
scaler = StandardScaler()
scaler.fit(cm_x_train[columns_to_scale])

# Apply the same fitted transformation to every split.
for split_frame in (cm_x_train, cm_x_val, cm_x_test):
    split_frame[columns_to_scale] = scaler.transform(split_frame[columns_to_scale])

print(cm_x_train.head())



       num_haircuts_life  has_tiktok  remembers_disco  uses_skincare  \
5096           -0.298800           0                0              0   
19586           1.915160           0                0              1   
9835           -0.055341           1                0              0   
13631          -0.245544           1                0              1   
9807           -0.390098           0                0              1   

       max_annual_earnings  race_asian  race_black  race_indian  race_other  \
5096             -0.171502           0           0            0           0   
19586            -0.257435           0           0            1           0   
9835             -0.012558           0           0            0           0   
13631             0.075031           0           0            1           0   
9807             -0.089092           0           0            0           0   

       race_white  gender_female  gender_male  bias  
5096            1              0      

#### Store Train, Val, and Test Sets

In [30]:
# Persist the linear-regression splits for the model-training code.
# Each split is replaced by its NumPy representation under the same variable name.
cm_x_train, cm_x_test, cm_x_val = (
    split.to_numpy() for split in (cm_x_train, cm_x_test, cm_x_val)
)
cm_y_train, cm_y_test, cm_y_val = (
    split.to_numpy() for split in (cm_y_train, cm_y_test, cm_y_val)
)

directory = 'tensor_collection'
os.makedirs(directory, exist_ok=True)

# NOTE(review): the objects written below are NumPy arrays, not torch tensors;
# torch.save serialises them as-is. Confirm the loading code expects arrays
# (recent PyTorch versions may need weights_only=False in torch.load for these).
cm_outputs = {
    '/cm_x_train.pt': cm_x_train,
    '/cm_x_test.pt': cm_x_test,
    '/cm_y_train.pt': cm_y_train,
    '/cm_y_test.pt': cm_y_test,
    '/cm_x_val.pt': cm_x_val,
    '/cm_y_val.pt': cm_y_val,
}
for file_suffix, split_array in cm_outputs.items():
    torch.save(split_array, directory + file_suffix)

# Data Preprocessing for Model 2 - Convolutional Neural Network
### AND
# Data Preprocessing for Model 3 - Multi-Modal Neural Network

In [31]:
# Start the neural-network table from the encoded (unscaled) features, without the
# intercept column that only the linear model needs.
# drop() without inplace returns a new frame, so cm_df is left untouched and this
# cell can be re-run safely; aliasing cm_df and dropping in place would mutate
# cm_df too and raise KeyError on a second run.
nn_df = cm_df.drop(columns=['bias'])
print(nn_df.head())

   num_haircuts_life  has_tiktok  remembers_disco  uses_skincare  \
0                360           0                0              0   
1                627           0                0              0   
2                687           0                1              0   
3                710           0                0              0   
4                614           0                0              0   

   max_annual_earnings  race_asian  race_black  race_indian  race_other  \
0         32890.160162           0           0            0           0   
1         29870.803247           0           0            0           0   
2         62930.622654           0           0            0           0   
3         31105.957009           0           0            0           0   
4         63977.673549           0           0            0           0   

   race_white  gender_female  gender_male  
0           1              0            1  
1           1              0            1  
2       

In [32]:
# Source and destination folders for the face images. Paths are built with
# os.path.join so the notebook runs on both Windows and POSIX systems
# (on Windows the resulting strings are the same as with a literal backslash).
in_directory = os.path.join('data', 'images')
out_directory = 'preprocessed_images'
os.makedirs(out_directory, exist_ok=True)

# Attach each row's image file name (aligned on the shared row index).
nn_df['filename'] = df['filename'].astype(str)

# Preprocess every image once. A plain loop is used because the helper is called
# only for its side effect; no per-row result needs to be collected.
# NOTE(review): preprocess_and_save_image comes from utils; presumably it reads the
# first path and writes the processed image to the second — verify against utils.
for image_name in nn_df['filename']:
    preprocess_and_save_image(
        os.path.join(in_directory, image_name),
        os.path.join(out_directory, image_name),
    )

# Store the path of the preprocessed image instead of the bare file name.
nn_df['filename'] = nn_df['filename'].map(
    lambda image_name: os.path.join(out_directory, image_name)
)


print(nn_df.head())

   num_haircuts_life  has_tiktok  remembers_disco  uses_skincare  \
0                360           0                0              0   
1                627           0                0              0   
2                687           0                1              0   
3                710           0                0              0   
4                614           0                0              0   

   max_annual_earnings  race_asian  race_black  race_indian  race_other  \
0         32890.160162           0           0            0           0   
1         29870.803247           0           0            0           0   
2         62930.622654           0           0            0           0   
3         31105.957009           0           0            0           0   
4         63977.673549           0           0            0           0   

   race_white  gender_female  gender_male  \
0           1              0            1   
1           1              0            1   
2    

#### Divide data into train, val, test datasets

In [33]:
# 80/10/10 train/validation/test split, using the same test_size and random_state
# values as the linear-regression split.
nn_holdout_split = dict(test_size=0.2, random_state=42)
nn_val_test_split = dict(test_size=0.5, random_state=42)

nn_x_train, nn_x_check, nn_age_train, nn_age_check = train_test_split(
    nn_df, target, **nn_holdout_split
)
nn_x_val, nn_x_test, nn_age_val, nn_age_test = train_test_split(
    nn_x_check, nn_age_check, **nn_val_test_split
)

#### Save the Train, Val, Test Sets

In [34]:
# The feature splits contain the image path (a string column), so they are
# written as CSV files rather than as numeric arrays.
nn_feature_outputs = {
    '/nn_x_train.csv': nn_x_train,
    '/nn_x_test.csv': nn_x_test,
    '/nn_x_val.csv': nn_x_val,
}
for file_suffix, split_frame in nn_feature_outputs.items():
    split_frame.to_csv(directory + file_suffix, index=False)

# The age targets are purely numeric: convert each Series to a NumPy array
# (same variable names) and serialise it with torch.save.
nn_age_train, nn_age_test, nn_age_val = (
    ages.to_numpy() for ages in (nn_age_train, nn_age_test, nn_age_val)
)

nn_target_outputs = {
    '/nn_y_train.pt': nn_age_train,
    '/nn_y_test.pt': nn_age_test,
    '/nn_y_val.pt': nn_age_val,
}
for file_suffix, age_array in nn_target_outputs.items():
    torch.save(age_array, directory + file_suffix)
