In [144]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [145]:
# Load the augmented UTKFace metadata table.
# Forward slashes are accepted on Windows as well as POSIX systems, so the
# notebook is no longer tied to the Windows-only backslash separator.
df = pd.read_csv('data/UTKFaceAugmented.csv')

In [146]:
# The regression target is the subject's age column.
target = df['age']

# Quick look at the available columns and the first few rows of the raw table.
print(df.columns, df.head(), sep='\n')

Index(['Unnamed: 0', 'filename', 'age', 'gender', 'race', 'age_range',
       'num_haircuts_life', 'has_tiktok', 'remembers_disco', 'uses_skincare',
       'max_annual_earnings'],
      dtype='object')
   Unnamed: 0                                filename  age  gender   race  \
0           0  100_0_0_20170112213500903.jpg.chip.jpg  100    male  white   
1           1  100_0_0_20170112215240346.jpg.chip.jpg  100    male  white   
2           2  100_1_0_20170110183726390.jpg.chip.jpg  100  female  white   
3           3  100_1_0_20170112213001988.jpg.chip.jpg  100  female  white   
4           4  100_1_0_20170112213303693.jpg.chip.jpg  100  female  white   

  age_range  num_haircuts_life has_tiktok remembers_disco uses_skincare  \
0   100-119                360         no              no            no   
1   100-119                627         no              no            no   
2   100-119                687         no             yes            no   
3   100-119                710     

In [147]:
# Columns of the raw table that are used as model inputs.
# NOTE(review): 'age_range' looks like a binned form of the 'age' target
# (rows with age 100 show '100-119' in the preview) — confirm that feeding it
# to the model is intended and not target leakage.
cm_features = ['gender', 'race', 'age_range', 'num_haircuts_life', 'has_tiktok', 'remembers_disco', 'uses_skincare', 'max_annual_earnings']

# Take an explicit copy: cm_df is modified in place by later cells, and
# assigning into a plain column selection of df raises SettingWithCopyWarning.
cm_df = df[cm_features].copy()

In [148]:
# Show the first rows of the selected feature columns.
cm_preview = cm_df.head()
print(cm_preview)

   gender   race age_range  num_haircuts_life has_tiktok remembers_disco  \
0    male  white   100-119                360         no              no   
1    male  white   100-119                627         no              no   
2  female  white   100-119                687         no             yes   
3  female  white   100-119                710         no              no   
4  female  white   100-119                614         no              no   

  uses_skincare  max_annual_earnings  
0            no         32890.160162  
1            no         29870.803247  
2            no         62930.622654  
3            no         31105.957009  
4            no         63977.673549  


In [149]:
categorical_features = ['gender', 'race', 'age_range', 'has_tiktok', 'remembers_disco', 'uses_skincare']
binary_columns = ['has_tiktok', 'remembers_disco', 'uses_skincare']
multi_categorical_columns = ['race', 'age_range', 'gender']

# Always start from the untouched column selection of the raw table so this
# cell can be re-run safely; deriving the encoded frame from its own previous
# output would fail on a second run because the original columns are gone.
raw_features = df[cm_features]

# Inspect the distinct values of every categorical column before encoding.
for feature in categorical_features:
    print(feature)
    print(raw_features[feature].unique())

print(raw_features.dtypes)

#convert 2 state classification columns to binary
yes_no_codes = {'no': 0, 'yes': 1}
binary_encoded = {col: raw_features[col].map(yes_no_codes) for col in binary_columns}

# .map() turns anything that is not 'yes'/'no' into NaN; fail loudly with the
# offending column names instead of an opaque integer-cast error.
unmapped = [col for col, codes in binary_encoded.items() if codes.isna().any()]
if unmapped:
    raise ValueError(f"unexpected values (expected only 'yes'/'no') in columns: {unmapped}")

# assign() builds a new frame (no in-place write into a slice of df, hence no
# SettingWithCopyWarning) and keeps the columns in their original positions.
cm_df = raw_features.assign(**{col: codes.astype(int) for col, codes in binary_encoded.items()})

#convert the rest of the categorical columns to one hot encoding
# dtype=int yields integer indicator columns directly, independent of the
# pandas version's default dummy dtype.
cm_df = pd.get_dummies(cm_df, columns=multi_categorical_columns, dtype=int)
cm_df.columns = cm_df.columns.str.replace('-', '_')

print(cm_df.dtypes)



gender
['male' 'female']
race
['white' 'asian' 'black' 'indian' 'other']
age_range
['100-119' '0-19' '20-29' '30-39' '40-59' '60-79' '80-99']
has_tiktok
['no' 'yes']
remembers_disco
['no' 'yes']
uses_skincare
['no' 'yes']
gender                  object
race                    object
age_range               object
num_haircuts_life        int64
has_tiktok              object
remembers_disco         object
uses_skincare           object
max_annual_earnings    float64
dtype: object
num_haircuts_life        int64
has_tiktok               int32
remembers_disco          int32
uses_skincare            int32
max_annual_earnings    float64
race_asian               int32
race_black               int32
race_indian              int32
race_other               int32
race_white               int32
age_range_0_19           int32
age_range_100_119        int32
age_range_20_29          int32
age_range_30_39          int32
age_range_40_59          int32
age_range_60_79          int32
age_range_80_99     

In [150]:
# Count missing values per column of the encoded feature frame.
missing_per_column = cm_df.isna().sum()
print(missing_per_column)

num_haircuts_life      0
has_tiktok             0
remembers_disco        0
uses_skincare          0
max_annual_earnings    0
race_asian             0
race_black             0
race_indian            0
race_other             0
race_white             0
age_range_0_19         0
age_range_100_119      0
age_range_20_29        0
age_range_30_39        0
age_range_40_59        0
age_range_60_79        0
age_range_80_99        0
gender_female          0
gender_male            0
dtype: int64


In [151]:
#do train and test split
X_train, X_test, y_train, y_test = train_test_split(cm_df, target, test_size=0.2, random_state=42)

# The split returns row selections of cm_df; take explicit copies so the
# column assignments below write into independent frames and do not raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

#scale the data
scaler = StandardScaler()

#standardize the data that are not one hot encoded
columns_to_scale = ['num_haircuts_life', 'max_annual_earnings']
# Fit on the training rows only, then reuse those statistics for the test
# rows, so no information from the test set leaks into the scaling.
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

print(X_train.head())



       num_haircuts_life  has_tiktok  remembers_disco  uses_skincare  \
5096           -0.298800           0                0              0   
19586           1.915160           0                0              1   
9835           -0.055341           1                0              0   
13631          -0.245544           1                0              1   
9807           -0.390098           0                0              1   

       max_annual_earnings  race_asian  race_black  race_indian  race_other  \
5096             -0.171502           0           0            0           0   
19586            -0.257435           0           0            1           0   
9835             -0.012558           0           0            0           0   
13631             0.075031           0           0            1           0   
9807             -0.089092           0           0            0           0   

       race_white  age_range_0_19  age_range_100_119  age_range_20_29  \
5096            1  