# Age Classification Project

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

In [2]:
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [4]:
from skimage.feature import canny  # for extracting the canny features of the image

##### The data is present in the folder combined_faces with each image of size 200x200 pixels and named after the images age.

In [6]:
# accessing all images file names

combined_faces_path = "combined_faces"
combined_faces_image_names = os.listdir(combined_faces_path)

In [7]:
len(combined_faces_image_names)

33486

In [8]:
# defining a function to return class labels corresponding to age ranges.

def class_labels(age):
    if 1 <= age <= 2:
        return 0
    elif 3 <= age <= 9:
        return 1
    elif 10 <= age <= 20:
        return 2
    elif 21 <= age <= 25:
        return 3
    elif 26 <= age <= 27:
        return 4
    elif 28 <= age <= 31:
        return 5
    elif 32 <= age <= 36:
        return 6
    elif 37 <= age <= 45:
        return 7
    elif 46 <= age <= 54:
        return 8
    elif 55 <= age <= 65:
        return 9
    else:
        return 10

In [14]:
# creating a dataframe consisting of filenames and corresponding ages and classes.

master_df = pd.DataFrame()
master_df['filename'] = combined_faces_image_names
master_df['age'] = master_df['filename'].map(lambda img_name : int(img_name.split("_")[0]))
master_df['target'] = master_df['age'].map(class_labels)

master_df.head()

Unnamed: 0,filename,age,target
0,100_1.jpg,100,10
1,100_10.jpg,100,10
2,100_11.jpg,100,10
3,100_12.jpg,100,10
4,100_13.jpg,100,10


In [15]:
# shuffling the rows of master_df to mix the dataset

master_df = shuffle(master_df, random_state=11).reset_index(drop=True)
master_df.head()

Unnamed: 0,filename,age,target
0,43_174.jpg,43,7
1,37_21.jpg,37,7
2,1_1765.jpg,1,0
3,38_403.jpg,38,7
4,42_127.jpg,42,7


In [16]:
# seperating features and targets

X = master_df[['filename', 'age']]
Y = master_df['target']

In [17]:
# splitting the dataset into training and testing

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [19]:
print(x_train.shape)
print(x_test.shape)

(26788, 2)
(6698, 2)


In [20]:
# checking the distribution in all the classes for train data

y_train.value_counts(normalize=True)

3     0.104450
7     0.097282
4     0.095491
0     0.094520
6     0.093400
2     0.093101
5     0.091086
1     0.084217
9     0.083097
8     0.082686
10    0.080670
Name: target, dtype: float64

In [22]:
# checking the distribution in all the classes for test data

y_test.value_counts(normalize=True)

3     0.100926
0     0.098537
4     0.098388
2     0.095850
5     0.093013
7     0.089728
8     0.087638
6     0.087190
9     0.085100
1     0.083607
10    0.080024
Name: target, dtype: float64

##### Converting the filtered images into scalars to fit them to a ML Classifier.

##### To do this I will break each 200x200 pixels image into sections of 10x10 pixels each and for the 400 resulting section I will calculate the mean and stdevs.

##### This 800 unique scalar features of each image will be used for classification.
##### The images will be converted to canny edge images for extractiong mean and stdevs.

In [24]:
# function to break 200x200 pixels into sections of 10x10 each
# and calculate mean and std
# INPUT: img of 200x200 pixel size
# OUTPUT: features array of means and stds of 400 sections

def features_grid(img):
    features = np.array([], dtype='float')
    section = 1
    
    for y in range(0, img.shape[0], 10):
        for x in range(0, img.shape[1], 10):
            
            # croppint the img
            sec_img = img[y:y+10, x:x+10]
            
            # calc the mean and std
            sec_mean = np.mean(sec_img)
            sec_std = np.std(sec_img)
            
            # appending the features array
            features = np.append(features, [sec_mean, sec_std])
            
        
    return features

In [115]:
# function to loop through images in the dataset and extract the canny edges mean and std values from 10x10 pixel sections of each image

def extract_canny_edges(filename_series):
    
    # array of shape (1, 801) to store 400 canny edges mean values, 400 canny edges stdev values and 1 age value
    all_imgs = np.zeros((1, 801), dtype='float')
    
    progress = 0
    
    for img_name in filename_series:
        
        # Defining a path to the image and reading in the coloured image.
        
        img_path = os.path.join(combined_faces_path, img_name)
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        
        # Converting the coloured image to a grayscale image.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Converting the grayscale image to a canny edges filtered image.
        img = canny(img, sigma=0.9)
        
        # Using features_grid function for extracting the features (mean and stdev values of all 10x10 pixel sections from the image) from the canny edges filtered image.
        img_features = features_grid(img)
        
        # Adding the actual age value (from the image name) into the features array.
        age = int(img_name.split("_")[0])
        img_features = np.append(img_features, age)
        
        img_features = img_features.reshape(1, img_features.shape[0])
        
        
        # Adding the image's features into the all_imgs features array defined above.
        all_imgs = np.append(all_imgs, img_features, axis=0)
        
        # Keeping track of progress and printing relevant statements for the user.
        progress += 1
        if progress % 1000 == 0:
            print(f"Images processed for features extraction: {progress} of {len(filename_series)}")
    
    
    # Getting rid of the first row of zeros created while defining the all_imgs array above.    
    all_imgs = all_imgs[1:]

    return all_imgs

In [31]:
# Extracting the canny edge features from images in the training dataset.

train_imgs = extract_canny_edges(x_train['filename'])

Images processed for features extraction: 1000 of 26788
Images processed for features extraction: 2000 of 26788
Images processed for features extraction: 3000 of 26788
Images processed for features extraction: 4000 of 26788
Images processed for features extraction: 5000 of 26788
Images processed for features extraction: 6000 of 26788
Images processed for features extraction: 7000 of 26788
Images processed for features extraction: 8000 of 26788
Images processed for features extraction: 9000 of 26788
Images processed for features extraction: 10000 of 26788
Images processed for features extraction: 11000 of 26788
Images processed for features extraction: 12000 of 26788
Images processed for features extraction: 13000 of 26788
Images processed for features extraction: 14000 of 26788
Images processed for features extraction: 15000 of 26788
Images processed for features extraction: 16000 of 26788
Images processed for features extraction: 17000 of 26788
Images processed for features extraction

In [34]:
# Extracting the canny edge features from images in the testing dataset.

test_imgs = extract_canny_edges(x_test['filename'])

Images processed for features extraction: 1000 of 6698
Images processed for features extraction: 2000 of 6698
Images processed for features extraction: 3000 of 6698
Images processed for features extraction: 4000 of 6698
Images processed for features extraction: 5000 of 6698
Images processed for features extraction: 6000 of 6698


In [35]:
print(train_imgs.shape)
print(test_imgs.shape)

(26788, 801)
(6698, 801)


In [37]:
# Creating a list of columns names for the features arrays defined above.
# The column names correspond to the sectioned image's mean and stdev values.
# Last column is the age to be converted to target class label in the model later.

feature_names = []
section = 1
    
for y in range(0, 200, 10):
    for x in range(0, 200, 10):
        feature_names.append(f"sec{section}_mean")
        feature_names.append(f"sec{section}_std")
        section += 1

feature_names.append('age')

In [39]:
feature_names[-10:]

['sec396_std',
 'sec397_mean',
 'sec397_std',
 'sec398_mean',
 'sec398_std',
 'sec399_mean',
 'sec399_std',
 'sec400_mean',
 'sec400_std',
 'age']

In [40]:
len(feature_names)

801

## Using Random Forest and GridSearchCV for Classification

In [56]:
# Converting the numpy arrays to pandas dataframe.

train_df = pd.DataFrame(train_imgs, columns=feature_names)
test_df = pd.DataFrame(test_imgs, columns=feature_names)

In [57]:
train_df.head()

Unnamed: 0,sec1_mean,sec1_std,sec2_mean,sec2_std,sec3_mean,sec3_std,sec4_mean,sec4_std,sec5_mean,sec5_std,...,sec396_std,sec397_mean,sec397_std,sec398_mean,sec398_std,sec399_mean,sec399_std,sec400_mean,sec400_std,age
0,0.02,0.14,0.2,0.4,0.15,0.357071,0.1,0.3,0.14,0.346987,...,0.384187,0.09,0.286182,0.15,0.357071,0.0,0.0,0.03,0.170587,32.0
1,0.27,0.443959,0.19,0.392301,0.15,0.357071,0.0,0.0,0.0,0.0,...,0.3,0.14,0.346987,0.11,0.31289,0.1,0.3,0.0,0.0,38.0
2,0.09,0.286182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.11,0.31289,0.06,0.237487,20.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.420833,0.04,0.195959,0.0,0.0,0.09,0.286182,0.0,0.0,67.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0


In [58]:
train_df.dtypes.unique()

array([dtype('float64')], dtype=object)

In [59]:
# Creating a column of target class values using the function defined above.

train_df['target'] = train_df['age'].map(class_labels)
test_df['target'] = test_df['age'].map(class_labels)

In [60]:
train_df.head()

Unnamed: 0,sec1_mean,sec1_std,sec2_mean,sec2_std,sec3_mean,sec3_std,sec4_mean,sec4_std,sec5_mean,sec5_std,...,sec397_mean,sec397_std,sec398_mean,sec398_std,sec399_mean,sec399_std,sec400_mean,sec400_std,age,target
0,0.02,0.14,0.2,0.4,0.15,0.357071,0.1,0.3,0.14,0.346987,...,0.09,0.286182,0.15,0.357071,0.0,0.0,0.03,0.170587,32.0,6
1,0.27,0.443959,0.19,0.392301,0.15,0.357071,0.0,0.0,0.0,0.0,...,0.14,0.346987,0.11,0.31289,0.1,0.3,0.0,0.0,38.0,7
2,0.09,0.286182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.11,0.31289,0.06,0.237487,20.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.04,0.195959,0.0,0.0,0.09,0.286182,0.0,0.0,67.0,10
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,5


In [65]:
# ML model preparation

x_train_rf = train_df.drop(columns=['age', 'target'])
y_train_rf = train_df['target']

x_test_rf = test_df.drop(columns=['age', 'target'])
y_test_rf = test_df['target']

In [63]:
x_train_rf.head()

Unnamed: 0,sec1_mean,sec1_std,sec2_mean,sec2_std,sec3_mean,sec3_std,sec4_mean,sec4_std,sec5_mean,sec5_std,...,sec396_mean,sec396_std,sec397_mean,sec397_std,sec398_mean,sec398_std,sec399_mean,sec399_std,sec400_mean,sec400_std
0,0.02,0.14,0.2,0.4,0.15,0.357071,0.1,0.3,0.14,0.346987,...,0.18,0.384187,0.09,0.286182,0.15,0.357071,0.0,0.0,0.03,0.170587
1,0.27,0.443959,0.19,0.392301,0.15,0.357071,0.0,0.0,0.0,0.0,...,0.1,0.3,0.14,0.346987,0.11,0.31289,0.1,0.3,0.0,0.0
2,0.09,0.286182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.31289,0.06,0.237487
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.23,0.420833,0.04,0.195959,0.0,0.0,0.09,0.286182,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# Checking the distribution of classes in y_train.

y_train_rf.value_counts()

3     2798
7     2606
4     2558
0     2532
6     2502
2     2494
5     2440
1     2256
9     2226
8     2215
10    2161
Name: target, dtype: int64

In [76]:
# Checking the distribution of classes to ensure it is same as y_test.

y_train_rf.value_counts(normalize=True)

3     0.104450
7     0.097282
4     0.095491
0     0.094520
6     0.093400
2     0.093101
5     0.091086
1     0.084217
9     0.083097
8     0.082686
10    0.080670
Name: target, dtype: float64

In [68]:
# Checking the distribution of classes to ensure it is same as y_train.

y_test_rf.value_counts(normalize=True)

3     0.100926
0     0.098537
4     0.098388
2     0.095850
5     0.093013
7     0.089728
8     0.087638
6     0.087190
9     0.085100
1     0.083607
10    0.080024
Name: target, dtype: float64

In [70]:
# Scaling X_train to the standard scale.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
x_train_sc = ss.fit_transform(x_train_rf)

In [71]:
# Transforming X_test to the same scale.

x_test_sc = ss.transform(x_test_rf)

### Training the model using GridSearchCV and RandomForestClassifier

In [72]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [73]:
# Creating a RandomForestClassifier object.

rfc = RandomForestClassifier(# n_estimators=200,
                             # max_depth=5,
                             # ccp_alpha=0,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             random_state=17
                            )

In [74]:
# Establishing ranges of hyperparameters of RandomForestClassifier for GridSearchCV.

rfc_params = {'n_estimators' : [100, 200, 300],
              'max_depth' : [7, 9, 11],
              'ccp_alpha' : [0, 0.001, 0.01],
              # 'min_samples_split' : [2, 5, 10, 15, 20],
              # 'min_samples_leaf' : [2, 3, 4, 5, 6]
             }

In [75]:
# Creating a GridSearchCV object for the RandomForestClassifier object defined above.

rfc_gs = GridSearchCV(rfc, param_grid=rfc_params, n_jobs=-1, cv=5)

In [77]:
# Fitting X_train_sc and y_train on GridSearchCV object with RandomForestClassifier defined above.

rfc_gs.fit(x_train_sc, y_train_rf)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(ccp_alpha=0, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [5, 7, 9],
                         'n_estimators': [50, 100, 200]})

In [78]:
# Best combination of hyperparameters suggested by GridSearchCV.

rfc_gs.best_params_

{'max_depth': 9, 'n_estimators': 200}

In [79]:
# Best accuracy score obtained by the above combination of hyperparameters.

rfc_gs.best_score_

0.39271336746658153

In [80]:
# Scoring the model on training dataset.
# Training Accuracy

rfc_train_acc = rfc_gs.score(x_train_sc, y_train_rf)
rfc_train_acc

0.6364790204569211

In [81]:
# Actual Testing Accuracy

rfc_test_acc = rfc_gs.score(x_test_sc, y_test_rf)
rfc_test_acc

0.40848014332636606

In [82]:
# Summary scores from GridSearchCV with RandomForestClassifier.

print("RandomForestClassifier summary of accuracy scores:")
print(f"GridSearchCV best accuracy (cv=5) = {round(rfc_gs.best_score_, 3)}")
print("\nUsing GridSearchCV best params suggested,")
print(f"Training accuracy = {round(rfc_train_acc, 3)}")
# print(f"Est. Test accuracy (cv=5) = {round(rfc_est_test_acc , 3)}")
print(f"Testing accuracy = {round(rfc_test_acc, 3)}")

RandomForestClassifier summary of accuracy scores:
GridSearchCV best accuracy (cv=5) = 0.393

Using GridSearchCV best params suggested,
Training accuracy = 0.636
Testing accuracy = 0.408


In [83]:
# Generating predictions on testing dataset using the model above.

rfc_pred = rfc_gs.predict(x_test_sc)

In [89]:
print(len(rfc_pred))
rfc_pred

array([8, 8, 7, ..., 8, 3, 0], dtype=int64)

In [85]:
len(y_test_rf)

6698

In [88]:
# Generating a confusion matrix based on above predictions.
from sklearn.metrics import confusion_matrix

conf_mat_rfc = confusion_matrix(y_test_rf, rfc_pred)
conf_mat_rfc

array([[602,   8,   5,  23,  10,   0,   1,   7,   0,   2,   2],
       [123, 270,  40,  63,  10,   0,   0,  36,   5,   6,   7],
       [ 53,  38, 286, 152,  21,   1,   4,  48,  15,   9,  15],
       [ 25,  24,  33, 433,  69,   2,  14,  53,  10,  10,   3],
       [ 33,  11,  19, 293, 137,   1,  14, 120,  11,   8,  12],
       [ 21,  25,  26, 253,  67,  21,  18, 138,  24,  10,  20],
       [ 23,  12,  22, 194,  55,   4,  42, 158,  39,  24,  11],
       [ 20,  13,  18, 123,  59,   3,   7, 250,  52,  25,  31],
       [ 14,   8,  28,  71,  34,   1,   8, 152, 184,  32,  55],
       [ 15,   8,  16,  70,  28,   1,   6, 109,  48, 167, 102],
       [ 10,   6,  21,  27,  18,   1,   1,  70,  24,  14, 344]],
      dtype=int64)

# To check the predictions on a random set of images

In [90]:
# features for predicting the class of unknown img

feature_pred = []
section = 1
    
for y in range(0, 200, 10):
    for x in range(0, 200, 10):
        feature_pred.append(f"sec{section}_mean")
        feature_pred.append(f"sec{section}_std")
        section += 1


In [108]:
# function to loop through images in the dataset and extract the canny edges mean and std values from 10x10 pixel sections of each image

def extract_canny_edges_pred(filename_series):
    
    # array of shape (1, 801) to store 400 canny edges mean values, 400 canny edges stdev values
    all_imgs = np.zeros((1, 800), dtype='float') 
    
    progress = 0
    
    for img_name in filename_series:
        
        # Defining a path to the image and reading in the coloured image.
        
        img = cv2.imread(img_name)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        
        # Converting the coloured image to a grayscale image.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Converting the grayscale image to a canny edges filtered image.
        img = canny(img, sigma=0.9)
        
        # Using features_grid function for extracting the features from the canny edges filtered image.
        img_features = features_grid(img)
        
        
        img_features = img_features.reshape(1, img_features.shape[0])
        
        
        # Adding the image's features into the all_imgs features array defined above.
        all_imgs = np.append(all_imgs, img_features, axis=0)
        
        # Keeping track of progress and printing relevant statements for the user.
        progress += 1
        if progress % 5 == 0:
            print(f"Images processed for features extraction: {progress} of {len(filename_series)}")
    
    
    # Getting rid of the first row of zeros created while defining the all_imgs array above.    
    all_imgs = all_imgs[1:]

    return all_imgs

In [109]:
pred_imgs = ['images.jpg', 'images1.jpg', 'images2.jpg']
pred_img_series = pd.DataFrame(pred_imgs)

In [110]:
ce_imgs = extract_canny_edges_pred(pred_img_series[0])

In [111]:
ce_imgs_df = pd.DataFrame(ce_imgs, columns=feature_pred)

In [112]:
ce_imgs_df.head()

Unnamed: 0,sec1_mean,sec1_std,sec2_mean,sec2_std,sec3_mean,sec3_std,sec4_mean,sec4_std,sec5_mean,sec5_std,...,sec396_mean,sec396_std,sec397_mean,sec397_std,sec398_mean,sec398_std,sec399_mean,sec399_std,sec400_mean,sec400_std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.13,0.336303,0.1,0.3,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.286182
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06,0.237487,0.07,0.255147,0.12,0.324962,0.0,0.0,0.0,0.0


In [113]:
img_pred_sc = ss.fit_transform(ce_imgs_df)

In [114]:
predictions = rfc_gs.predict(img_pred_sc)
predictions

array([6, 7, 3], dtype=int64)