In [86]:
import pandas as pd
import os
import numpy as np
import random
from shutil import copy
import math
from keras.preprocessing.image import ImageDataGenerator


In [25]:
input_path = "/home/anisha/MlProject/data/training/"
profileCSV = "profile/profile.csv"

In [26]:
def read_images(attribute):
    #load profile information into a dataframe
    profile_file_path = os.path.join(input_path+profileCSV)
    input = pd.read_csv(profile_file_path, usecols=attribute)
    return(input)

In [50]:
import os
input_image_dataframe = read_images(['userid', 'gender','age'])

In [51]:
def partitionData(dataInput, testPercentage, validationPercentage):
    '''
    slices a numpy array based on the percentage size of each bucket

    :param dataInput: numpy array
    :param testPercentage: 0-1, to range between 0% to 100%
    :param validationPercentage: 0,1, to range between 0% to 100%
    :return: sliced nunmpy array based on the training and validation percentage
    '''

    totalLen = len(dataInput)
    testSlice = slice(0, int(totalLen * testPercentage))
    validationSlice = slice(int(totalLen * testPercentage), int(totalLen * (validationPercentage + testPercentage)))
    trainSlice = slice(int(totalLen * (validationPercentage + testPercentage)), totalLen)
    return (dataInput[testSlice], dataInput[validationSlice], dataInput[trainSlice])

In [41]:
def partition_data(input_df, testPercentage, validationPercentage):
    '''
    splits input dataframe based on the percentage size of each bucket

    :param input: dataframe
    :param testPercentage: 0-1, to range between 0% to 100%
    :param validationPercentage: 0-1, to range between 0% to 100%
    :return: dataframes for test, train and validation
    '''

    # arranging data in indices to split
    all_Ids = np.arange(input_df.shape[0])

    # test and train ids
    random.shuffle(all_Ids)
    (test_Ids, validate_Ids, train_Ids) = partitionData(all_Ids, testPercentage, validationPercentage)

    data_test = input_df.loc[test_Ids, :]
    data_validate = input_df.loc[validate_Ids, :]
    data_train = input_df.loc[train_Ids, :]

    if validationPercentage == 0:
        return (data_test, data_train)
    else:
        return (data_test, data_validate, data_train)


In [45]:
def images_augmentation():
    dest_train_dir = input_path+ "image-source/train"
    dest_validation_dir = input_path + "image-source/validation"
    dest_test_dir = input_path + "image-source/test"

    (image_test_dataframe, image_validate_dataframe, image_train_dataframe) = partition_data(input_image_dataframe, 0.1, 0.1)
    nb_train_samples = image_train_dataframe.shape[0]
    nb_validation_samples = image_validate_dataframe.shape[0]
    print(nb_train_samples)
    print(nb_validation_samples)

    #copy_to_folder(image_test_dataframe, dest_test_dir)
    copy_to_folder(image_validate_dataframe, dest_validation_dir,"validate")
    copy_to_folder(image_train_dataframe, dest_train_dir, "train")

In [53]:

def copy_to_folder(image_dataframe, dest_dir, dirName):
    src_dir = input_path + "image/"
    count_female = 0
    count_male = 0
    for id in image_dataframe.userid:
        #Get the file
        image_file_name = id + ".jpg"
        image_file_path = src_dir+image_file_name
        if image_dataframe.loc[image_dataframe['userid'] == id, 'gender'].iloc[0] == 1.0:
            copy(image_file_path, dest_dir+"/female/")
            count_female+=1
        else:
            copy(image_file_path, dest_dir+"/male/")
            count_male+=1
    print("dir:"+dirName)
    print(str(count_male))
    print(str(count_female))


In [56]:
images_augmentation()

7600
950
dir:validate
402
548
dir:train
3212
4388


In [87]:
input_path = "/home/anisha/MlProject/data/training/"
train_data_dir = "image-source/train"
validation_data_dir = "image-source/validation"
test_data_dir = "image-source/test"


top_model_weights_path = 'bottleneck_gender_model.h5'
image_size = 150
epochs = 10
batch_size = 16
weights_path = 'vgg16_weights.h5'
nb_train_samples = 7600
nb_validation_samples = 950
nb_train_female = 4388
nb_train_male = 3212
nb_validation_female = 548
nb_validation_male = 402

In [93]:
def save_bottleneck_features():
    datagen = ImageDataGenerator(rescale=1. / 255)

    # build the VGG16 network
    model = applications.VGG16(include_top=False, weights='imagenet', input_shape=(image_size, image_size, 3))

    generator = datagen.flow_from_directory(
        input_path+train_data_dir,
        target_size=(image_size, image_size),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)


    print("saving bottleneck for training")
    predict_size_train = int(math.ceil(nb_train_samples / batch_size))
    bottleneck_features_train = model.predict_generator(generator, predict_size_train)
    np.save('bottleneck_features_train.npy', bottleneck_features_train)

    print("saving bottleneck for validation")
    generator = datagen.flow_from_directory(
        input_path+validation_data_dir,
        target_size=(image_size, image_size),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)

    predict_size_validation = int(math.ceil(nb_validation_samples / batch_size))

    bottleneck_features_validation = model.predict_generator(generator, predict_size_validation)

    np.save('bottleneck_features_validation.npy', bottleneck_features_validation)
    print("saved bottleneck for validation")

In [1]:
from keras import applications
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.models import Model
from keras import optimizers
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [95]:
save_bottleneck_features()

Found 7600 images belonging to 2 classes.
saving bottleneck for training


KeyboardInterrupt: 

In [2]:
def train_top_model():
    train_data = np.load('bottleneck_features_train.npy')

    validation_data = np.load('bottleneck_features_validation.npy')

    print(train_data.shape)
    print(validation_data.shape)

    train_labels = np.array(
        [1] * nb_train_female + [0] * nb_train_male)

    validation_labels = np.array(
        [1] * nb_validation_female + [0] * nb_validation_male)

    model = Sequential()
    model.add(Flatten(input_shape=train_data.shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy', metrics=['accuracy'])

    history = model.fit(train_data, train_labels,
              epochs=epochs,
              batch_size=batch_size,
              validation_data=(validation_data, validation_labels))
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    model.save_weights(input_path+top_model_weights_path)
    return model

In [73]:
train_top_model()

(7600, 2, 2, 512)
(950, 2, 2, 512)
Train on 7600 samples, validate on 950 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.models.Sequential at 0x7f0cefe87390>

In [84]:
def finetune_model():
    # build the VGG16 network
    base_model = applications.VGG16(weights='imagenet', include_top=False, input_shape=(64, 64, 3))
    print('Model loaded.')

    # build a classifier model to put on top of the convolutional model
    top_model = Sequential()
    top_model.add(Flatten(input_shape=base_model.output_shape[1:]))
    top_model.add(Dense(256, activation='relu'))
    top_model.add(Dropout(0.5))
    top_model.add(Dense(1, activation='sigmoid'))

    # note that it is necessary to start with a fully-trained
    # classifier, including the top classifier,
    # in order to successfully do fine-tuning
    top_model.load_weights(input_path+top_model_weights_path)

    # add the model on top of the convolutional base
    model = Model(input=base_model.input, output=top_model(base_model.output))

    print(model.summary())

    # set the first 25 layers (up to the last conv block)
    # to non-trainable (weights will not be updated)
    for layer in model.layers[:25]:
        layer.trainable = False

    # compile the model with a SGD/momentum optimizer
    # and a very slow learning rate.
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
                  metrics=['accuracy'])

    # prepare data augmentation configuration
    train_datagen = ImageDataGenerator(
        rescale=1. / 255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

    test_datagen = ImageDataGenerator(rescale=1. / 255)

    train_generator = train_datagen.flow_from_directory(
        input_path+train_data_dir,
        target_size=(image_size, image_size),
        batch_size=batch_size,
        class_mode='binary')

    validation_generator = test_datagen.flow_from_directory(
        input_path+validation_data_dir,
        target_size=(image_size, image_size),
        batch_size=batch_size,
        class_mode='binary')

    # fine-tune the model
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=nb_train_samples,
        epochs=epochs,
        validation_data=validation_generator,
        validation_steps=nb_validation_samples)
    
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()


    return model


In [85]:
model = finetune_model()

Model loaded.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 64, 64, 3)         0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 64, 64, 64)        1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 64, 64, 64)        36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 32, 32, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 32, 32, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 32, 32, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 16, 16, 128)       0      



Found 950 images belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 