In [1]:
'''
This code extracts NN features from face image data
and then fits a linear model to predict the social attributes of a face.
Available datasets: TWIN, CHICAGO, MIT, zhihu and funnyFace
Available NN features: 'caffeNet', 'vgg16', 'vggFace' and 'faceSNN'

BY Linjie Li
Please run this code on guru2/neon server
'''
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os

from sklearn.decomposition import PCA as sklearnPCA

# --- Select the dataset and collect the paths of its images --- #
Dataset = 'mit' # 'twin', 'chicago', 'mit' or 'funnyFace'; any other value (e.g. 'zhihu') selects the zhihu folder

# Image root folder and required file-name suffix for each named dataset.
datasetSources = {
    'twin': ('../../processing/imageProcessing/paddedImages/', '.png'),
    'chicago': ('../../ChicagoFaceDataset/CFD Version 2.0/CFD 2.0 Images/', 'N.jpg'),
    'mit': ('../../MIT2kFaceDataset/2kfaces/', '.jpg'),
    'funnyFace': ('../funnyFace/', '.png'),
}
# Names missing from the table fall back to the zhihu folder.
imPath, ext = datasetSources.get(Dataset, ('../../../zhihu/', '.jpg'))

# Walk imPath recursively and keep every file whose name ends with `ext`,
# sorted by full path so that the image order is stable between runs.
imList = sorted(
    os.path.join(dirpath, filename)
    for dirpath, dirnames, filenames in os.walk(imPath)
    for filename in filenames
    if filename.endswith(ext)
)
print(len(imList))
#print imPath

2222


In [2]:
# Make sure that caffe is on the python path:
homePath = '/raid/linjieli/'
caffe_root = homePath+'caffe/'
pretrained_model_root = homePath+'caffe/'

# run this line one time only!
import sys
caffePython = pretrained_model_root + 'python'
if caffePython not in sys.path:
    sys.path.insert(0, caffePython)


import caffe
# Load mean
mu = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
mu = mu.mean(1).mean(1)  # average over pixels to obtain the mean (BGR) pixel values
print 'mean-subtracted values:', zip('BGR', mu)

# Load the trained net
MODEL = 'vgg16' #'caffeNet','vgg16','vggFace' or 'faceSNN'

saveFigPath = '../Result/'+Dataset+'/'+MODEL
if not os.path.exists(saveFigPath):
    os.makedirs(saveFigPath)
    
if MODEL == 'vgg16':
    MODEL_FILE = caffe_root +'models/VGG16/VGG_ILSVRC_16_layers_deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/VGG16/VGG_ILSVRC_16_layers.caffemodel'
elif MODEL == 'caffeNet':
    MODEL_FILE = caffe_root + 'models/bvlc_reference_caffenet/deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'
elif MODEL == 'vggFace':
    MODEL_FILE = caffe_root + 'models/VGGFACE/VGG_CNN_F_deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/VGGFACE/VGG_CNN_F.caffemodel'
    MEAN_FILE = caffe_root + 'models/VGGFACE/VGG_mean.binaryproto'
else:
    MODEL = 'faceSNN'
    MODEL_FILE = caffe_root +'models/sraonet/siamese_lecun_deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/sraonet/snapshots/sraonet_lecun_gd_sub2_iter_100000.caffemodel'
    
caffe.set_device(1)
caffe.set_mode_gpu()
if not os.path.isfile(PRETRAINED_FILE):
    print("No caffemodel!!!")
elif not os.path.isfile(MODEL_FILE):
    print("No MODEL !!!")
else:
    print "Defining the net!"
    net = caffe.Net(MODEL_FILE,
                PRETRAINED_FILE,
                caffe.TEST)
# input preprocessing: 'data' is the name of the input blob == net.inputs[0]
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
if MODEL != 'faceSNN':
    # subtract the dataset-mean value in each channel
    transformer.set_mean('data', mu)
for layer_name, param in net.params.iteritems():
    print layer_name + '\t' + str(param[0].data.shape), str(param[1].data.shape)
transformer.set_transpose('data', (2,0,1))
# the reference model operates on images in [0,255] range instead of [0,1]
transformer.set_raw_scale('data', 255) 
# the reference model has channels in BGR order instead of RGB
transformer.set_channel_swap('data', (2,1,0))

mean-subtracted values: [('B', 104.0069879317889), ('G', 116.66876761696767), ('R', 122.6789143406786)]
Defining the net!
conv1_1	(64, 3, 3, 3) (64,)
conv1_2	(64, 64, 3, 3) (64,)
conv2_1	(128, 64, 3, 3) (128,)
conv2_2	(128, 128, 3, 3) (128,)
conv3_1	(256, 128, 3, 3) (256,)
conv3_2	(256, 256, 3, 3) (256,)
conv3_3	(256, 256, 3, 3) (256,)
conv4_1	(512, 256, 3, 3) (512,)
conv4_2	(512, 512, 3, 3) (512,)
conv4_3	(512, 512, 3, 3) (512,)
conv5_1	(512, 512, 3, 3) (512,)
conv5_2	(512, 512, 3, 3) (512,)
conv5_3	(512, 512, 3, 3) (512,)
fc6	(4096, 25088) (4096,)
fc7	(4096, 4096) (4096,)
fc8	(1000, 4096) (1000,)


In [3]:
# read in image list 
def readFile(fName):
    """Return the content of the text file `fName` as a list of lines.

    The text is split on the newline character only, so a file that ends with
    a newline yields a trailing empty string and an empty file yields [''].

    Parameters
    ----------
    fName : str
        Path of the file to read.

    Returns
    -------
    list of str
        The pieces of the file content between newline characters.

    Raises
    ------
    IOError
        If the file cannot be opened.
    """
    # `with` closes the handle even when read() raises, which a manual
    # open()/close() pair does not.
    with open(fName, "r") as text_file:
        return text_file.read().split('\n')

# Figure defaults for the rest of the notebook.
plt.rcParams.update({
    'figure.figsize': (10, 10),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Network input size and the layer whose activations are used as features.
if MODEL in ('vgg16', 'vggFace'):
    imgeReshape, featureLayer = [224,224], 'conv5_2'
elif MODEL == 'caffeNet':
    imgeReshape, featureLayer = [227,227], 'fc6'
else:
    imgeReshape, featureLayer = [56,46], 'fc6'

# Length of one flattened feature vector.
if 'fc' in featureLayer:
    # 'fc*' layers: length of the layer's second parameter array
    featureNum = net.params[featureLayer][1].data.shape[0]
else:
    # other layers: elements of the output blob divided by its first dimension
    layerBlob = net.blobs[featureLayer].data
    featureNum = layerBlob.size // layerBlob.shape[0]

# Buffer that the extraction loop fills in.
if Dataset == 'twin':
    # twin images are stored as 4 groups of len(imList)/4 slots each
    perImNum = len(imList) // 4
    features = np.zeros([4, perImNum, featureNum])
    img_type_num, img_type_index, img_type_list = {}, {}, {}
    type_index = 0
else:
    # one row per image, in imList order
    features = np.zeros([len(imList), featureNum])
print(featureNum)

100352


In [4]:
# Run every image through the network and store the flattened activations of
# `featureLayer` in `features`. totalNum counts the images processed.
totalNum = 0

# The input blob shape is identical for every image, so it is set once here
# instead of on every iteration.
net.blobs['data'].reshape(1,3,imgeReshape[0],imgeReshape[1])
for img in imList:
    imgName = os.path.basename(img)
    if not imgName.endswith(('.jpg','.png')):
        # not an image file: report the path and skip it
        print(img)
        continue
    input_image = caffe.io.load_image(img)
    net.blobs['data'].data[...] = transformer.preprocess('data', input_image)
    out = net.forward()
    feat = net.blobs[featureLayer].data
    if Dataset =='twin':
        # Characters 7..-4 of the file name are parsed as an integer id; the
        # id is split into a group (img_type) and a position inside the group.
        img_id = int(imgName[7:-4])
        img_type = img_id // perImNum
        img_index = img_id % perImNum
        if img_type not in img_type_num:
            # first image of this group: open its counter, path list and row block
            img_type_num[img_type] = 0
            img_type_list[img_type] = [None]*perImNum
            img_type_index[img_type] = type_index
            type_index +=1
        else:
            img_type_num[img_type] = img_type_num[img_type] + 1
        # Record the path for EVERY image of the group. Previously the first
        # image of each group was skipped and its slot stayed None.
        img_type_list[img_type][img_index] = img
        # Features are stored in arrival order within the group
        # (img_type_num), not at img_index.
        features[img_type_index[img_type],img_type_num[img_type]] = feat.flatten()
    else:
        # need to be further revised!
        features[totalNum] = feat.flatten()
    totalNum +=1
print(totalNum)
#print img_type_list


#print featureMat

2222


In [16]:
if Dataset == 'twin':
    featureMat = np.zeros((totalNum,featureNum))
    k = 0
    for i in range(features.shape[0]):
        for j in range(features[i].shape[0]):
            if sum(features[i,j,:])!=0:
                featureMat[k,:] = features[i,j,:]
                k +=1
else:
    featureMat = features
#split train test validation
import random
randomInd = range(featureMat.shape[0])
random.shuffle(randomInd)
print max(randomInd)
featureMat = featureMat[randomInd,:]
nSamples = featureMat.shape[0]
testRatio = 0.2
valiRatio = 0.1
testFeatures = featureMat[:int(nSamples*testRatio),:]
trainFeatures = featureMat[int(nSamples*testRatio):,:]
valiFeatures = featureMat[-int(trainFeatures.shape[0]*valiRatio):,:]
trainFeatures = trainFeatures[:-int(trainFeatures.shape[0]*valiRatio),:]
print valiFeatures.shape, trainFeatures.shape, testFeatures.shape

2221
(177, 100352) (1601, 100352) (444, 100352)


In [25]:
radomImlist = np.asarray(imList)[randomInd]
testIm = radomImlist[:int(nSamples*testRatio)]
trainIm = radomImlist[int(nSamples*testRatio):]
print testIm.shape, trainIm.shape

(444,) (1778,)


In [17]:
# n_components handed to PCA: 0.95 for every model except faceSNN, which passes
# featureNum instead. The message printed below words the value as a retained
# variance, which only fits the 0.95 case.
explained_variance = featureNum if MODEL == 'faceSNN' else 0.95

# Fit the whitened PCA on the training split only and project that split.
sklearn_pca = sklearnPCA(whiten=True, n_components=explained_variance)
trainfeature_transf = sklearn_pca.fit_transform(trainFeatures)
pcaReport = 'The number of PCs needed to retain %.3f variance is %d.'
print(pcaReport % (explained_variance, trainfeature_transf.shape[1]))

The number of PCs needed to retain 0.950 variance is 943.


In [27]:
# Shapes of the fitted PCA basis and of the projected training split.
print(sklearn_pca.components_.shape)
print(trainfeature_transf.shape)
# Project the held-out splits with the PCA fitted on the training split.
testfeature_tansf = sklearn_pca.transform(testFeatures)
print(testfeature_tansf.shape)
valifeature_transf = sklearn_pca.transform(valiFeatures)

# Shared start of every CSV name written by this cell.
csvPrefix = homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset
for csvSuffix, csvMatrix in (('other_trainFeatures.csv', trainfeature_transf),
                             ('other_testFeatures.csv', testfeature_tansf),
                             ('other_valiFeatures.csv', valifeature_transf)):
    np.savetxt(csvPrefix+csvSuffix, csvMatrix, delimiter=',')

# Export the projection as a weight matrix plus a bias vector. The bias is the
# per-component mean of the raw (un-centred) training features projected onto
# the components.
# NOTE(review): whether components_ already includes the whitening scale
# depends on the scikit-learn version in use - confirm before relying on it.
whiten_component = sklearn_pca.components_
newMat = np.dot(trainFeatures, whiten_component.T)
bias = newMat.mean(axis=0)
newNewMat = newMat - bias
np.savetxt(csvPrefix+'other_weights.csv', whiten_component, delimiter=',')
np.savetxt(csvPrefix+'other_biases.csv', bias, delimiter=',')

(943, 100352)
(1601, 943)
(444, 943)


In [49]:
import pandas as pd
# Load the social-rating table and keep only the rated attributes.
# NOTE(review): the path is fixed to the 'mit' results regardless of `Dataset`.
socialMeasuresPath = '../Result/mit/socialMeasures.csv'
# The first CSV column is used as the row index.
socialMeasures = pd.read_csv(socialMeasuresPath,index_col = 0)
socialAttr = socialMeasures.columns.tolist()
# Columns that are excluded from the attribute list.
delElement = ['subage.1', 'submale.1', 'subrace.1','subage', 'submale',\
              'subrace','catch', 'catchAns','catch.1','catchAns.1']
social2Attr = [x for x in socialAttr if x not in delElement]
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# both return the same ndarray on versions that still have as_matrix.
socialMeasuresClean = socialMeasures.loc[:,social2Attr].values
np.savetxt('../Result/mit/socialMeasuresClean.csv', socialMeasuresClean, delimiter=',')
def writeFile(imList, rating, fName, rootPath=None):
    """Write one "<image path> <rating>" line per image to the text file `fName`.

    Each written path is `rootPath` followed by the image path with its first
    six characters removed (the length of a leading '../../').

    Parameters
    ----------
    imList : sequence of str
        Image paths; a list or a 1-D array both work.
    rating : sequence
        One rating per image, indexed like `imList`; written with str().
    fName : str
        Output file; it is created or overwritten.
    rootPath : str, optional
        Prefix put in front of every stripped path. When omitted it is
        homePath + 'attractiveness_datamining/'.

    Raises
    ------
    IndexError
        If `rating` has fewer entries than `imList`.
    """
    if rootPath is None:
        rootPath = homePath+'attractiveness_datamining/'
    # `with` closes the file even if a write or a rating lookup fails.
    with open(fName, "w") as text_file:
        for i in range(len(imList)):
            text_file.write(rootPath+imList[i][6:]+' '+str(rating[i])+'\n')
attr= social2Attr[9]
mean_rating = socialMeasures.loc[:,attr].tolist()
mean_rating = map(float, mean_rating)
mean_rating = np.array(mean_rating)
radomRating = mean_rating[randomInd]
testRating = radomRating[:int(nSamples*testRatio)]
trainRating = radomRating[int(nSamples*testRatio):]
valiRating = radomRating[-int(trainRating.shape[0]*valiRatio):]
trainSubRating = trainRating[:-int(trainRating.shape[0]*valiRatio)]
writeFile(trainIm,trainRating,'../list/'+attr+'_train.txt')
writeFile(testIm,testRating,'../list/'+attr+'_test.txt')
print trainSubRating.shape,testRating.shape,valiRating.shape
# print radomImlist[10],radomRating[10]
print imList[randomInd[10]],mean_rating[randomInd[10]]
# print socialMeasures
# socialMeasures = pd.read_csv(socialMeasures,index_col = 0)

(1601,) (444,) (177,)
../../MIT2kFaceDataset/2kfaces/Google_1_Gerald Sloan_5_oval.jpg 5.133333


In [50]:
modelList = []
optFeaNumList = []
import sys
#print sys.path
# local
# PkgPath = '/Users/Olivialinlin/Documents/Github/attractiveness_datamining/linjieCode/code'
# server
PkgPath = homePath+'attractiveness_datamining/linjieCode/code'

if PkgPath not in sys.path:
    sys.path.insert(0, PkgPath)
#print sys.path
from xVal_train_test import Train_Test

import sklearn
import numpy as np
for attr in social2Attr:
    print attr
    #attr= social2Attr[9]
    mean_rating = socialMeasures.loc[:,attr].tolist()
    mean_rating = map(float, mean_rating)
    mean_rating = np.array(mean_rating)
    radomRating = mean_rating[randomInd]
    testRating = radomRating[:int(nSamples*testRatio)]
    trainRating = radomRating[int(nSamples*testRatio):]
    valiRating = radomRating[-int(trainRating.shape[0]*valiRatio):]
    trainSubRating = trainRating[:-int(trainRating.shape[0]*valiRatio)]
    writeFile(trainIm,trainRating,'../list/'+attr+'_train.txt')
    writeFile(testIm,testRating,'../list/'+attr+'_test.txt')
    baseLine = mean_rating.mean()
    print 'mean rating: ', baseLine

    predictionModel = sklearn.linear_model.RidgeCV(alphas=np.logspace(-3,2,num=20), fit_intercept=True)
    myModel,optFeaNum = Train_Test(trainSubRating,testRating,valiRating,trainfeature_transf, \
                                   testfeature_tansf,valifeature_transf,xVal = True,numTrain = 1,\
                                   pModel = predictionModel,getMaxMin = False,MODEL= MODEL, \
                                   plotPredActual = False,returnModel = True)
    modelList.append(myModel)
    optFeaNumList.append(optFeaNum)

atypical
mean rating:  4.08832261206
Correlation:  0.491820897991
num of features:  178
Spearman Correlation:  0.481925443067
num of features:  526
R^2 score:  0.238020620489
num of features:  178
MSE:  0.386662646019
num of features:  178
**************************Result of train and test**************************************
number of features: 178
On test set:
Residual sum of squares: 0.38
Variance score is: 0.20
Correlation between predicted ratings and actual ratings is: 0.4524
Spearman Correlation between predicted ratings and actual ratings is: 0.4643
 
On training set:
Residual sum of squares: 0.32
Variance score is: 0.30
Correlation between predicted ratings and actual ratings is: 0.5468
Spearman Correlation between predicted ratings and actual ratings is: 0.5233
****************************************************************************************
boring
mean rating:  4.29054390414
Correlation:  0.584703553185
num of features:  84
Spearman Correlation:  0.568280633942
num o

In [51]:
# Persist the per-attribute feature counts (one integer per line), then show them.
optNumPath = homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+'optNumF.csv'
np.savetxt(optNumPath, np.array(optFeaNumList), fmt='%d', delimiter=',')
print(optFeaNumList)

[178, 84, 134, 155, 114, 258, 223, 99, 64, 123, 308, 146, 128, 120, 443, 323, 64, 178, 517, 123, 405, 393, 514, 243, 146, 81, 226, 246, 617, 181, 40, 532, 278, 78, 514, 37, 387, 67, 326, 402]


In [34]:
# Predict every attribute for every image and correlate the predictions.
#
# `feature_transf` was not defined by any cell of this notebook, so this cell
# only ran on leftover kernel state. It is computed here as the PCA scores of
# all images in their original (un-shuffled) order: `features` keeps that
# order, and `sklearn_pca` is the projection the models were trained with.
# NOTE(review): assumes `features` is the 2-D matrix of a non-'twin' dataset.
feature_transf = sklearn_pca.transform(features)

predictRatingAll = np.zeros(socialMeasuresClean.shape)
for i, m in enumerate(modelList):
    # each model only sees the leading `num` PCA features chosen for it
    num = optFeaNumList[i]
    featureOpt = feature_transf[:,:num]
    predictRatingAll[:,i] = m.predict(featureOpt)
# NOTE(review): this output folder differs from the homePath+'/features/'
# folder used by the other cells - confirm it is intended.
np.savetxt('/home/lli-ms/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           '_predict_ratings.csv', predictRatingAll, delimiter=',')
# Attribute-by-attribute correlation of the predicted ratings.
correlationAll = np.corrcoef(predictRatingAll.T)
print(social2Attr)
np.savetxt('predictCorrelation.csv', correlationAll, delimiter=',')

['atypical', 'boring', 'calm', 'cold', 'common', 'confident', 'egotistic', 'emotUnstable', 'forgettable', 'intelligent', 'introverted', 'kind', 'responsible', 'trustworthy', 'unattractive', 'unemotional', 'unfamiliar', 'unfriendly', 'unhappy', 'weird', 'aggressive', 'attractive', 'caring', 'emotStable', 'emotional', 'familiar', 'friendly', 'happy', 'humble', 'interesting', 'irresponsible', 'mean', 'memorable', 'normal', 'sociable', 'typical', 'uncertain', 'uncommon', 'unintelligent', 'untrustworthy']


In [35]:
%matplotlib inline
from plotFunc import plotHeatMap
import pandas as pd
# Load a stored correlation table; the first CSV column is used as the row index.
p = pd.read_csv('./correlation_array', index_col = 0)
column_name = p.columns.tolist()
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# both return the same ndarray on versions that still have as_matrix.
data = p.values
# Check that the table's columns are the attributes, in the same order.
print(column_name == social2Attr)
# print column_name
# plotHeatMap(data,clusterNum= 15,xTickLabel=column_name,\
#             colorMapName='coolwarm',figName = '',fSize = 3.5\
#             ,dendro = False)

True


In [49]:

# Sum of squared deviations of every entry of `data` from the overall mean.
np.square(data - data.mean()).sum()

513.88521410103363

In [52]:
# Sum of all entries of the product of the mean-centred `data` with its transpose.
(data - data.mean()).dot((data - data.mean()).T).sum()

8.6422029834416172

In [53]:
# Standard deviation over all entries of `data`.
np.std(data)

0.566725911542031