In [5]:
'''
This notebook extracts neural-network (NN) features from face images
and then fits a linear model to predict the social attributes of a face.
Available datasets: TWIN, CHICAGO and MIT
Available NN features: 'caffeNet', 'vgg16', 'vggFace' and 'faceSNN'

BY Linjie Li
Please run this code on guru2/neon server
'''
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd

from sklearn.decomposition import PCA as sklearnPCA

# Load image dataset#
Dataset = 'mit' # 'twin', 'chicago' or 'mit', 'zhihu' or 'funnyFace'
if Dataset == 'mit':
    imPath = '../../MIT2kFaceDataset/2kfaces/'
    ext = '.jpg'
imList = []
for dirpath, dirnames, filenames in os.walk(imPath):
    for filename in [f for f in filenames if f.endswith(ext)]:
        imList.append(os.path.join(dirpath, filename))
imList.sort()
print len(imList)
#print imPath
df = pd.read_csv('../../MIT2kFaceDataset/clean_data/geometric_wSmoothness.csv',index_col = 0)

print len(df.index)
imList = [imList[i] for i in range(len(imList)) if imList[i][31:] in df.index]
remainIndex = [i for i in range(len(imList)) if imList[i][31:] in df.index]
print len(remainIndex)
landMarks = pd.read_csv('../../processing/landmarking/mit/mitLandmarks.csv',index_col = 0)
imageList = list(set(landMarks.index.tolist()))
imageList.sort()
config_feature = df.as_matrix()
print config_feature.shape

2222
2207
2207
(2207, 37)


In [6]:
# Landmark coordinates: one row per image, one column per landmark (68 each).
X_LMmatrix = np.zeros((len(imList),68))
Y_LMmatrix = np.zeros((len(imList),68))
# imageList is sorted and duplicate-free, so the enumerate position is the
# same row index that imageList.index(im) used to look up on every iteration.
# NOTE(review): row k is filled from imageList[k] while the matrices are sized
# by imList - this assumes both lists describe the same images in the same
# order; confirm.
for ind_im, im in enumerate(imageList):
    imLandmarks = landMarks.loc[im]
    X_LMmatrix[ind_im,:] = imLandmarks.x.tolist()
    Y_LMmatrix[ind_im,:] = imLandmarks.y.tolist()
def computeDist(X_LMmatrix,Y_LMmatrix, imInd):
    """Pairwise Euclidean distances between the landmarks of one image.

    Parameters
    ----------
    X_LMmatrix, Y_LMmatrix : 2-D arrays of identical shape
        x and y landmark coordinates; row ``imInd`` is the image of interest,
        each column is one landmark.
    imInd : int
        Row (image) index.

    Returns
    -------
    numpy.ndarray of shape (1, P*(P-1)//2), where P is the number of columns.
        Distances for every landmark pair (i, j) with i < j, ordered as two
        nested loops over i and then j > i would produce them.
    """
    pointNum = X_LMmatrix.shape[1]
    # Floor division keeps the pair count an int under true division too;
    # with '/' np.zeros receives a float shape and raises.
    distArr = np.zeros((1, pointNum*(pointNum-1)//2))
    # Upper-triangle index pairs (i < j), enumerated row by row.
    first, second = np.triu_indices(pointNum, k=1)
    xs = np.asarray(X_LMmatrix)[imInd, :].astype(float)
    ys = np.asarray(Y_LMmatrix)[imInd, :].astype(float)
    distArr[0, :] = np.hypot(xs[first] - xs[second], ys[first] - ys[second])
    return distArr
import sys
def computeCosine(X_LMmatrix,Y_LMmatrix, imInd):
    """Cosine of the slope angle of the line through every landmark pair.

    For each pair (i, j) with i < j the value is cos(arctan(dy / dx)), where
    dx and dy are the coordinate differences between landmark i and j.
    Because arctan returns an angle in (-pi/2, pi/2) the result is never
    negative, i.e. the sign of the slope is not preserved. Pairs with dx == 0
    (vertical line) get 0.

    Parameters
    ----------
    X_LMmatrix, Y_LMmatrix : 2-D arrays of identical shape
        x and y landmark coordinates; row ``imInd`` is the image of interest,
        each column is one landmark.
    imInd : int
        Row (image) index.

    Returns
    -------
    numpy.ndarray of shape (1, P*(P-1)//2), where P is the number of columns,
    ordered as two nested loops over i and then j > i would produce it.
    """
    pointNum = X_LMmatrix.shape[1]
    # Floor division keeps the pair count an int under true division too;
    # with '/' np.zeros receives a float shape and raises.
    cosineArr = np.zeros((1, pointNum*(pointNum-1)//2))
    # Upper-triangle index pairs (i < j), enumerated row by row.
    first, second = np.triu_indices(pointNum, k=1)
    xs = np.asarray(X_LMmatrix)[imInd, :].astype(float)
    ys = np.asarray(Y_LMmatrix)[imInd, :].astype(float)
    dx = xs[first] - xs[second]
    dy = ys[first] - ys[second]
    # Vertical pairs keep the initial 0; only the others are computed, which
    # also avoids any division by zero.
    nonVertical = dx != 0
    cosineArr[0, nonVertical] = np.cos(np.arctan(dy[nonVertical] / dx[nonVertical]))
    return cosineArr


# Per-image pairwise landmark features: one row per image, one column per
# landmark pair.
nImages = len(imList)
nLandmarks = X_LMmatrix.shape[1]
nPairs = nLandmarks*(nLandmarks-1)//2
distMat = np.zeros((nImages, nPairs))
slopeMat = np.zeros((nImages, nPairs))
for imInd in range(nImages):
    distArr, slopeArr = (computeDist(X_LMmatrix,Y_LMmatrix, imInd),
                         computeCosine(X_LMmatrix,Y_LMmatrix, imInd))
    distMat[imInd,:], slopeMat[imInd,:] = distArr, slopeArr

# Column layout of the final matrix: [config_feature | slopes | distances].
distSlope = np.hstack((slopeMat, distMat))
allGeometric = np.hstack((config_feature, distSlope))
#only_w_pixel = np.concatenate((meanHSV,smoothness,distSlope),axis = 1)

In [5]:
# Run configuration for the geometric-feature experiment.
Dataset, MODEL = 'mit', 'geometric'
homePath = '/home/lli-ms/'
# Dump the full (unreduced) geometric feature matrix. homePath already ends
# with '/', so the resulting path contains '//'.
totalFeaturesCsv = '%s/features/%s_geometric_%s_totalFeatures.csv' % (homePath, MODEL, Dataset)
np.savetxt(totalFeaturesCsv, allGeometric, delimiter=',')

In [79]:
import random
featureMat = allGeometric
randomInd = range(featureMat.shape[0])
random.shuffle(randomInd)
print max(randomInd)
featureMat = featureMat[randomInd,:]
nSamples = featureMat.shape[0]
testRatio = 0.2
valiRatio = 0.1
testFeatures = featureMat[:int(nSamples*testRatio),:]
trainFeatures = featureMat[int(nSamples*testRatio):,:]
valiFeatures = featureMat[-int(trainFeatures.shape[0]*valiRatio):,:]
trainFeatures = trainFeatures[:-int(trainFeatures.shape[0]*valiRatio),:]
print valiFeatures.shape, trainFeatures.shape, testFeatures.shape
radomImlist = np.asarray(imList)[randomInd]

explained_variance = 1000
sklearn_pca = sklearnPCA(n_components=explained_variance, whiten  = True)
trainfeature_transf = sklearn_pca.fit_transform(trainFeatures)
print 'The number of PCs needed to retain %.3f variance is %d.' \
      % (explained_variance, trainfeature_transf.shape[1])
print sklearn_pca.components_.shape
print trainfeature_transf.shape
testfeature_tansf = sklearn_pca.transform(testFeatures)
print testfeature_tansf.shape
valifeature_transf = sklearn_pca.transform(valiFeatures)
#component_std = np.std(sklearn_pca.components_,axis = 0,dtype=np.float32)
# import scipy
# u,s,v = scipy.sparse.linalg.svds(featureMat, k =sklearn_pca.components_.shape[0], which = 'LM')

np.savetxt(homePath+'/features/'+MODEL+'_geometric_'+Dataset+\
           '_trainFeatures.csv', trainfeature_transf, delimiter=',')
np.savetxt(homePath+'/features/'+MODEL+'_geometric_'+Dataset+\
           '_testFeatures.csv', testfeature_tansf, delimiter=',')
np.savetxt(homePath+'/features/'+MODEL+'_geometric_'+Dataset+\
           '_valiFeatures.csv', valifeature_transf, delimiter=',')

2206
(176, 4593) (1590, 4593) (441, 4593)
The number of PCs needed to retain 1000.000 variance is 1000.
(1000, 4593)
(1590, 1000)
(441, 1000)


In [80]:
import pandas as pd
socialMeasures = '../Result/mit/socialMeasures.csv'
socialMeasures = pd.read_csv(socialMeasures,index_col = 0)
socialAttr = socialMeasures.columns.tolist()
delElement = ['subage.1', 'submale.1', 'subrace.1','subage', 'submale',\
              'subrace','catch', 'catchAns','catch.1','catchAns.1']
social2Attr = [x for x in socialAttr if x not in delElement]
socialMeasuresClean = socialMeasures.loc[:,social2Attr].as_matrix()
#print socialMeasuresClean
np.savetxt('../Result/mit/socialMeasuresClean.csv', socialMeasuresClean, delimiter=',')

attr= social2Attr[9]
mean_rating = [socialMeasures.loc[imList[i][31:],attr] for i in range(len(imList))\
               if imList[i][31:] in socialMeasures.index]
mean_rating = map(float, mean_rating)
mean_rating = np.array(mean_rating)

radomRating = mean_rating[randomInd]
testRating = radomRating[:int(nSamples*testRatio)]
trainRating = radomRating[int(nSamples*testRatio):]
valiRating = radomRating[-int(trainRating.shape[0]*valiRatio):]
trainSubRating = trainRating[:-int(trainRating.shape[0]*valiRatio)]

print trainSubRating.shape,testRating.shape,valiRating.shape
print randomInd.index(1)

print radomImlist[ randomInd.index(randI)],radomRating[ randomInd.index(randI)]
print imList[randomInd[ randomInd.index(randI)]],mean_rating[randomInd[ randomInd.index(randI)]]
print socialMeasures.loc[imList[randomInd[ randomInd.index(randI)]][31:],attr]

(1590,) (441,) (176,)
1126
../../MIT2kFaceDataset/2kfaces/Google_1_Delores Friel_1_oval.jpg 5.733333
../../MIT2kFaceDataset/2kfaces/Google_1_Delores Friel_1_oval.jpg 5.733333
5.733333


In [81]:
modelList = []
optFeaNumList = []
import sys
#print sys.path
# local
# PkgPath = '/Users/Olivialinlin/Documents/Github/attractiveness_datamining/linjieCode/code'
# server

PkgPath = homePath+'attractiveness_datamining/linjieCode/code'

if PkgPath not in sys.path:
    sys.path.insert(0, PkgPath)
#print sys.path
from xVal_train_test import Train_Test

import sklearn
import numpy as np
for attr in social2Attr:
    print attr
    #attr= social2Attr[9]
    mean_rating = [socialMeasures.loc[imList[i][31:],attr] for i in range(len(imList))\
               if imList[i][31:] in socialMeasures.index]
    mean_rating = map(float, mean_rating)
    mean_rating = np.array(mean_rating)
    radomRating = mean_rating[randomInd]
    testRating = radomRating[:int(nSamples*testRatio)]
    trainRating = radomRating[int(nSamples*testRatio):]
    valiRating = radomRating[-int(trainRating.shape[0]*valiRatio):]
    trainSubRating = trainRating[:-int(trainRating.shape[0]*valiRatio)]

    baseLine = mean_rating.mean()
    print 'mean rating: ', baseLine

    predictionModel = sklearn.linear_model.RidgeCV(alphas=np.logspace(-3,2,num=20), fit_intercept=True)
    myModel,optFeaNum = Train_Test(trainSubRating,testRating,valiRating,trainfeature_transf, \
                                   testfeature_tansf,valifeature_transf,xVal = True,numTrain = 1,\
                                   pModel = predictionModel,getMaxMin = False,MODEL= MODEL, \
                                   plotPredActual = False,returnModel = True)
    modelList.append(myModel)
    optFeaNumList.append(optFeaNum)

atypical
mean rating:  4.08446414454
Correlation:  0.428337524808
num of features:  28
Spearman Correlation:  0.419444478825
num of features:  28
R^2 score:  0.180219387586
num of features:  28
MSE:  0.405125875821
num of features:  28
**************************Result of train and test**************************************
number of features: 28
On test set:
Residual sum of squares: 0.36
Variance score is: 0.17
Correlation between predicted ratings and actual ratings is: 0.4154
Spearman Correlation between predicted ratings and actual ratings is: 0.4130
 
On training set:
Residual sum of squares: 0.38
Variance score is: 0.18
Correlation between predicted ratings and actual ratings is: 0.4281
Spearman Correlation between predicted ratings and actual ratings is: 0.4072
****************************************************************************************
boring
mean rating:  4.28997636656
Correlation:  0.629427329594
num of features:  173
Spearman Correlation:  0.587568111242
num of fe

In [2]:
# Make sure that caffe is on the python path:
#homePath = '/raid/linjieli/'
homePath = '/home/lli-ms/'
caffe_root = homePath+'caffe/'
pretrained_model_root = homePath+'caffe/'

# run this line one time only!
import sys
caffePython = pretrained_model_root + 'python'
if caffePython not in sys.path:
    sys.path.insert(0, caffePython)


import caffe
# Load mean
mu = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
mu = mu.mean(1).mean(1)  # average over pixels to obtain the mean (BGR) pixel values
print 'mean-subtracted values:', zip('BGR', mu)

# Load the trained net
MODEL = 'vgg16' #'caffeNet','vgg16','vggFace' or 'faceSNN'

saveFigPath = '../Result/'+Dataset+'/'+MODEL
if not os.path.exists(saveFigPath):
    os.makedirs(saveFigPath)
    
if MODEL == 'vgg16':
    MODEL_FILE = caffe_root +'models/VGG16/VGG_ILSVRC_16_layers_deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/VGG16/VGG_ILSVRC_16_layers.caffemodel'
elif MODEL == 'caffeNet':
    MODEL_FILE = caffe_root + 'models/bvlc_reference_caffenet/deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'
elif MODEL == 'vggFace':
    MODEL_FILE = caffe_root + 'models/VGGFACE/VGG_CNN_F_deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/VGGFACE/VGG_CNN_F.caffemodel'
    MEAN_FILE = caffe_root + 'models/VGGFACE/VGG_mean.binaryproto'
else:
    MODEL = 'faceSNN'
    MODEL_FILE = caffe_root +'models/sraonet/siamese_lecun_deploy.prototxt'
    PRETRAINED_FILE = caffe_root + 'models/sraonet/snapshots/sraonet_lecun_gd_sub2_iter_100000.caffemodel'
    
caffe.set_device(1)
caffe.set_mode_gpu()
if not os.path.isfile(PRETRAINED_FILE):
    print("No caffemodel!!!")
elif not os.path.isfile(MODEL_FILE):
    print("No MODEL !!!")
else:
    print "Defining the net!"
    net = caffe.Net(MODEL_FILE,
                PRETRAINED_FILE,
                caffe.TEST)
# input preprocessing: 'data' is the name of the input blob == net.inputs[0]
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
if MODEL != 'faceSNN':
    # subtract the dataset-mean value in each channel
    transformer.set_mean('data', mu)
for layer_name, param in net.params.iteritems():
    print layer_name + '\t' + str(param[0].data.shape), str(param[1].data.shape)
transformer.set_transpose('data', (2,0,1))
# the reference model operates on images in [0,255] range instead of [0,1]
transformer.set_raw_scale('data', 255) 
# the reference model has channels in BGR order instead of RGB
transformer.set_channel_swap('data', (2,1,0))

mean-subtracted values: [('B', 104.0069879317889), ('G', 116.66876761696767), ('R', 122.6789143406786)]
Defining the net!
conv1_1	(64, 3, 3, 3) (64,)
conv1_2	(64, 64, 3, 3) (64,)
conv2_1	(128, 64, 3, 3) (128,)
conv2_2	(128, 128, 3, 3) (128,)
conv3_1	(256, 128, 3, 3) (256,)
conv3_2	(256, 256, 3, 3) (256,)
conv3_3	(256, 256, 3, 3) (256,)
conv4_1	(512, 256, 3, 3) (512,)
conv4_2	(512, 512, 3, 3) (512,)
conv4_3	(512, 512, 3, 3) (512,)
conv5_1	(512, 512, 3, 3) (512,)
conv5_2	(512, 512, 3, 3) (512,)
conv5_3	(512, 512, 3, 3) (512,)
fc6	(4096, 25088) (4096,)
fc7	(4096, 4096) (4096,)
fc8	(1000, 4096) (1000,)


In [3]:

if MODEL == 'vgg16' or MODEL == 'vggFace':
    imgeReshape = [224,224]
    featureLayer = 'conv5_2' 
elif MODEL == 'caffeNet':
    imgeReshape = [227,227]
    featureLayer = 'fc6'
else:
    imgeReshape = [56,46]
    featureLayer = 'fc6'
if 'fc' in featureLayer:
    featureNum = net.params[featureLayer][1].data.shape[0]
else:
    featureNum = net.blobs[featureLayer].data.flatten().shape[0]/net.blobs[featureLayer].data.shape[0]
    
if Dataset == 'twin':
    features = np.zeros([4,len(imList)/4,featureNum])
    perImNum = len(imList)/4
    img_type_num = {}
    img_type_index = {}
    img_type_list = {}
    type_index = 0
else:
    features = np.zeros([len(imList),featureNum])
print featureNum

100352


In [4]:
totalNum = 0

# print len(imList)
for img in imList:
    imgName = os.path.basename(img)
    if imgName.endswith(('.jpg','.png')):
        input_image = caffe.io.load_image(img)
        net.blobs['data'].reshape(1,3,imgeReshape[0],imgeReshape[1])
        net.blobs['data'].data[...] = transformer.preprocess('data', input_image)
        out = net.forward()
        feat = net.blobs[featureLayer].data
        if Dataset =='twin':
            img_type = int(imgName[7:-4])/perImNum
            img_index = int(imgName[7:-4])%perImNum
            #print 'img_type:',img_type
            if img_type in img_type_num.keys():
                img_type_num[img_type] = img_type_num[img_type] + 1
                img_type_list[img_type][img_index] = img
            else:
                img_type_num[img_type] = 0
                img_type_list[img_type] = [None]*perImNum
                img_type_index[img_type] = type_index
                type_index +=1
            #print 'img_type_index:',img_type_index[img_type]
            features[img_type_index[img_type],img_type_num[img_type]] = feat.flatten()
        else:
            # need to be further revised!
            features[totalNum] = feat.flatten()
            #print features[totalNum]
        totalNum +=1
    else:
        print img
#print len(img_type_num)
print totalNum
#print img_type_list

if Dataset == 'twin':
    featureMat = np.zeros((totalNum,featureNum))
    k = 0
    for i in range(features.shape[0]):
        for j in range(features[i].shape[0]):
            if sum(features[i,j,:])!=0:
                featureMat[k,:] = features[i,j,:]
                k +=1
else:
    featureMat = features
np.savetxt(homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           'totalFeatures_2207.csv', featureMat, delimiter=',')

2207


In [None]:
#print featureMat
featureMat = featureMat[randomInd,:]
nSamples = featureMat.shape[0]
testRatio = 0.2
valiRatio = 0.1
testFeatures = featureMat[:int(nSamples*testRatio),:]
trainFeatures = featureMat[int(nSamples*testRatio):,:]
valiFeatures = featureMat[-int(trainFeatures.shape[0]*valiRatio):,:]
trainFeatures = trainFeatures[:-int(trainFeatures.shape[0]*valiRatio),:]
print valiFeatures.shape, trainFeatures.shape, testFeatures.shape
radomImlist = np.asarray(imList)[randomInd]
testIm = radomImlist[:int(nSamples*testRatio)]
trainIm = radomImlist[int(nSamples*testRatio):]
print testIm.shape, trainIm.shape

In [None]:
if MODEL != 'faceSNN':
    explained_variance = 0.95
else:
    explained_variance = featureNum

sklearn_pca = sklearnPCA(n_components=explained_variance, whiten  = True)
trainfeature_transf = sklearn_pca.fit_transform(trainFeatures)
print 'The number of PCs needed to retain %.3f variance is %d.' \
      % (explained_variance, trainfeature_transf.shape[1])

In [None]:
print sklearn_pca.components_.shape
print trainfeature_transf.shape
testfeature_tansf = sklearn_pca.transform(testFeatures)
print testfeature_tansf.shape
valifeature_transf = sklearn_pca.transform(valiFeatures)
#component_std = np.std(sklearn_pca.components_,axis = 0,dtype=np.float32)
# import scipy
# u,s,v = scipy.sparse.linalg.svds(featureMat, k =sklearn_pca.components_.shape[0], which = 'LM')
np.savetxt(homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           'other_trainFeatures.csv', trainfeature_transf, delimiter=',')
np.savetxt(homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           'other_testFeatures.csv', testfeature_tansf, delimiter=',')
np.savetxt(homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           'other_valiFeatures.csv', valifeature_transf, delimiter=',')
whiten_component = sklearn_pca.components_
#print whiten_component
newMat = trainFeatures.dot(whiten_component.T)
bias = np.mean(newMat,axis = 0)
# print bias
newNewMat = newMat - bias
np.savetxt(homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           'other_weights.csv', whiten_component, delimiter=',')
np.savetxt(homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           'other_biases.csv', bias, delimiter=',')

In [None]:
def writeFile(imList, rating,fName):
    """Write one '<image path> <rating>' line per image to ``fName``.

    Each image path has its first 6 characters removed (presumably the
    '../../' prefix used for image paths in this notebook - confirm) and is
    then prefixed with homePath + 'attractiveness_datamining/'. homePath is
    read from the notebook's global namespace.

    Parameters
    ----------
    imList : numpy array of str
        Image paths; ``imList.shape[0]`` lines are written.
    rating : indexable
        ``rating[i]`` is written next to ``imList[i]``; must have at least
        as many entries as ``imList``.
    fName : str
        Output file, overwritten if it exists.
    """
    # The context manager closes the file even if a write or an index lookup
    # raises part-way through.
    with open(fName, "w") as text_file:
        for i in range(imList.shape[0]):
            d = homePath+'attractiveness_datamining/'+imList[i][6:]+' '+str(rating[i])+'\n'
            text_file.write(d)
modelList = []
optFeaNumList = []
import sys
#print sys.path
# local
# PkgPath = '/Users/Olivialinlin/Documents/Github/attractiveness_datamining/linjieCode/code'
# server
PkgPath = homePath+'attractiveness_datamining/linjieCode/code'

if PkgPath not in sys.path:
    sys.path.insert(0, PkgPath)
#print sys.path
from xVal_train_test import Train_Test

import sklearn
import numpy as np
for attr in social2Attr:
    print attr
    #attr= social2Attr[9]
    mean_rating = socialMeasures.loc[:,attr].tolist()
    mean_rating = map(float, mean_rating)
    mean_rating = np.array(mean_rating)
    radomRating = mean_rating[randomInd]
    testRating = radomRating[:int(nSamples*testRatio)]
    trainRating = radomRating[int(nSamples*testRatio):]
    valiRating = radomRating[-int(trainRating.shape[0]*valiRatio):]
    trainSubRating = trainRating[:-int(trainRating.shape[0]*valiRatio)]
    #writeFile(trainIm,trainRating,'../list/'+attr+'_train.txt')
    #writeFile(testIm,testRating,'../list/'+attr+'_test.txt')
    writeFile(trainIm,trainRating,'../data_list/'+attr+'_train.txt')
    writeFile(testIm,testRating,'../data_list/'+attr+'_test.txt')
    baseLine = mean_rating.mean()
    print 'mean rating: ', baseLine

    predictionModel = sklearn.linear_model.RidgeCV(alphas=np.logspace(-3,2,num=20), fit_intercept=True)
    myModel,optFeaNum = Train_Test(trainSubRating,testRating,valiRating,trainfeature_transf, \
                                   testfeature_tansf,valifeature_transf,xVal = True,numTrain = 1,\
                                   pModel = predictionModel,getMaxMin = False,MODEL= MODEL, \
                                   plotPredActual = False,returnModel = True)
    modelList.append(myModel)
    optFeaNumList.append(optFeaNum)

In [None]:
np.savetxt(homePath+'/features/'+MODEL+'_'+featureLayer+'_'+Dataset+\
           'optNumF.csv', np.asarray(optFeaNumList), delimiter=',', fmt='%d')
print optFeaNumList