In [82]:
import scipy.io as sio

In [83]:
matdata=sio.loadmat('./IMDB/imdb.mat')
items = matdata['imdb'][0][0]

In [84]:
dob = items[0][0]
photo_taken = items[1][0]
full_path = items[2][0]
gender = items[3][0]
name = items[4][0]
face_location = items[5][0]
face_score = items[6][0]
second_face_score = items[7][0]
celeb_names = items[8][0]
celeb_id = items[9][0]

In [85]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [86]:
def flatten(a):
    return [x[0] for x in a]

def dateConvert(matlab_datenum):
    if matlab_datenum > 366:
        return datetime.fromordinal(int(matlab_datenum)) + timedelta(days=matlab_datenum%1) - timedelta(days = 366)
    else:
        return np.nan

In [87]:
df = pd.DataFrame({'path': flatten(full_path),
                   'id': celeb_id,
                   'name': flatten(name), 
                   'dob': dob,
                   'gender': gender,
                   'score1': face_score,
                   'score2': second_face_score,
                   'pic_date': photo_taken,
                   'region': flatten(face_location),
                   
                   })

In [88]:
df.dob = df.dob.astype(int).apply(dateConvert)
df.pic_date = pd.to_datetime(df.pic_date, format='%Y')
df['age'] = (df.pic_date - df.dob)/np.timedelta64(1,'Y')
df.dropna(axis=0, how='any', subset=['path','gender'], inplace=True)
df['gender'] = df.gender.astype(int)
df['path'] = df.path.astype(str)

In [89]:
df.head(3)

Unnamed: 0,path,id,name,dob,gender,score1,score2,pic_date,region,age
0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,6488,Fred Astaire,1899-05-10,1,1.459693,1.118973,1968-01-01,"[1072.926, 161.838, 1214.7839999999999, 303.69...",68.644804
1,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,6488,Fred Astaire,1899-05-10,1,2.543198,1.852008,1970-01-01,"[477.184, 100.352, 622.592, 245.76]",70.646215
2,01/nm0000001_rm577153792_1899-5-10_1968.jpg,6488,Fred Astaire,1899-05-10,1,3.455579,2.98566,1968-01-01,"[114.96964308962852, 114.96964308962852, 451.6...",68.644804


In [90]:
df.shape

(452261, 10)

In [91]:
# Drop data points where there is no score (crop failure)
df = df[df.region.apply(lambda x: (x[0]!=1 and x[1]!=1))]

In [92]:
# Select only rows where they are only 1 face score (no second person)
df = df[pd.isnull(df.score2)]

In [93]:
df.shape

(181626, 10)

In [94]:
# randomly remove some male samples to balance male and female data points
frac = (df[df.gender==1].shape[0]-df[df.gender==0].shape[0])/(df[df.gender==1].shape[0])
df=df.drop(df.query('gender == 1').sample(frac=frac, random_state=1 ).index)


In [95]:
# male data points:
df[df.gender==1].shape

(81668, 10)

In [96]:
# female data points:
df[df.gender==0].shape

(81668, 10)

In [97]:
df.dtypes

path                object
id                  uint16
name                object
dob         datetime64[ns]
gender               int32
score1             float64
score2             float64
pic_date    datetime64[ns]
region              object
age                float64
dtype: object

In [98]:
import pickle

In [99]:
df.shape

(163336, 10)

#### Split dataframe

In [100]:
def train_test_split(df, sample_frac, train_frac, rand_state):
    """downsize a given df. split into train and test sets."""
    small_df = df.sample(frac=sample_frac, random_state=rand_state)
    df_train = small_df.sample(frac=train_frac, random_state=rand_state)
    df_test = small_df.loc[~small_df.index.isin(df_train.index), :]
    return (df_train, df_test)

#### Using only 1 percent of the total data, split into train-test sets (95-5%)

In [101]:
(df_1pct_train, df_1pct_test) = train_test_split(df, 0.01, 1, 1)

In [102]:
pickle_out = open('df_1pct.pickle', 'wb')
pickle.dump((df_1pct_train, df_1pct_test), pickle_out)
pickle_out.close()

In [103]:
# To load from file:
pickle_in = open('df_1pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [104]:
print(my_df_train.shape, my_df_test.shape)

(1633, 10) (0, 10)


In [105]:
pickle_in.close()

#### Using 5 percent of the total data, split into train-test sets (95-5%)

In [106]:
(df_5pct_train, df_5pct_test) = train_test_split(df, 0.05, 0.95, 1)

In [107]:
pickle_out = open('df_5pct.pickle', 'wb')
pickle.dump((df_5pct_train, df_5pct_test), pickle_out)
pickle_out.close()

In [108]:
# To load from file:
pickle_in = open('df_5pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [109]:
print(my_df_train.shape, my_df_test.shape)

(7759, 10) (408, 10)


#### Using 10 percent of the total data, split into train-test sets (95-5%)

In [110]:
(df_10pct_train, df_10pct_test) = train_test_split(df, 0.1, 0.95, 1)

In [111]:
pickle_out = open('df_10pct.pickle', 'wb')
pickle.dump((df_10pct_train, df_10pct_test), pickle_out)
pickle_out.close()

In [112]:
# To load from file:
pickle_in = open('df_10pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [113]:
print(my_df_train.shape, my_df_test.shape)

(15517, 10) (817, 10)


#### Using 20 percent of the total data, split into train-test sets (95-5%)

In [114]:
(df_20pct_train, df_20pct_test) = train_test_split(df, 0.2, 0.95, 1)

In [115]:
pickle_out = open('df_20pct.pickle', 'wb')
pickle.dump((df_20pct_train, df_20pct_test), pickle_out)
pickle_out.close()

In [116]:
# To load from file:
pickle_in = open('df_20pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [117]:
print(my_df_train.shape, my_df_test.shape)

(31034, 10) (1633, 10)


#### Using all data, split into train-test sets (95-5%)

In [118]:
(df_all_train, df_all_test) = train_test_split(df, 1, 0.95, 1)

In [119]:
pickle_out = open('df_all.pickle', 'wb')
pickle.dump((df_all_train, df_all_test), pickle_out)
pickle_out.close()

In [120]:
# To load from file:
pickle_in = open('df_all.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [121]:
print(my_df_train.shape, my_df_test.shape)

(155169, 10) (8167, 10)
