In [1]:
import scipy.io as sio

In [2]:
# Read the MATLAB metadata file shipped with the dataset.
matdata = sio.loadmat('./IMDB/imdb.mat')

# The 'imdb' entry is wrapped in two singleton levels; peel both off to
# reach the record whose fields are indexed positionally further down.
items = matdata['imdb'][0][0]

In [3]:
# Pull the first ten fields out of `items` by position. Every field carries
# one extra leading dimension, which the trailing [0] strips.
(
    dob,
    photo_taken,
    full_path,
    gender,
    name,
    face_location,
    face_score,
    second_face_score,
    celeb_names,
    celeb_id,
) = (items[field_idx][0] for field_idx in range(10))

In [4]:
import pandas as pd
from datetime import datetime, timedelta

In [5]:
def flatten(a):
    """Return a list holding the first element of every entry in ``a``.

    Each entry of ``a`` must be indexable; an entry with no element at
    position 0 raises the usual ``IndexError``.
    """
    firsts = []
    for entry in a:
        firsts.append(entry[0])
    return firsts

def dateConvert(matlab_datenum):
    """Convert a MATLAB serial date number into a Python ``datetime``.

    The whole-day part is read as a proleptic Gregorian ordinal and then
    shifted back by 366 days, the offset between MATLAB's datenum origin
    and Python's ordinal origin. Any fractional part is added as a
    fraction of a day.

    Parameters
    ----------
    matlab_datenum : int or float
        Serial date number to convert.

    Returns
    -------
    datetime or float
        The converted timestamp, or ``nan`` when the input is below 367
        (or is itself NaN). Such values would land before
        ``datetime.min`` after the 366-day shift, so they are treated as
        missing.
    """
    # 367 is the smallest datenum that maps to a valid datetime
    # (0001-01-01). The previous ``> 366`` test let values such as 366.5
    # through, and they overflowed after the 366-day shift.
    if matlab_datenum >= 367:
        whole_days = int(matlab_datenum)
        # Coerce to a builtin float so NumPy integer scalars do not reach
        # timedelta(), which rejects them.
        day_fraction = float(matlab_datenum) % 1
        return (
            datetime.fromordinal(whole_days)
            + timedelta(days=day_fraction)
            - timedelta(days=366)
        )
    # Builtin NaN: same value as np.nan, without relying on a NumPy import
    # that this notebook never performs.
    return float("nan")

In [11]:
# Assemble one table from the unpacked metadata arrays. Columns whose
# entries are themselves nested go through flatten() first.
df = pd.DataFrame(dict(
    path=flatten(full_path),
    id=celeb_id,
    name=flatten(name),
    dob=dob,
    gender=gender,
    score1=face_score,
    score2=second_face_score,
    pic_date=photo_taken,
    region=flatten(face_location),
))

In [12]:
# Decode the serial date numbers in `dob`; dateConvert yields a missing
# value for numbers it cannot map to a datetime.
df.dob = df.dob.astype(int).apply(dateConvert)

# `pic_date` carries only a year, so parsing with '%Y' anchors every photo
# to 1 January of that year.
df.pic_date = pd.to_datetime(df.pic_date, format='%Y')

# Age in years at photo time, using the mean Gregorian year of 365.2425
# days. This is the same length np.timedelta64(1, 'Y') stood for, written
# as an explicit Timedelta so the cell needs no NumPy import and does not
# depend on the ambiguous 'Y' unit that recent pandas rejects.
df['age'] = (df.pic_date - df.dob) / pd.Timedelta(days=365.2425)

# Rows lacking a path or a gender label are dropped before the casts below.
df.dropna(axis=0, how='any', subset=['path', 'gender'], inplace=True)
df['gender'] = df.gender.astype(int)
df['path'] = df.path.astype(str)

In [13]:
df.head(5)

Unnamed: 0,path,id,name,dob,gender,score1,score2,pic_date,region,age
0,01/nm0000001_rm124825600_1899-5-10_1968.jpg,6488,Fred Astaire,1899-05-10,1,1.459693,1.118973,1968-01-01,"[1072.926, 161.838, 1214.7839999999999, 303.69...",68.644804
1,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,6488,Fred Astaire,1899-05-10,1,2.543198,1.852008,1970-01-01,"[477.184, 100.352, 622.592, 245.76]",70.646215
2,01/nm0000001_rm577153792_1899-5-10_1968.jpg,6488,Fred Astaire,1899-05-10,1,3.455579,2.98566,1968-01-01,"[114.96964308962852, 114.96964308962852, 451.6...",68.644804
3,01/nm0000001_rm946909184_1899-5-10_1968.jpg,6488,Fred Astaire,1899-05-10,1,1.872117,,1968-01-01,"[622.8855056426588, 424.21750383700805, 844.33...",68.644804
4,01/nm0000001_rm980463616_1899-5-10_1968.jpg,6488,Fred Astaire,1899-05-10,1,1.158766,,1968-01-01,"[1013.8590023603723, 233.8820422075853, 1201.5...",68.644804


In [14]:
df.dtypes

path                object
id                  uint16
name                object
dob         datetime64[ns]
gender               int64
score1             float64
score2             float64
pic_date    datetime64[ns]
region              object
age                float64
dtype: object

In [15]:
import pickle

In [16]:
df.shape

(452261, 10)

#### Split dataframe

In [17]:
def train_test_split(df, sample_frac, train_frac, rand_state):
    """Subsample ``df`` and partition the subsample into train and test sets.

    A ``sample_frac`` fraction of the rows is drawn first. From that
    subsample, a ``train_frac`` fraction becomes the training frame and
    every remaining row becomes the test frame. Both draws are seeded with
    ``rand_state``.

    Returns a ``(train, test)`` tuple of DataFrames.
    """
    subset = df.sample(frac=sample_frac, random_state=rand_state)
    train_part = subset.sample(frac=train_frac, random_state=rand_state)

    # Whatever index labels did not land in the training draw form the test set.
    in_train = subset.index.isin(train_part.index)
    test_part = subset.loc[~in_train, :]
    return (train_part, test_part)

#### Using only 1 percent of the total data, split into train-test sets (95-5%)

In [18]:
df_1pct_train, df_1pct_test = train_test_split(
    df, sample_frac=0.01, train_frac=0.95, rand_state=1
)

In [19]:
# Persist the 1% train/test pair. The with-block closes the file even if
# pickling fails part-way through.
with open('df_1pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_1pct_train, df_1pct_test), pickle_out)

In [20]:
# To load from file:
pickle_in = open('df_1pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [21]:
print(my_df_train.shape, my_df_test.shape)

(4297, 10) (226, 10)


#### Using 5 percent of the total data, split into train-test sets (95-5%)

In [22]:
df_5pct_train, df_5pct_test = train_test_split(
    df, sample_frac=0.05, train_frac=0.95, rand_state=1
)

In [23]:
# Persist the 5% train/test pair. The with-block closes the file even if
# pickling fails part-way through.
with open('df_5pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_5pct_train, df_5pct_test), pickle_out)

In [24]:
# To load from file:
pickle_in = open('df_5pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [25]:
print(my_df_train.shape, my_df_test.shape)

(21482, 10) (1131, 10)


#### Using 10 percent of the total data, split into train-test sets (95-5%)

In [26]:
df_10pct_train, df_10pct_test = train_test_split(
    df, sample_frac=0.1, train_frac=0.95, rand_state=1
)

In [27]:
# Persist the 10% train/test pair. The with-block closes the file even if
# pickling fails part-way through.
with open('df_10pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_10pct_train, df_10pct_test), pickle_out)

In [28]:
# To load from file:
pickle_in = open('df_10pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [29]:
print(my_df_train.shape, my_df_test.shape)

(42965, 10) (2261, 10)


#### Using 20 percent of the total data, split into train-test sets (95-5%)

In [30]:
df_20pct_train, df_20pct_test = train_test_split(
    df, sample_frac=0.2, train_frac=0.95, rand_state=1
)

In [31]:
# Persist the 20% train/test pair. The with-block closes the file even if
# pickling fails part-way through.
with open('df_20pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_20pct_train, df_20pct_test), pickle_out)

In [32]:
# To load from file:
pickle_in = open('df_20pct.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [33]:
print(my_df_train.shape, my_df_test.shape)

(85929, 10) (4523, 10)


#### Using all data, split into train-test sets (95-5%)

In [34]:
df_all_train, df_all_test = train_test_split(
    df, sample_frac=1, train_frac=0.95, rand_state=1
)

In [35]:
# Persist the full-data train/test pair. The with-block closes the file
# even if pickling fails part-way through.
with open('df_all.pickle', 'wb') as pickle_out:
    pickle.dump((df_all_train, df_all_test), pickle_out)

In [36]:
# To load from file:
pickle_in = open('df_all.pickle', 'rb')
my_df_train, my_df_test = pickle.load(pickle_in)

In [37]:
print(my_df_train.shape, my_df_test.shape)

(429648, 10) (22613, 10)
