In [1]:
import scipy.io as sio

In [2]:
# Read the MATLAB metadata file; the result is indexed by variable name below.
matdata=sio.loadmat('./IMDB/imdb.mat')
# Strip the two outer wrapper levels of the 'imdb' entry to reach the record
# whose fields are read positionally in the next cell.
# NOTE(review): presumably a 1x1 MATLAB struct array -- confirm against the file.
items = matdata['imdb'][0][0]

In [3]:
# Pull each metadata field out of `items` by position; the trailing [0]
# removes one more wrapper level from every field.
# NOTE(review): the position -> field mapping is taken on trust from the
# variable names; verify it against the field order of the .mat struct.
dob = items[0][0]
photo_taken = items[1][0]
full_path = items[2][0]
gender = items[3][0]
name = items[4][0]
face_location = items[5][0]
face_score = items[6][0]
second_face_score = items[7][0]
celeb_names = items[8][0]
celeb_id = items[9][0]

In [4]:
import pandas as pd
from datetime import datetime, timedelta

In [5]:
def flatten(a):
    """Return a list holding the first element of every item in ``a``.

    Each item must be indexable; only position 0 is kept, so any further
    elements of an item are dropped.
    """
    firsts = []
    for item in a:
        firsts.append(item[0])
    return firsts

def dateConvert(matlab_datenum):
    """Convert a MATLAB-style serial date number to a Python ``datetime``.

    The integer part is treated as a day ordinal shifted by 366 days relative
    to Python's proleptic ordinal, and the fractional part as a fraction of a
    day (time of day).

    Args:
        matlab_datenum: int or float serial date number (numpy scalars are
            accepted).

    Returns:
        ``datetime`` for values >= 367; float NaN for anything smaller and
        for NaN input, so the value can be treated as missing downstream.
    """
    # 367 is the smallest value that lands on datetime.min (0001-01-01).
    # Fractional values in (366, 367) would fall before datetime.min and
    # raise OverflowError, so they are treated as missing as well. NaN fails
    # the comparison and also ends up in the missing branch.
    if matlab_datenum >= 367:
        # float() keeps timedelta happy when a numpy integer scalar is passed.
        day_fraction = float(matlab_datenum) % 1
        return datetime.fromordinal(int(matlab_datenum) - 366) + timedelta(days=day_fraction)
    # Same value as np.nan, but does not require numpy to be imported.
    return float('nan')

In [6]:
# Assemble the metadata table, one column per extracted field. The fields
# passed through flatten() are reduced to the first element of each entry.
df = pd.DataFrame(dict([
    ('path', flatten(full_path)),
    ('id', celeb_id),
    ('name', flatten(name)),
    ('dob', dob),
    ('gender', gender),
    ('score1', face_score),
    ('score2', second_face_score),
    ('pic_date', photo_taken),
    ('region', flatten(face_location)),
]))

In [7]:
# Build both date columns from the raw arrays rather than from the columns
# being overwritten, so this cell gives the same result when it is re-run
# (re-casting an already converted datetime column to int would corrupt it).
df['dob'] = pd.Series(dob, index=df.index).astype(int).apply(dateConvert)
# pic_date holds only a year; it is parsed as January 1st of that year.
df['pic_date'] = pd.to_datetime(pd.Series(photo_taken, index=df.index), format='%Y')

In [8]:
# Age in (fractional) years at the time the photo was taken.
# One year is taken as 31556952 s (365.2425 days), the length NumPy assigns
# to timedelta64(1, 'Y'). Spelling it out as a pd.Timedelta avoids the
# ambiguous 'Y' unit, which recent pandas versions reject, and removes the
# need for numpy in this cell.
df['age'] = (df['pic_date'] - df['dob']) / pd.Timedelta(seconds=31556952)

In [9]:
# Display every column's dtype to check the result of the conversions above.
df.dtypes

path                object
id                  uint16
name                object
dob         datetime64[ns]
gender             float64
score1             float64
score2             float64
pic_date    datetime64[ns]
region              object
age                float64
dtype: object

In [10]:
import pickle

In [26]:
# Display the (rows, columns) size of the full table before sampling/splitting.
df.shape

(460723, 10)

#### Split dataframe

In [28]:
def train_test_split(df, sample_frac, train_frac, rand_state):
    """Subsample ``df`` and partition the subsample into train and test frames.

    Args:
        df: source DataFrame.
        sample_frac: fraction of ``df`` rows to keep in the subsample.
        train_frac: fraction of the subsample assigned to the training frame.
        rand_state: seed passed to both random draws.

    Returns:
        Tuple ``(df_train, df_test)``; the test frame holds the subsample rows
        whose index labels were not drawn for training.
    """
    subset = df.sample(frac=sample_frac, random_state=rand_state)
    train_part = subset.sample(frac=train_frac, random_state=rand_state)
    # Membership is decided by index label, not by position.
    in_train = subset.index.isin(train_part.index)
    test_part = subset.loc[~in_train, :]
    return train_part, test_part

#### Using only 1 percent of the total data, split into train-test sets (95-5%)

In [29]:
# Keep 1% of the rows, then split that subsample 95/5 with seed 1.
df_1pct_train, df_1pct_test = train_test_split(
    df, sample_frac=0.01, train_frac=0.95, rand_state=1
)

In [30]:
# Persist the (train, test) pair as one pickled tuple. The context manager
# closes the file even if pickling raises.
with open('df_1pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_1pct_train, df_1pct_test), pickle_out)

In [31]:
# To load from file:
# (the context manager closes the handle, which was previously left open)
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open('df_1pct.pickle', 'rb') as pickle_in:
    my_df_train, my_df_test = pickle.load(pickle_in)

In [32]:
# Report (rows, columns) of the reloaded train and test frames.
print(*(part.shape for part in (my_df_train, my_df_test)))

(4377, 10) (230, 10)


#### Using 5 percent of the total data, split into train-test sets (95-5%)

In [33]:
# Keep 5% of the rows, then split that subsample 95/5 with seed 1.
df_5pct_train, df_5pct_test = train_test_split(
    df, sample_frac=0.05, train_frac=0.95, rand_state=1
)

In [34]:
# Persist the (train, test) pair as one pickled tuple. The context manager
# closes the file even if pickling raises.
with open('df_5pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_5pct_train, df_5pct_test), pickle_out)

In [35]:
# To load from file:
# (the context manager closes the handle, which was previously left open)
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open('df_5pct.pickle', 'rb') as pickle_in:
    my_df_train, my_df_test = pickle.load(pickle_in)

In [36]:
# Report (rows, columns) of the reloaded train and test frames.
print(*(part.shape for part in (my_df_train, my_df_test)))

(21884, 10) (1152, 10)


#### Using 10 percent of the total data, split into train-test sets (95-5%)

In [37]:
# Keep 10% of the rows, then split that subsample 95/5 with seed 1.
df_10pct_train, df_10pct_test = train_test_split(
    df, sample_frac=0.1, train_frac=0.95, rand_state=1
)

In [38]:
# Persist the (train, test) pair as one pickled tuple. The context manager
# closes the file even if pickling raises.
with open('df_10pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_10pct_train, df_10pct_test), pickle_out)

In [39]:
# To load from file:
# (the context manager closes the handle, which was previously left open)
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open('df_10pct.pickle', 'rb') as pickle_in:
    my_df_train, my_df_test = pickle.load(pickle_in)

In [40]:
# Report (rows, columns) of the reloaded train and test frames.
print(*(part.shape for part in (my_df_train, my_df_test)))

(43768, 10) (2304, 10)


#### Using 20 percent of the total data, split into train-test sets (95-5%)

In [45]:
# Keep 20% of the rows, then split that subsample 95/5 with seed 1.
df_20pct_train, df_20pct_test = train_test_split(
    df, sample_frac=0.2, train_frac=0.95, rand_state=1
)

In [46]:
# Persist the (train, test) pair as one pickled tuple. The context manager
# closes the file even if pickling raises.
with open('df_20pct.pickle', 'wb') as pickle_out:
    pickle.dump((df_20pct_train, df_20pct_test), pickle_out)

In [47]:
# To load from file:
# (the context manager closes the handle, which was previously left open)
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open('df_20pct.pickle', 'rb') as pickle_in:
    my_df_train, my_df_test = pickle.load(pickle_in)

In [48]:
# Report (rows, columns) of the reloaded train and test frames.
print(*(part.shape for part in (my_df_train, my_df_test)))

(87538, 10) (4607, 10)


#### Using all data, split into train-test sets (95-5%)

In [49]:
# Keep every row (sample fraction 1), then split 95/5 with seed 1.
df_all_train, df_all_test = train_test_split(
    df, sample_frac=1, train_frac=0.95, rand_state=1
)

In [50]:
# Persist the (train, test) pair as one pickled tuple. The context manager
# closes the file even if pickling raises.
with open('df_all.pickle', 'wb') as pickle_out:
    pickle.dump((df_all_train, df_all_test), pickle_out)

In [51]:
# To load from file:
# (the context manager closes the handle, which was previously left open)
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open('df_all.pickle', 'rb') as pickle_in:
    my_df_train, my_df_test = pickle.load(pickle_in)

In [52]:
# Report (rows, columns) of the reloaded train and test frames.
print(*(part.shape for part in (my_df_train, my_df_test)))

(437687, 10) (23036, 10)
