In [1]:
import pandas as pd
import numpy as np
import scipy.io
import datetime
from datetime import tzinfo, timedelta, datetime

import warnings
warnings.filterwarnings('ignore')

import os.path
from os import path

In [2]:
# Load the IMDB metadata file; loadmat returns a dict keyed by MATLAB variable name.
mat = scipy.io.loadmat('imdb_data/imdb.mat')

In [3]:
# Inspect the top-level keys of the loaded dictionary.
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'imdb'])

In [4]:
# Keep only the 'imdb' entry; the other keys are loader bookkeeping ('__header__', ...).
mat_data = mat['imdb']

In [5]:
# The structured dtype of the array carries the field names of the 'imdb' record.
mtype = mat_data.dtype

In [6]:
# Show the field names available in the record.
mtype.names

('dob',
 'photo_taken',
 'full_path',
 'gender',
 'name',
 'face_location',
 'face_score',
 'second_face_score',
 'celeb_names',
 'celeb_id')

In [7]:
# Number of fields stored in the record at mat_data[0][0].
len(mat_data[0][0])

10

In [8]:
# Field names as a list, in the order the dtype declares them.
column_titles = list(mtype.names)

In [9]:
# Map every field name of the record to the raw value stored at the same position.
record = mat_data[0][0]
image_info = {
    title: record[position]
    for position, title in enumerate(column_titles)
}

In [10]:
# Peek at the raw 'dob' values (integer day numbers, converted to dates further down).
image_info['dob'][0]

array([693726, 693726, 693726, ..., 726831, 726831, 726831], dtype=int32)

In [11]:
def _flatten(nested):
    """Return one flat list holding the items of every inner sequence, in order."""
    return [item for inner in nested for item in inner]


# Fields that are used as-is: row 0 of the stored array.
birth_date = image_info['dob'][0]
year_taken = image_info['photo_taken'][0]
gender = image_info['gender'][0]
face_score = image_info['face_score'][0]
second_face_score = image_info['second_face_score'][0]

# Fields whose entries are themselves sequences: unwrap one level of nesting.
file_path = _flatten(image_info['full_path'][0])
name = _flatten(image_info['name'][0])
face_location = _flatten(image_info['face_location'][0])

columns = column_titles

# Column labels are looked up by position in the field-name list; only the
# path column gets its own label ('file_path').
image_data_dictionary = {
    columns[4]: name,
    columns[0]: birth_date,
    columns[3]: gender,
    columns[1]: year_taken,
    'file_path': file_path,
    columns[5]: face_location,
    columns[6]: face_score,
    columns[7]: second_face_score,
}


In [12]:
# Assemble the per-photo columns into a single DataFrame.
photo_info = pd.DataFrame(image_data_dictionary)

In [13]:
# Preview the first rows to sanity-check the assembled columns.
photo_info.head()

Unnamed: 0,name,dob,gender,photo_taken,file_path,face_location,face_score,second_face_score
0,Fred Astaire,693726,1.0,1968,01/nm0000001_rm124825600_1899-5-10_1968.jpg,"[1072.926, 161.838, 1214.7839999999999, 303.69...",1.459693,1.118973
1,Fred Astaire,693726,1.0,1970,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,"[477.184, 100.352, 622.592, 245.76]",2.543198,1.852008
2,Fred Astaire,693726,1.0,1968,01/nm0000001_rm577153792_1899-5-10_1968.jpg,"[114.96964308962852, 114.96964308962852, 451.6...",3.455579,2.98566
3,Fred Astaire,693726,1.0,1968,01/nm0000001_rm946909184_1899-5-10_1968.jpg,"[622.8855056426588, 424.21750383700805, 844.33...",1.872117,
4,Fred Astaire,693726,1.0,1968,01/nm0000001_rm980463616_1899-5-10_1968.jpg,"[1013.8590023603723, 233.8820422075853, 1201.5...",1.158766,


In [14]:
# Remove rows whose MATLAB serial birth date is implausibly small (< 2000);
# these carry incorrect birthday information and would break or distort the
# conversion to a Python date.
# NOTE(review): 2000 is a magic cutoff. Only values below 367 actually fail in
# convert_matlab_dob (the 366-day shift would leave the valid date range) —
# confirm the intended bound.
photo_info = photo_info[photo_info['dob'] >= 2000].reset_index(drop=True)

In [15]:
def convert_matlab_dob(birthday):
    """Convert a MATLAB serial date number into a Python ``date``.

    MATLAB serial dates and Python proleptic ordinals count days from
    different origins; subtracting 366 days aligns the two scales.

    Parameters
    ----------
    birthday : int, float or NumPy numeric scalar
        MATLAB serial date number. A fractional part, if any, is the
        time of day and does not affect the returned date.

    Returns
    -------
    datetime.date
        The calendar date corresponding to ``birthday``.

    Raises
    ------
    ValueError
        If ``birthday`` is NaN or its integer part is not a valid ordinal.
    OverflowError
        If shifting by 366 days leaves the supported date range.
    """
    # Coerce to a built-in float first: timedelta() rejects NumPy integer
    # scalars, so passing a raw array element used to raise TypeError.
    serial = float(birthday)
    whole_days = int(serial)
    day_fraction = serial % 1
    moment = datetime.fromordinal(whole_days) + timedelta(days=day_fraction)
    return (moment - timedelta(days=366)).date()

In [16]:
# Replace the MATLAB serial numbers in 'dob' with Python date objects.
# NOTE: not idempotent — re-running this cell after the conversion raises,
# because the column then holds dates that int() cannot convert.
photo_info['dob'] = photo_info['dob'].apply(convert_matlab_dob)

In [17]:
# Check for null values in the 'dob' column after the birthday conversion.
# Grouping on 'dob' would silently discard exactly the rows being looked for
# (groupby drops null keys by default), so the affected rows are listed
# directly instead; an empty result means every birthday converted.
missing_dob = photo_info['dob'].isnull()
photo_info.loc[missing_dob, ['name', 'dob']].drop_duplicates()

In [18]:
# Count the distinct names whose gender value is neither 0 nor 1.
invalid_gender = ~photo_info['gender'].isin([0, 1])
photo_info.loc[invalid_gender, 'name'].nunique()

1201

In [19]:
# Keep only rows whose gender is 0 or 1 and renumber the index from zero.
photo_info = photo_info[photo_info['gender'].isin([0,1])].reset_index(drop=True)

In [20]:
# Remove the 'second_face_score' column. Reassigning (rather than mutating
# in place) with errors='ignore' keeps this cell safe to re-run once the
# column is already gone; the redundant axis argument is dropped.
photo_info = photo_info.drop(columns=['second_face_score'], errors='ignore')

In [21]:
def year_birth_extraction(x):
    """Return the calendar year of a date-like value as a plain int."""
    return int(x.year)


# Age is the photo's year minus the birth year only; month and day of birth
# are not taken into account.
birth_year = photo_info['dob'].map(year_birth_extraction)
photo_info['age_when_taken'] = photo_info['photo_taken'] - birth_year

In [22]:
# Put the columns in their final order. reset_index returns a new frame
# instead of modifying the caller, so its result has to be assigned for the
# call to have any effect.
photo_info = photo_info[
    ['name', 'dob', 'gender', 'photo_taken', 'age_when_taken',
     'file_path', 'face_location', 'face_score']
].reset_index(drop=True)

photo_info.head()

Unnamed: 0,name,dob,gender,photo_taken,age_when_taken,file_path,face_location,face_score
0,Fred Astaire,1899-05-10,1.0,1968,69,01/nm0000001_rm124825600_1899-5-10_1968.jpg,"[1072.926, 161.838, 1214.7839999999999, 303.69...",1.459693
1,Fred Astaire,1899-05-10,1.0,1970,71,01/nm0000001_rm3343756032_1899-5-10_1970.jpg,"[477.184, 100.352, 622.592, 245.76]",2.543198
2,Fred Astaire,1899-05-10,1.0,1968,69,01/nm0000001_rm577153792_1899-5-10_1968.jpg,"[114.96964308962852, 114.96964308962852, 451.6...",3.455579
3,Fred Astaire,1899-05-10,1.0,1968,69,01/nm0000001_rm946909184_1899-5-10_1968.jpg,"[622.8855056426588, 424.21750383700805, 844.33...",1.872117
4,Fred Astaire,1899-05-10,1.0,1968,69,01/nm0000001_rm980463616_1899-5-10_1968.jpg,"[1013.8590023603723, 233.8820422075853, 1201.5...",1.158766


In [23]:
# Verify that each referenced image file is present under imdb_data/.
found_flags = [
    path.exists('imdb_data/' + relative_path)
    for relative_path in photo_info.file_path
]
file_exists = sum(found_flags)
file_not_exist = len(found_flags) - file_exists

print(file_exists)
print(file_not_exist)

452132
0


In [24]:
# Persist the cleaned metadata; index = False leaves the row index out of the CSV.
# NOTE(review): 'face_location' appears to hold array values, which a CSV can
# only store in their string form — confirm downstream readers can parse it.
photo_info.to_csv('photo_metadata.csv', index = False)