In [1]:
# download on https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/

In [2]:
import pandas as pd
import numpy as np
import scipy.io as spio

In [3]:
imdb_matdata = spio.loadmat('../datasets/imdb-wiki/imdb_meta/imdb/imdb.mat')
wiki_matdata = spio.loadmat('../datasets/imdb-wiki/wiki/wiki.mat')

In [None]:
# Information got from IMDB-WIKI homepage

# Dataset Features:
#   dob: date of birth (Matlab serial date number)
#   photo_taken: year when the photo was taken
#   full_path: path to file
#   gender: 0 for female and 1 for male, NaN if unknown
#   name: name of the celebrity
#   face_location: location of the face. To crop the face in Matlab run
#   img(face_location(2):face_location(4),face_location(1):face_location(3),:))
#   face_score: detector score (the higher the better). Inf implies that no face was found in the image and the face_location then just returns the entire image
#   second_face_score: detector score of the face with the second highest score. This is useful to ignore images with more than one face. second_face_score is NaN if no second face was detected.
#   celeb_names (IMDB only): list of all celebrity names
#   celeb_id (IMDB only): index of celebrity name
#
# The age of a person can be calculated based on the date of birth and 
# the time when the photo was taken (note that we assume that the photo was 
# taken in the middle of the year):
#   [age,~]=datevec(datenum(wiki.photo_taken,7,1)-wiki.dob); 

In [4]:
imdb_matdata.keys()

dict_keys(['__header__', '__version__', '__globals__', 'imdb'])

In [5]:
imdb_np_data = imdb_matdata['imdb']

In [6]:
imdb_info_layer = [ set() ] * 4

for l0 in imdb_np_data:
    imdb_info_layer[0].add(f'type: {type(l0)}, len: {len(l0)}')
    for l1 in l0:
        imdb_info_layer[1].add(f'type: {type(l1)}, len: {len(l1)}')
        for l2 in l1:
            imdb_info_layer[2].add(f'type: {type(l2)}, len: {len(l2)}')
            for l3 in l2:
                imdb_info_layer[3].add(f'type: {type(l3)}, len: {len(l3)}')

for i in range(4):
    print(f'Structure on depth {i+1}:')
    for info in imdb_info_layer[i]:
        print(f'\t{info}')

Structure on depth 1:
	type: <class 'numpy.ndarray'>, len: 20284
	type: <class 'numpy.ndarray'>, len: 460723
	type: <class 'numpy.void'>, len: 10
	type: <class 'numpy.ndarray'>, len: 1
Structure on depth 2:
	type: <class 'numpy.ndarray'>, len: 20284
	type: <class 'numpy.ndarray'>, len: 460723
	type: <class 'numpy.void'>, len: 10
	type: <class 'numpy.ndarray'>, len: 1
Structure on depth 3:
	type: <class 'numpy.ndarray'>, len: 20284
	type: <class 'numpy.ndarray'>, len: 460723
	type: <class 'numpy.void'>, len: 10
	type: <class 'numpy.ndarray'>, len: 1
Structure on depth 4:
	type: <class 'numpy.ndarray'>, len: 20284
	type: <class 'numpy.ndarray'>, len: 460723
	type: <class 'numpy.void'>, len: 10
	type: <class 'numpy.ndarray'>, len: 1


In [7]:
for i in range(10):
    print(f'Feature {i}:')
    print(f'\t type: {type(imdb_np_data[0][0][i][0][0])}')
    print(f'\t len: {type(imdb_np_data[0][0][i][0])}')
    print(f'\t sample: {imdb_np_data[0][0][i][0][0]}')

Feature 0:
	 type: <class 'numpy.int32'>
	 len: <class 'numpy.ndarray'>
	 sample: 693726
Feature 1:
	 type: <class 'numpy.uint16'>
	 len: <class 'numpy.ndarray'>
	 sample: 1968
Feature 2:
	 type: <class 'numpy.ndarray'>
	 len: <class 'numpy.ndarray'>
	 sample: ['01/nm0000001_rm124825600_1899-5-10_1968.jpg']
Feature 3:
	 type: <class 'numpy.float64'>
	 len: <class 'numpy.ndarray'>
	 sample: 1.0
Feature 4:
	 type: <class 'numpy.ndarray'>
	 len: <class 'numpy.ndarray'>
	 sample: ['Fred Astaire']
Feature 5:
	 type: <class 'numpy.ndarray'>
	 len: <class 'numpy.ndarray'>
	 sample: [[1072.926  161.838 1214.784  303.696]]
Feature 6:
	 type: <class 'numpy.float64'>
	 len: <class 'numpy.ndarray'>
	 sample: 1.4596929136202572
Feature 7:
	 type: <class 'numpy.float64'>
	 len: <class 'numpy.ndarray'>
	 sample: 1.1189733571573068
Feature 8:
	 type: <class 'numpy.ndarray'>
	 len: <class 'numpy.ndarray'>
	 sample: ["'Lee' George Quinones"]
Feature 9:
	 type: <class 'numpy.uint16'>
	 len: <class 'numpy

In [8]:
wiki_matdata.keys()

dict_keys(['__header__', '__version__', '__globals__', 'wiki'])

In [9]:
wiki_np_data = wiki_matdata['wiki']

In [10]:
wiki_info_layer = [ set() ] * 4

for l0 in wiki_np_data:
    wiki_info_layer[0].add(f'type: {type(l0)}, len: {len(l0)}')
    for l1 in l0:
        wiki_info_layer[1].add(f'type: {type(l1)}, len: {len(l1)}')
        for l2 in l1:
            wiki_info_layer[2].add(f'type: {type(l2)}, len: {len(l2)}')
            for l3 in l2:
                wiki_info_layer[3].add(f'type: {type(l3)}, len: {len(l3)}')

for i in range(4):
    print(f'Structure on depth {i+1}:')
    for info in wiki_info_layer[i]:
        print(f'\t{info}')

Structure on depth 1:
	type: <class 'numpy.void'>, len: 8
	type: <class 'numpy.ndarray'>, len: 62328
	type: <class 'numpy.ndarray'>, len: 1
Structure on depth 2:
	type: <class 'numpy.void'>, len: 8
	type: <class 'numpy.ndarray'>, len: 62328
	type: <class 'numpy.ndarray'>, len: 1
Structure on depth 3:
	type: <class 'numpy.void'>, len: 8
	type: <class 'numpy.ndarray'>, len: 62328
	type: <class 'numpy.ndarray'>, len: 1
Structure on depth 4:
	type: <class 'numpy.void'>, len: 8
	type: <class 'numpy.ndarray'>, len: 62328
	type: <class 'numpy.ndarray'>, len: 1


In [11]:
for i in range(8):
    print(f'Feature {i}:')
    print(f'\t type: {type(wiki_np_data[0][0][i][0][0])}')
    print(f'\t len: {type(wiki_np_data[0][0][i][0])}')
    print(f'\t sample: {wiki_np_data[0][0][i][0][0]}')

Feature 0:
	 type: <class 'numpy.int32'>
	 len: <class 'numpy.ndarray'>
	 sample: 723671
Feature 1:
	 type: <class 'numpy.uint16'>
	 len: <class 'numpy.ndarray'>
	 sample: 2009
Feature 2:
	 type: <class 'numpy.ndarray'>
	 len: <class 'numpy.ndarray'>
	 sample: ['17/10000217_1981-05-05_2009.jpg']
Feature 3:
	 type: <class 'numpy.float64'>
	 len: <class 'numpy.ndarray'>
	 sample: 1.0
Feature 4:
	 type: <class 'numpy.ndarray'>
	 len: <class 'numpy.ndarray'>
	 sample: ['Sami Jauhojärvi']
Feature 5:
	 type: <class 'numpy.ndarray'>
	 len: <class 'numpy.ndarray'>
	 sample: [[111.29109473 111.29109473 252.66993082 252.66993082]]
Feature 6:
	 type: <class 'numpy.float64'>
	 len: <class 'numpy.ndarray'>
	 sample: 4.3009623883308095
Feature 7:
	 type: <class 'numpy.float64'>
	 len: <class 'numpy.ndarray'>
	 sample: nan
