In [122]:
import ast
import json
import shutil
from pathlib import Path 
from functools import partial

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [16]:
df = pd.read_csv('./data/faces.csv', index_col=0)
df.head()

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,filename,character,cast_id
0,1811020,192,0,798,196,916,314,1080,1920,13924,0.007,1,1,"[-0.1581144630908966, 0.11875572055578232, 0.0...",,,
1,1811020,216,0,1068,389,1491,812,1080,1920,178929,0.086,1,1,"[-0.035924218595027924, 0.09460100531578064, 0...",,,
2,1811020,216,1,912,17,1205,311,1080,1920,86142,0.042,1,1,"[-0.10312943160533905, 0.1261938065290451, 0.0...",,,
3,1811020,240,0,1090,106,1384,400,1080,1920,86436,0.042,1,1,"[-0.18763160705566406, 0.08110883831977844, 0....",,,
4,1811020,240,1,674,522,968,816,1080,1920,86436,0.042,1,1,"[-0.11751651763916016, 0.1057925745844841, 0.0...",,,


In [17]:
class_df = df[df['character'].notna()]
class_df.head()

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,filename,character,cast_id
7,1811020,384,0,497,208,849,560,1080,1920,123904,0.06,1,1,"[-0.09047424793243408, 0.08664017170667648, 0....",S01E01_384_0.png,Carrie Mathison,132.0
8,1811020,408,0,390,208,742,560,1080,1920,123904,0.06,1,1,"[-0.15323874354362488, 0.10863395780324936, 0....",S01E01_408_0.png,Carrie Mathison,132.0
9,1811020,432,0,198,99,807,707,1080,1920,370272,0.179,1,1,"[-0.1460844725370407, 0.09071134030818939, 0.0...",S01E01_432_0.png,Carrie Mathison,132.0
10,1811020,456,0,445,160,1053,769,1080,1920,370272,0.179,1,1,"[-0.11816225200891495, 0.12249065190553665, 0....",S01E01_456_0.png,Carrie Mathison,132.0
11,1811020,480,0,260,160,868,769,1080,1920,370272,0.179,1,1,"[-0.09060899913311005, 0.09976912289857864, 0....",S01E01_480_0.png,Carrie Mathison,132.0


In [19]:
classes = {x: num for num, x in enumerate(list(set(class_df['character'])))}
classes

{'Nicholas Brody': 0,
 'Numan': 1,
 'Etai Luskin': 2,
 'Sandy Langmore': 3,
 'Jonas': 4,
 'Clint Prower': 5,
 'Jim Lippard': 6,
 'Jalal Haqqani': 7,
 'Lynne Reed': 8,
 'Dar Adal': 9,
 'Jessica Brody': 10,
 'Ray Conlin': 11,
 'Otto During': 12,
 'Reda Hashem': 13,
 "Abdul Qadir G'ulom": 14,
 'Paksima': 15,
 'Aayan Ibrahim': 16,
 'Haissam Haqqani': 17,
 'Ellen Mathison': 18,
 'Max Piotrowski': 19,
 'Senator Sam Paley': 20,
 'Dante Allen': 21,
 'Peter Quinn': 22,
 'John Zabel': 23,
 'Ivan Krupin': 24,
 'Dana Brody': 25,
 'Simone Martin': 26,
 'President Ben Hayes': 27,
 'Saul Berenson': 28,
 'Carrie Mathison': 29,
 'Thomas Anson': 30,
 'John Redmond': 31,
 'Franny': 32,
 'Christine Lonas': 33,
 'David Wellington': 34,
 'Chris Brody': 35,
 "Brett O'Keefe": 36,
 'Mike Dunn': 37,
 'Tom Walker': 38,
 'Fara Sherazi': 39,
 'Frank Mathison': 40,
 'Issa Nazir': 41,
 'Martha Boyd': 42,
 'President Elizabeth Keane': 43,
 'Maggie Mathison': 44,
 'Sekou Bah': 45,
 'Majid Javadi': 46,
 'Helen Walker':

In [28]:
X = np.array([np.array(ast.literal_eval(x)) for x in class_df['encoding'].tolist()])
y = np.array([classes[x] for x in class_df['character']])
print(X.shape)
print(y.shape)

(126839, 128)
(126839,)


In [29]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print('x_train: ', x_train.shape)
print('x_test: ', x_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

x_train:  (101471, 128)
x_test:  (25368, 128)
y_train:  (101471,)
y_test:  (25368,)


In [30]:
neigh = KNeighborsClassifier()
neigh.fit(x_train, y_train)

In [31]:
pred = neigh.predict(x_test)
pred

array([28, 25, 59, ..., 63, 29, 43])

In [35]:
correct = 0
for x in range(pred.shape[0]):
    if pred[x] == y_test[x]:
        correct += 1
print(correct, y_test.shape[0], (correct/y_test.shape[0]))

25140 25368 0.9910122989593189


In [71]:
def process(row,
            classifier):
    if pd.isnull(row['character']):
        a = ast.literal_eval(row['encoding'])
        a = np.array(a)
        pred = classifier.predict(a.reshape(1, -1))
        return pred
    else:
        return np.nan

In [66]:
f = partial(process, classifier=neigh)

In [72]:
tqdm().pandas()
p = df.progress_apply(f, axis=1)
p

0it [00:00, ?it/s]

  0%|          | 0/249536 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [82]:
X_pred = np.array([ast.literal_eval(x) for x in tqdm(df['encoding'], total=df.shape[0])])

  0%|          | 0/249536 [00:00<?, ?it/s]

In [83]:
y_pred = neigh.predict(X_pred)
y_pred

array([17,  9, 19, ..., 65, 45, 28])

In [85]:
num_to_class = {v: k for k, v in classes.items()}
df['pred'] = [num_to_class[x] for x in y_pred]
df.head()

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,filename,character,cast_id,pred
0,1811020,192,0,798,196,916,314,1080,1920,13924,0.007,1,1,"[-0.1581144630908966, 0.11875572055578232, 0.0...",,,,Haissam Haqqani
1,1811020,216,0,1068,389,1491,812,1080,1920,178929,0.086,1,1,"[-0.035924218595027924, 0.09460100531578064, 0...",,,,Dar Adal
2,1811020,216,1,912,17,1205,311,1080,1920,86142,0.042,1,1,"[-0.10312943160533905, 0.1261938065290451, 0.0...",,,,Max Piotrowski
3,1811020,240,0,1090,106,1384,400,1080,1920,86436,0.042,1,1,"[-0.18763160705566406, 0.08110883831977844, 0....",,,,Aayan Ibrahim
4,1811020,240,1,674,522,968,816,1080,1920,86436,0.042,1,1,"[-0.11751651763916016, 0.1057925745844841, 0.0...",,,,Etai Luskin


In [92]:
df.to_csv(r'./data/predictions.csv')

In [99]:
def format_filepaths(row):
    e = str(row['episode'])
    s = str(row['season'])
    frame_num = str(row['frame_num'])
    face_num = str(row['face_num'])
    name = f'S{s.zfill(2)}E{e.zfill(2)}_{frame_num}_{face_num}.png'
    return name

In [100]:
names = df.apply(format_filepaths, axis=1)

In [101]:
temp = df.copy()
temp['fp'] = names
temp.head()

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,filename,character,cast_id,pred,fp
0,1811020,192,0,798,196,916,314,1080,1920,13924,0.007,1,1,"[-0.1581144630908966, 0.11875572055578232, 0.0...",,,,Haissam Haqqani,S01E01_192_0.png
1,1811020,216,0,1068,389,1491,812,1080,1920,178929,0.086,1,1,"[-0.035924218595027924, 0.09460100531578064, 0...",,,,Dar Adal,S01E01_216_0.png
2,1811020,216,1,912,17,1205,311,1080,1920,86142,0.042,1,1,"[-0.10312943160533905, 0.1261938065290451, 0.0...",,,,Max Piotrowski,S01E01_216_1.png
3,1811020,240,0,1090,106,1384,400,1080,1920,86436,0.042,1,1,"[-0.18763160705566406, 0.08110883831977844, 0....",,,,Aayan Ibrahim,S01E01_240_0.png
4,1811020,240,1,674,522,968,816,1080,1920,86436,0.042,1,1,"[-0.11751651763916016, 0.1057925745844841, 0.0...",,,,Etai Luskin,S01E01_240_1.png


In [104]:
df['filename'] = names
df

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,filename,character,cast_id,pred
0,1811020,192,0,798,196,916,314,1080,1920,13924,0.007,1,1,"[-0.1581144630908966, 0.11875572055578232, 0.0...",S01E01_192_0.png,,,Haissam Haqqani
1,1811020,216,0,1068,389,1491,812,1080,1920,178929,0.086,1,1,"[-0.035924218595027924, 0.09460100531578064, 0...",S01E01_216_0.png,,,Dar Adal
2,1811020,216,1,912,17,1205,311,1080,1920,86142,0.042,1,1,"[-0.10312943160533905, 0.1261938065290451, 0.0...",S01E01_216_1.png,,,Max Piotrowski
3,1811020,240,0,1090,106,1384,400,1080,1920,86436,0.042,1,1,"[-0.18763160705566406, 0.08110883831977844, 0....",S01E01_240_0.png,,,Aayan Ibrahim
4,1811020,240,1,674,522,968,816,1080,1920,86436,0.042,1,1,"[-0.11751651763916016, 0.1057925745844841, 0.0...",S01E01_240_1.png,,,Etai Luskin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249531,7126670,95544,1,880,280,1125,525,1080,1920,60025,0.029,8,12,"[-0.2143310308456421, 0.11155920475721359, 0.2...",S08E12_95544_1.png,,,Sekou Bah
249532,7126670,95544,2,1209,403,1503,697,1080,1920,86436,0.042,8,12,"[-0.07177435606718063, 0.16331857442855835, 0....",S08E12_95544_2.png,,,Saul Berenson
249533,7126670,95568,0,360,280,605,525,1080,1920,60025,0.029,8,12,"[-0.07891353964805603, 0.03790706396102905, 0....",S08E12_95568_0.png,,,Andrew Lockhart
249534,7126670,95568,1,880,280,1125,525,1080,1920,60025,0.029,8,12,"[-0.21665415167808533, 0.11300860345363617, 0....",S08E12_95568_1.png,,,Sekou Bah


In [105]:
temp_df = df[df['character'].isna()]
temp_df

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,filename,character,cast_id,pred
0,1811020,192,0,798,196,916,314,1080,1920,13924,0.007,1,1,"[-0.1581144630908966, 0.11875572055578232, 0.0...",S01E01_192_0.png,,,Haissam Haqqani
1,1811020,216,0,1068,389,1491,812,1080,1920,178929,0.086,1,1,"[-0.035924218595027924, 0.09460100531578064, 0...",S01E01_216_0.png,,,Dar Adal
2,1811020,216,1,912,17,1205,311,1080,1920,86142,0.042,1,1,"[-0.10312943160533905, 0.1261938065290451, 0.0...",S01E01_216_1.png,,,Max Piotrowski
3,1811020,240,0,1090,106,1384,400,1080,1920,86436,0.042,1,1,"[-0.18763160705566406, 0.08110883831977844, 0....",S01E01_240_0.png,,,Aayan Ibrahim
4,1811020,240,1,674,522,968,816,1080,1920,86436,0.042,1,1,"[-0.11751651763916016, 0.1057925745844841, 0.0...",S01E01_240_1.png,,,Etai Luskin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249531,7126670,95544,1,880,280,1125,525,1080,1920,60025,0.029,8,12,"[-0.2143310308456421, 0.11155920475721359, 0.2...",S08E12_95544_1.png,,,Sekou Bah
249532,7126670,95544,2,1209,403,1503,697,1080,1920,86436,0.042,8,12,"[-0.07177435606718063, 0.16331857442855835, 0....",S08E12_95544_2.png,,,Saul Berenson
249533,7126670,95568,0,360,280,605,525,1080,1920,60025,0.029,8,12,"[-0.07891353964805603, 0.03790706396102905, 0....",S08E12_95568_0.png,,,Andrew Lockhart
249534,7126670,95568,1,880,280,1125,525,1080,1920,60025,0.029,8,12,"[-0.21665415167808533, 0.11300860345363617, 0....",S08E12_95568_1.png,,,Sekou Bah


In [129]:
def save_images(row,
                dst):
    dst_dir = dst.joinpath(row['pred'].replace(' ', '_'))
    if not dst_dir.exists():
        Path.mkdir(dst_dir, parents=True)
    fp = dst_dir.joinpath(row['filename']).absolute()
    src = Path('./data/images/').joinpath(row['filename']).absolute()
    shutil.copy(str(src), str(fp))

In [130]:
dst = Path('./data/predictions')
f = partial(save_images, dst=dst)
tqdm().pandas()
df.progress_apply(f, axis=1)

0it [00:00, ?it/s]

  0%|          | 0/249536 [00:00<?, ?it/s]

0         None
1         None
2         None
3         None
4         None
          ... 
249531    None
249532    None
249533    None
249534    None
249535    None
Length: 249536, dtype: object