In [85]:
import ast
import shutil
from pathlib import Path 

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [69]:
training_dir = Path('./data/train')
class_dirs = list(sorted([x for x in training_dir.iterdir()]))
classes = {x.parts[-1]: num for num, x in enumerate(class_dirs)}
num_to_class = {v: k for k, v in classes.items()}
classes

{'brody': 0,
 'brody-son': 1,
 'carrie': 2,
 'dana': 3,
 'estes': 4,
 'max': 5,
 'mike': 6,
 'mrs-brody': 7,
 'mrs-walker': 8,
 'saul': 9,
 'virgil': 10,
 'walden': 11}

In [22]:
face_df = pd.read_csv('./data/faces.csv', index_col=0)
face_df.head()

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding
0,1811020,192,0,798,196,916,314,1080,1920,13924,0.007,1,1,"[-0.1581144630908966, 0.11875572055578232, 0.0..."
1,1811020,216,0,1068,389,1491,812,1080,1920,178929,0.086,1,1,"[-0.035924218595027924, 0.09460100531578064, 0..."
2,1811020,216,1,912,17,1205,311,1080,1920,86142,0.042,1,1,"[-0.10312943160533905, 0.1261938065290451, 0.0..."
3,1811020,240,0,1090,106,1384,400,1080,1920,86436,0.042,1,1,"[-0.18763160705566406, 0.08110883831977844, 0...."
4,1811020,240,1,674,522,968,816,1080,1920,86436,0.042,1,1,"[-0.11751651763916016, 0.1057925745844841, 0.0..."


In [23]:
files = []
data = []
for subdir in class_dirs:
    files = [x for x in subdir.iterdir()]
    for file in files:
        temp, frame_num, face_num = file.stem.split('_')
        s = int(temp[1:3])
        e = int(temp[4:6])
        datum = {'fp': str(file.absolute().resolve()),
                 'season': s,
                 'episode': e,
                 'frame_num': int(frame_num),
                 'face_num': int(face_num),
                 'class': classes[subdir.parts[-1]]
            }
        data.append(datum)
class_df = pd.DataFrame(data)
class_df.head()

Unnamed: 0,fp,season,episode,frame_num,face_num,class
0,/home/amos/programs/FacesOfHomeland/data/train...,1,1,54120,0,0
1,/home/amos/programs/FacesOfHomeland/data/train...,1,1,48840,0,0
2,/home/amos/programs/FacesOfHomeland/data/train...,1,1,49896,0,0
3,/home/amos/programs/FacesOfHomeland/data/train...,1,1,46776,0,0
4,/home/amos/programs/FacesOfHomeland/data/train...,1,1,27192,0,0


In [30]:
df = face_df.merge(class_df,
              on=['season', 'episode', 'frame_num', 'face_num'],
              how='inner'
             )
df.head()

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,fp,class
0,1811020,264,1,775,403,856,485,1080,1920,6642,0.003,1,1,"[-0.04412677139043808, 0.1082349494099617, 0.1...",/home/amos/programs/FacesOfHomeland/data/train...,2
1,1811020,384,0,497,208,849,560,1080,1920,123904,0.06,1,1,"[-0.09047424793243408, 0.08664017170667648, 0....",/home/amos/programs/FacesOfHomeland/data/train...,2
2,1811020,408,0,390,208,742,560,1080,1920,123904,0.06,1,1,"[-0.15323874354362488, 0.10863395780324936, 0....",/home/amos/programs/FacesOfHomeland/data/train...,2
3,1811020,432,0,198,99,807,707,1080,1920,370272,0.179,1,1,"[-0.1460844725370407, 0.09071134030818939, 0.0...",/home/amos/programs/FacesOfHomeland/data/train...,2
4,1811020,456,0,445,160,1053,769,1080,1920,370272,0.179,1,1,"[-0.11816225200891495, 0.12249065190553665, 0....",/home/amos/programs/FacesOfHomeland/data/train...,2


In [38]:
X = np.array([np.array(ast.literal_eval(x)) for x in df['encoding'].tolist()])
y = df['class']
print(X.shape)
print(y.shape)

(2040, 128)
(2040,)


In [41]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
print('x_train: ', x_train.shape)
print('x_test: ', x_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

x_train:  (1632, 128)
x_test:  (408, 128)
y_train:  (1632,)
y_test:  (408,)


In [1]:
neigh = KNeighborsClassifier()
neigh.fit(x_train, y_train)

NameError: name 'KNeighborsClassifier' is not defined

In [44]:
pred = neigh.predict(x_test)
pred

array([ 2,  6,  0,  2,  7,  8,  2,  7,  2,  2,  4,  2,  4,  9,  7,  4, 10,
        7,  1,  2,  2,  2,  2,  2,  2,  2,  9,  0,  0,  2,  0,  3,  2,  7,
        0,  2,  2,  9,  2,  0,  9,  3,  2,  0,  0, 10,  2,  3,  1,  3,  1,
        8,  2,  9,  5,  7,  2,  2,  6,  4,  7,  2,  2,  7,  7,  2,  2,  2,
        3,  0,  0,  3,  6,  0,  7,  2,  2,  7,  9,  9,  1,  2,  9,  0,  2,
        2,  0,  2,  7,  9,  3,  0,  2,  9,  0, 10,  2,  0,  9,  0,  0,  2,
        2, 10,  3,  2,  6,  3,  2,  3,  2,  0,  5,  2,  0,  4,  0,  0,  2,
        2, 11,  0,  7,  2,  2,  3,  4,  2,  3,  3,  6,  4,  8,  3,  1,  0,
        4,  2,  0,  2,  9,  6,  2,  3,  3,  2,  2, 10,  1,  9,  9,  7,  7,
        2,  2,  2,  9,  2,  2,  0,  1,  4,  2,  1,  0,  4,  2,  2,  4,  0,
        8,  2,  2,  2,  2,  2,  3,  0,  0,  2,  2,  2,  2,  2,  2,  2,  0,
        9,  9,  2,  2,  3,  2, 10,  2,  2,  9,  9,  2,  3,  7,  7,  2,  3,
        3,  2,  9,  9,  0, 10,  6,  7,  4,  7, 10,  0,  0,  9,  2,  0,  3,
        3,  0,  2,  0,  7

In [59]:
correct = 0
for x in range(pred.shape[0]):
    if pred[x] == y_test.values[x]:
        correct += 1
print(correct, y_test.shape[0])

408 408


In [64]:
episode_df = face_df[(face_df['season'] == 1) & (face_df['episode'] == 2)]
episode_df.head()

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding
0,1988308,72,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.12935210764408112, 0.09023398905992508, 0...."
1,1988308,96,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.15715254843235016, 0.05274084210395813, 0...."
2,1988308,120,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.12813544273376465, 0.02122906967997551, 0...."
3,1988308,144,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.1354607343673706, 0.04214496165513992, 0.0..."
4,1988308,168,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.14760874211788177, 0.03259202465415001, 0...."


In [65]:
X = np.array([np.array(ast.literal_eval(x)) for x in episode_df['encoding'].tolist()])

In [66]:
new_pred = neigh.predict(X)
new_pred

array([7, 3, 2, ..., 2, 2, 2])

In [71]:
episode_df['pred'] = [num_to_class[x] for x in new_pred]
episode_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  episode_df['pred'] = [num_to_class[x] for x in new_pred]


Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,pred
0,1988308,72,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.12935210764408112, 0.09023398905992508, 0....",mrs-brody
1,1988308,96,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.15715254843235016, 0.05274084210395813, 0....",dana
2,1988308,120,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.12813544273376465, 0.02122906967997551, 0....",carrie
3,1988308,144,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.1354607343673706, 0.04214496165513992, 0.0...",carrie
4,1988308,168,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.14760874211788177, 0.03259202465415001, 0....",carrie


In [79]:
data = []
for file in tqdm([x for x in Path('./data/images').iterdir()]):
    temp, frame_num, face_num = file.stem.split('_')
    s = int(temp[1:3])
    e = int(temp[4:6])
    datum = {'fp': str(file.absolute().resolve()),
             'season': s,
             'episode': e,
             'frame_num': int(frame_num),
             'face_num': int(face_num)
            }
    data.append(datum)
fp_df = pd.DataFrame(data)
fp_df

100%|██████████████████████████████| 249535/249535 [00:08<00:00, 30744.46it/s]


Unnamed: 0,fp,season,episode,frame_num,face_num
0,/home/amos/programs/FacesOfHomeland/data/image...,2,4,54096,0
1,/home/amos/programs/FacesOfHomeland/data/image...,7,1,74904,0
2,/home/amos/programs/FacesOfHomeland/data/image...,3,6,60000,0
3,/home/amos/programs/FacesOfHomeland/data/image...,7,2,74520,0
4,/home/amos/programs/FacesOfHomeland/data/image...,8,4,44112,1
...,...,...,...,...,...
249530,/home/amos/programs/FacesOfHomeland/data/image...,2,5,45456,0
249531,/home/amos/programs/FacesOfHomeland/data/image...,2,4,21360,1
249532,/home/amos/programs/FacesOfHomeland/data/image...,2,8,68184,0
249533,/home/amos/programs/FacesOfHomeland/data/image...,8,5,11304,0


In [80]:
episode_df = episode_df.merge(fp_df,
                              on=['season', 'episode', 'frame_num', 'face_num'],
                              how='inner'
                             )
episode_df

Unnamed: 0,imdb_id,frame_num,face_num,x1,y1,x2,y2,img_height,img_width,area,pct_of_frame,season,episode,encoding,pred,fp
0,1988308,72,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.12935210764408112, 0.09023398905992508, 0....",mrs-brody,/home/amos/programs/FacesOfHomeland/data/image...
1,1988308,96,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.15715254843235016, 0.05274084210395813, 0....",dana,/home/amos/programs/FacesOfHomeland/data/image...
2,1988308,120,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.12813544273376465, 0.02122906967997551, 0....",carrie,/home/amos/programs/FacesOfHomeland/data/image...
3,1988308,144,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.1354607343673706, 0.04214496165513992, 0.0...",carrie,/home/amos/programs/FacesOfHomeland/data/image...
4,1988308,168,0,850,52,1902,1080,1080,1920,1081456,0.522,1,2,"[-0.14760874211788177, 0.03259202465415001, 0....",carrie,/home/amos/programs/FacesOfHomeland/data/image...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2483,1988308,71568,0,597,175,1020,598,1080,1920,178929,0.086,1,2,"[-0.0958666205406189, 0.07539680600166321, 0.0...",carrie,/home/amos/programs/FacesOfHomeland/data/image...
2484,1988308,71592,0,640,208,992,560,1080,1920,123904,0.060,1,2,"[-0.11947602778673172, 0.09725409746170044, 0....",carrie,/home/amos/programs/FacesOfHomeland/data/image...
2485,1988308,71616,0,512,132,934,555,1080,1920,178506,0.086,1,2,"[-0.11342617869377136, 0.10334137827157974, 0....",carrie,/home/amos/programs/FacesOfHomeland/data/image...
2486,1988308,71640,0,426,175,849,598,1080,1920,178929,0.086,1,2,"[-0.09269304573535919, 0.08229314535856247, 0....",carrie,/home/amos/programs/FacesOfHomeland/data/image...


In [86]:
dst = Path('./data/results')
if not dst.exists():
    Path.mkdir(dst)
for idx, row in episode_df.iterrows():
    class_dir = dst.joinpath(row['pred'])
    if not class_dir.exists():
        Path.mkdir(class_dir)
    season = str(row['season'])
    episode = str(row['episode'])
    frame_num = row['frame_num']
    face_num = row['face_num']
    name = f'S{season.zfill(2)}E{episode.zfill(2)}_{frame_num}_{face_num}.png'
    fp = class_dir.joinpath(name)
    shutil.copy(row['fp'], fp)    