In [2]:
from pathlib import Path 

import numpy as np
import pandas as pd
import sqlalchemy as db
from tqdm.auto import tqdm
from utils import parse_filename, format_episode_name

In [3]:
username = 'amos'
password = 'M0$hicat'
host = '192.168.0.131'
port = '3306'
database = 'CineFace'

In [4]:
connection_string = f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'
engine = db.create_engine(connection_string)
conn = engine.connect()

In [5]:
encodings_dir = Path('./data/encodings')
subdirs = [x for x in encodings_dir.iterdir()]
data = []
for subdir in tqdm(subdirs, leave=True):
    files = [x for x in subdir.iterdir() if x.suffix == '.npy']
    for file in tqdm(files, leave=False):
        e = np.load(str(file)).round(8)
        row = parse_filename({'filepath': str(file)})
        datum = {'filepath': str(file),
                 'encoding': e}
        datum = {**datum, **{k:v for k,v in row.items() if k not in datum.keys()}}
        data.append(datum)
df = pd.DataFrame(data)
df.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/249535 [00:00<?, ?it/s]

  0%|          | 0/462487 [00:00<?, ?it/s]

Unnamed: 0,filepath,encoding,series_id,season,episode,frame_num,face_num
0,data/encodings/homeland_2011_796960/S05E04_390...,"[-0.11495647, 0.09172723, 0.04678859, -0.03902...",796960,5,4,39048,0
1,data/encodings/homeland_2011_796960/S06E05_389...,"[-0.02626337, 0.03598316, 0.01273727, -0.06370...",796960,6,5,38976,1
2,data/encodings/homeland_2011_796960/S01E01_429...,"[-0.20737375, 0.15791413, 0.0521762, -0.079184...",796960,1,1,4296,0
3,data/encodings/homeland_2011_796960/S03E07_562...,"[-0.07795181, 0.15547886, 0.15513685, -0.04811...",796960,3,7,56208,0
4,data/encodings/homeland_2011_796960/S07E04_111...,"[-0.12568267, 0.18408981, 0.01248703, -0.04678...",796960,7,4,11112,0


In [6]:
df['frame_num'] = df['frame_num'].astype(int)
df['face_num'] = df['face_num'].astype(int)

In [8]:
faces_df = pd.read_sql_query('SELECT * FROM faces_bk;', conn)
faces_df.head()

Unnamed: 0,series_id,episode_id,season,episode,frame_num,face_num,img_height,img_width,x1,y1,x2,y2,area,pct_of_frame,encoding
0,412142,606035,1,1,192,0,1080,1920,355,279,707,632,124256,0.06,
1,412142,606035,1,1,216,0,1080,1920,734,285,1027,578,85849,0.041,
2,412142,606035,1,1,216,1,1080,1920,71,154,213,296,20164,0.01,
3,412142,606035,1,1,216,2,1080,1920,355,216,525,385,28730,0.014,
4,412142,606035,1,1,240,0,1080,1920,640,208,992,560,123904,0.06,


In [9]:
combined = faces_df.drop('encoding', axis=1).merge(df[['series_id', 'season', 'episode', 'frame_num', 'face_num', 'filepath']],
                    how='left',
                    on=['series_id', 'season', 'episode', 'frame_num', 'face_num'])
combined.head()

Unnamed: 0,series_id,episode_id,season,episode,frame_num,face_num,img_height,img_width,x1,y1,x2,y2,area,pct_of_frame,filepath
0,412142,606035,1,1,192,0,1080,1920,355,279,707,632,124256,0.06,data/encodings/house_2004_0412142/S01E01_192_0...
1,412142,606035,1,1,216,0,1080,1920,734,285,1027,578,85849,0.041,data/encodings/house_2004_0412142/S01E01_216_0...
2,412142,606035,1,1,216,1,1080,1920,71,154,213,296,20164,0.01,data/encodings/house_2004_0412142/S01E01_216_1...
3,412142,606035,1,1,216,2,1080,1920,355,216,525,385,28730,0.014,data/encodings/house_2004_0412142/S01E01_216_2...
4,412142,606035,1,1,240,0,1080,1920,640,208,992,560,123904,0.06,data/encodings/house_2004_0412142/S01E01_240_0...


In [10]:
episode_df = pd.read_sql_query('SELECT episode_id, title, year FROM episodes;', conn)
episode_df.head()

Unnamed: 0,episode_id,title,year
0,536514,After the Ball Is Over,2003.0
1,536515,"Alamogordo, N.M.",2005.0
2,536516,Babylon,2003.0
3,536517,Black Blizzard,2003.0
4,536518,"Cheyenne, WY",2005.0


In [11]:
series_df = pd.read_sql_query('SELECT series_id, title, year FROM series;', conn)
series_df.head()

Unnamed: 0,series_id,title,year
0,412142,House,2004
1,1358522,White Collar,2009
2,1442437,Modern Family,2009
3,1632701,Suits,2011
4,1796960,Homeland,2011


In [12]:
def format_series_name(data,
                       id_col='imdbID'):
    title = data['title'].lower().replace(' ', '-')
    year = data['year']
    imdb_id = data[id_col]
    name = f'{title}_{int(year)}_{int(imdb_id)}'
    return name

In [14]:
combined = combined.rename({'filepath': 'encoding_path'}, axis=1)
eph = combined.assign(confidence=np.nan)
eph = eph.merge(series_df,
                how='left',
                on='series_id')
series_ids = eph['series_id'].unique()
dst_dir = Path('./data/faces_new/').absolute()
try:
    Path.mkdir(dst_dir)
except:
    pass
for series_id in tqdm(series_ids, leave=True):
    series = eph[eph['series_id'] == series_id]
    row = series.iloc[0]
    name = format_series_name(row, id_col='series_id')
    d = dst_dir.joinpath(name)
    try:
        Path.mkdir(d)
    except: 
        pass
    episode_ids = series['episode_id'].unique()
    for episode_id in tqdm(episode_ids, leave=False):
        temp = series[series['episode_id'] == episode_id]
        r = temp.iloc[0]
        fp = d.joinpath(f'{format_episode_name(r)}.csv')  
        temp = temp.drop(['title', 'year'], axis=1) 
        temp.to_csv(str(fp))     

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/176 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [145]:
eph.head()

Unnamed: 0,series_id,episode_id,season,episode,frame_num,face_num,img_height,img_width,x1,y1,x2,y2,area,pct_of_frame,encoding,confidence,title,year
0,412142,606035,1,1,192,0,1080,1920,355,279,707,632,124256,0.06,"[-0.10115475207567215, 0.11778414249420166, 0....",,Pilot,2004.0
1,412142,606035,1,1,216,0,1080,1920,734,285,1027,578,85849,0.041,"[-0.08282647281885147, 0.10644861310720444, 0....",,Pilot,2004.0
2,412142,606035,1,1,216,1,1080,1920,71,154,213,296,20164,0.01,"[-0.18955311179161072, 0.09923666715621948, 0....",,Pilot,2004.0
3,412142,606035,1,1,216,2,1080,1920,355,216,525,385,28730,0.014,"[-0.16009938716888428, 0.13064269721508026, 0....",,Pilot,2004.0
4,412142,606035,1,1,240,0,1080,1920,640,208,992,560,123904,0.06,"[-0.09131350368261337, -0.009009137749671936, ...",,Pilot,2004.0


In [105]:
base_dir = Path('./data/faces_new')
subdirs = [x for x in base_dir.iterdir()]
for subdir in subdirs:
    files = [x for x in subdir.iterdir()]
    for file in files:
        face_df = pd.read_csv(str(file), index_col=0)
        

PosixPath('/home/amos/programs/CineFace/data/faces_new/across-the-universe_2022_8879940')

In [3]:
temp = pd.read_csv('./data/faces_new/house_2004_412142/S01E02.csv', index_col=0)
temp.head()

Unnamed: 0,series_id,episode_id,season,episode,frame_num,face_num,img_height,img_width,x1,y1,x2,y2,area,pct_of_frame,encoding,confidence
2276,412142,606034,1,2,312,0,1080,1920,1146,183,1288,325,20164,0.01,[-0.12390631 0.07614616 0.11678329 -0.099798...,
2277,412142,606034,1,2,312,1,1080,1920,647,130,817,299,28730,0.014,[-4.27038521e-02 1.74362659e-01 -6.98575750e-...,
2278,412142,606034,1,2,312,2,1080,1920,1628,198,1797,368,28730,0.014,[-0.0387816 0.11706308 -0.03418145 -0.093032...,
2279,412142,606034,1,2,336,0,1080,1920,1180,233,1350,403,28900,0.014,[-8.56636316e-02 4.53053080e-02 1.19709015e-...,
2280,412142,606034,1,2,336,1,1080,1920,596,147,766,317,28900,0.014,[-3.18677612e-02 1.11289278e-01 -9.70213488e-...,


In [16]:
for idx, row in temp[:1].iterrows():
    encoding = np.fromstring(row['encoding'], dtype=np.uint8) \
        if not pd.isnull(row['encoding']) else np.nan
    


  encoding = np.fromstring(row['encoding'], dtype=np.uint8) \


In [6]:
row['encoding']

'[-0.12390631  0.07614616  0.11678329 -0.0997985  -0.08773655 -0.04309042\n  0.06076267 -0.14244014  0.14825246 -0.05276699  0.20215906 -0.0340088\n -0.26949546 -0.01933828 -0.0514079   0.12568706 -0.22719221 -0.19105817\n -0.09496498 -0.01995986  0.04524636  0.04760112  0.01201146  0.11922956\n -0.19961846 -0.20980765 -0.02525483 -0.12980998  0.06765667 -0.01994014\n  0.04463937  0.07590622 -0.18922977  0.01173784  0.06832031  0.09709262\n -0.04466827 -0.16053069  0.20023614 -0.02406869 -0.27431226 -0.09787033\n  0.02818858  0.21709751  0.24486476 -0.0192653  -0.01601221 -0.12815627\n  0.13381644 -0.32533035 -0.00484253  0.22556645  0.00439626  0.0992591\n  0.03565243 -0.19268301  0.11221638  0.1286478  -0.26914591 -0.00612033\n  0.06192154 -0.16558531  0.02380457 -0.07591818  0.11142398  0.02192428\n -0.16445932 -0.11394823  0.17248364 -0.21480589 -0.05593668  0.15134929\n -0.09576987 -0.22296187 -0.23300737 -0.02325273  0.4094032   0.22631662\n -0.14975776  0.07008357 -0.09239125 -0

In [64]:
t = re.sub(r'[^-.0-9\s]', '', row['encoding']).replace('  ', ' ')
t

'-0.12390631 0.07614616 0.11678329 -0.0997985 -0.08773655 -0.04309042\n 0.06076267 -0.14244014 0.14825246 -0.05276699 0.20215906 -0.0340088\n -0.26949546 -0.01933828 -0.0514079  0.12568706 -0.22719221 -0.19105817\n -0.09496498 -0.01995986 0.04524636 0.04760112 0.01201146 0.11922956\n -0.19961846 -0.20980765 -0.02525483 -0.12980998 0.06765667 -0.01994014\n 0.04463937 0.07590622 -0.18922977 0.01173784 0.06832031 0.09709262\n -0.04466827 -0.16053069 0.20023614 -0.02406869 -0.27431226 -0.09787033\n 0.02818858 0.21709751 0.24486476 -0.0192653 -0.01601221 -0.12815627\n 0.13381644 -0.32533035 -0.00484253 0.22556645 0.00439626 0.0992591\n 0.03565243 -0.19268301 0.11221638 0.1286478 -0.26914591 -0.00612033\n 0.06192154 -0.16558531 0.02380457 -0.07591818 0.11142398 0.02192428\n -0.16445932 -0.11394823 0.17248364 -0.21480589 -0.05593668 0.15134929\n -0.09576987 -0.22296187 -0.23300737 -0.02325273 0.4094032  0.22631662\n -0.14975776 0.07008357 -0.09239125 -0.00525293 0.06747864 0.15845674\n 0.0111

In [65]:
a = np.fromstring(re.sub(r'[^-.0-9\s]', '', row['encoding']))

  a = np.fromstring(re.sub(r'[^-.0-9\s]', '', row['encoding']))


ValueError: string size must be a multiple of element size

In [28]:
a

array([], dtype=float64)

In [61]:
import re 

re.sub(r'[^-.0-9\s]', '', row['encoding']).replace('  ', ' ')

'-0.12390631 0.07614616 0.11678329 -0.0997985 -0.08773655 -0.04309042\n 0.06076267 -0.14244014 0.14825246 -0.05276699 0.20215906 -0.0340088\n -0.26949546 -0.01933828 -0.0514079  0.12568706 -0.22719221 -0.19105817\n -0.09496498 -0.01995986 0.04524636 0.04760112 0.01201146 0.11922956\n -0.19961846 -0.20980765 -0.02525483 -0.12980998 0.06765667 -0.01994014\n 0.04463937 0.07590622 -0.18922977 0.01173784 0.06832031 0.09709262\n -0.04466827 -0.16053069 0.20023614 -0.02406869 -0.27431226 -0.09787033\n 0.02818858 0.21709751 0.24486476 -0.0192653 -0.01601221 -0.12815627\n 0.13381644 -0.32533035 -0.00484253 0.22556645 0.00439626 0.0992591\n 0.03565243 -0.19268301 0.11221638 0.1286478 -0.26914591 -0.00612033\n 0.06192154 -0.16558531 0.02380457 -0.07591818 0.11142398 0.02192428\n -0.16445932 -0.11394823 0.17248364 -0.21480589 -0.05593668 0.15134929\n -0.09576987 -0.22296187 -0.23300737 -0.02325273 0.4094032  0.22631662\n -0.14975776 0.07008357 -0.09239125 -0.00525293 0.06747864 0.15845674\n 0.0111

In [48]:
row['encoding']

'[-0.12390631  0.07614616  0.11678329 -0.0997985  -0.08773655 -0.04309042\n  0.06076267 -0.14244014  0.14825246 -0.05276699  0.20215906 -0.0340088\n -0.26949546 -0.01933828 -0.0514079   0.12568706 -0.22719221 -0.19105817\n -0.09496498 -0.01995986  0.04524636  0.04760112  0.01201146  0.11922956\n -0.19961846 -0.20980765 -0.02525483 -0.12980998  0.06765667 -0.01994014\n  0.04463937  0.07590622 -0.18922977  0.01173784  0.06832031  0.09709262\n -0.04466827 -0.16053069  0.20023614 -0.02406869 -0.27431226 -0.09787033\n  0.02818858  0.21709751  0.24486476 -0.0192653  -0.01601221 -0.12815627\n  0.13381644 -0.32533035 -0.00484253  0.22556645  0.00439626  0.0992591\n  0.03565243 -0.19268301  0.11221638  0.1286478  -0.26914591 -0.00612033\n  0.06192154 -0.16558531  0.02380457 -0.07591818  0.11142398  0.02192428\n -0.16445932 -0.11394823  0.17248364 -0.21480589 -0.05593668  0.15134929\n -0.09576987 -0.22296187 -0.23300737 -0.02325273  0.4094032   0.22631662\n -0.14975776  0.07008357 -0.09239125 -0

In [67]:
temp = pd.read_csv(str(file), index_col=0)
e = temp.iloc['encoding']
np.array2string(e)

NameError: name 'file' is not defined

In [4]:
np.fromstring('[-0.143 .0456 .0332]')

  np.fromstring('[-0.143 .0456 .0332]')


ValueError: string size must be a multiple of element size