In [74]:
import re
import time 
from pathlib import Path 

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from utils import episode_from_name, parse_filename, format_episode_name

In [2]:
%load_ext autoreload
%autoreload 2

In [16]:
def get_encodings(src='./data/encodings'):
    """Load every saved ``.npy`` encoding under ``src`` into one DataFrame.

    ``src`` is scanned one level deep: each sub-directory is searched for
    ``.npy`` files. Every file becomes one row holding its path, the loaded
    array, and any additional fields that ``parse_filename`` returns for it.

    Parameters
    ----------
    src : str or Path, optional
        Root directory to scan. Defaults to ``'./data/encodings'``.

    Returns
    -------
    pandas.DataFrame
        One row per ``.npy`` file with at least the ``filepath`` and
        ``encoding`` columns; an empty frame when no files are found.
    """
    encodings_dir = Path(src)
    # Only descend into directories: a stray top-level file (e.g. .DS_Store)
    # would make iterdir() raise NotADirectoryError.
    subdirs = [x for x in encodings_dir.iterdir() if x.is_dir()]
    data = []
    for subdir in tqdm(subdirs, leave=True):
        files = [x for x in subdir.iterdir() if x.suffix == '.npy']
        for file in tqdm(files, leave=False):
            filepath = str(file)
            datum = {'filepath': filepath, 'encoding': np.load(filepath)}
            row = parse_filename({'filepath': filepath})
            # Parsed fields never overwrite the two keys set above.
            datum.update({k: v for k, v in row.items() if k not in datum})
            data.append(datum)
    return pd.DataFrame(data)

In [17]:
# Load all saved encodings into one DataFrame and preview the first rows.
encoding_df = get_encodings()
encoding_df.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/249535 [00:00<?, ?it/s]

  0%|          | 0/462487 [00:00<?, ?it/s]

Unnamed: 0,filepath,encoding,series_id,season,episode,frame_num,face_num
0,data/encodings/homeland_2011_796960/S05E04_390...,"[-0.11495646834373474, 0.09172723442316055, 0....",796960,5,4,39048,0
1,data/encodings/homeland_2011_796960/S06E05_389...,"[-0.026263369247317314, 0.03598316013813019, 0...",796960,6,5,38976,1
2,data/encodings/homeland_2011_796960/S01E01_429...,"[-0.2073737531900406, 0.15791413187980652, 0.0...",796960,1,1,4296,0
3,data/encodings/homeland_2011_796960/S03E07_562...,"[-0.0779518112540245, 0.15547886490821838, 0.1...",796960,3,7,56208,0
4,data/encodings/homeland_2011_796960/S07E04_111...,"[-0.12568266689777374, 0.1840898096561432, 0.0...",796960,7,4,11112,0


In [26]:
# Cast both frame-level key columns to integers in a single pass.
encoding_df = encoding_df.astype({'frame_num': int, 'face_num': int})

In [84]:
def get_faces(src='./data/faces'):
    """Collect the paths of all entries one level below ``src``.

    Parameters
    ----------
    src : str or Path, optional
        Root directory whose sub-directories are scanned.
        Defaults to ``'./data/faces'``.

    Returns
    -------
    list of pathlib.Path
        Every entry found inside each sub-directory of ``src``, with the
        sub-directories visited in sorted order.
    """
    root = Path(src)
    # Only descend into directories: a stray top-level file (e.g. .DS_Store)
    # would make iterdir() raise NotADirectoryError.
    subdirs = sorted(x for x in root.iterdir() if x.is_dir())
    paths = []
    for subdir in tqdm(subdirs, leave=True):
        paths.extend(subdir.iterdir())
    return paths

In [68]:
def check_empty(src):
    """Report whether the CSV at ``src`` contains at least one data row.

    NOTE: despite its name, this returns ``True`` when the file HAS data and
    ``False`` when it is empty; callers use ``if not check_empty(path)`` to
    detect empty files, so that contract is kept.

    Parameters
    ----------
    src : str or path-like
        Path of the CSV file to probe. Only the first few rows are read.

    Returns
    -------
    bool
        ``False`` for a zero-byte file or a file holding only a header row,
        ``True`` otherwise.
    """
    try:
        df = pd.read_csv(src, index_col=0, header=0, nrows=5, engine='c')
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it as empty instead
        # of letting the exception abort the caller's loop.
        return False
    return not df.empty

In [82]:
# Join each per-episode face CSV with the encoding index and write the result
# to ./data/faces_test/<series_id>/<episode name>.csv.
paths = get_faces()
dst_dir = Path('./data/faces_test')
dst_dir.mkdir(parents=True, exist_ok=True)

# Loop-invariant: only the encoding metadata (file path + join keys) is merged
# in, so drop the heavy 'encoding' column once instead of on every iteration.
encoding_meta = encoding_df.drop('encoding', axis=1)
merge_keys = ['series_id', 'season', 'episode', 'frame_num', 'face_num']

for path in tqdm(paths):
    # The series id is the text after the last '_' in the parent directory name.
    series_id = int(Path(path).parent.parts[-1].split('_')[-1])
    match = re.search(r'S[0-9]{2}E[0-9]{2}', str(path.stem), flags=re.I)
    if match is None:
        # No SxxEyy tag in the file name: report it and stop so it can be inspected.
        print(str(path))
        break
    episode = match.group(0)

    s, e = episode_from_name(episode)
    dst = Path(dst_dir).joinpath(str(series_id))
    name = format_episode_name({'season': s, 'episode': e})
    fp = dst.joinpath(f'{name}.csv')
    if fp.exists():
        # Output already written on an earlier run.
        continue

    dst.mkdir(parents=True, exist_ok=True)

    # check_empty() returns False for a CSV without data rows; such source
    # files are reported and deleted.
    if not check_empty(str(path)):
        print(str(path))
        path.unlink()
        continue

    temp = pd.read_csv(str(path),
                       engine='c',
                       index_col=0)

    # Drop whichever bulky array column is present. `columns=` is required:
    # a bare drop('embedding') targets a row label and raises KeyError.
    temp = temp.drop(columns=['encoding', 'embedding'], errors='ignore')
    temp = temp.assign(season=s, episode=e, series_id=series_id)
    temp = temp.merge(encoding_meta, how='left', on=merge_keys)
    temp = temp.rename({'filepath': 'encoding_path'}, axis=1)

    temp.to_csv(str(fp))

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/1389 [00:00<?, ?it/s]

data/faces/lost-girl_2010_1429449/Lost.Girl.S03E05.Faes.Wide.Shut.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S05E06.Clear.Eyes.Fae.Hearts.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S01E07.ArachnoFaebia.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S02E13.Barometz.Trick.Pressure.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S04E09.Destinys.Child.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S02E03.Scream.a.Little.Dream.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S01E02.Where.Theres.a.Will.Theres.a.Fae.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S03E11.Adventures.in.Fae-bysitting.BluRay.10Bit.1080p.DD5.1.H265-d3g.csv
data/faces/lost-girl_2010_1429449/Lost.Girl.S02E17.The.Girl.Who.Faed.With.Fire.BluRay.10Bit.1080p.DD5.1.H265-d3g.cs

In [29]:
# Load the first face CSV in `paths` and preview its leading rows.
path = paths[0]
print(path)
df = pd.read_csv(str(path), index_col=0)
df.head()

data/faces/a-murder-at-the-end-of-the-world_2023_15227418/A.Murder.at.the.End.of.the.World.S01E05.Crypt.2160p.HULU.WEB-DL.DDP5.1.HEVC-CMRG.csv


Unnamed: 0,img_height,img_width,face_num,x1,x2,y1,y2,area,pct_of_frame,embedding,confidence,frame_num,video_src
0,607,1080,0,428,83,338,428,31050,0.047,[[[0.5058824 0.49019608 0.47058824]\n [0.501...,5.363363,504,/home/amos/media/tv/a_murder_at_the_end_of_the...
1,607,1080,0,460,70,326,460,52260,0.08,[[[0.38039216 0.3882353 0.42745098]\n [0.384...,8.964681,528,/home/amos/media/tv/a_murder_at_the_end_of_the...
2,607,1080,0,144,179,643,144,17465,0.027,[[[0.29803923 0.32156864 0.3254902 ]\n [0.250...,7.455532,576,/home/amos/media/tv/a_murder_at_the_end_of_the...
3,607,1080,0,169,122,588,169,19693,0.03,[[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]\n .....,7.531884,624,/home/amos/media/tv/a_murder_at_the_end_of_the...
4,607,1080,1,131,283,323,131,29184,0.045,[[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]\n .....,8.216904,624,/home/amos/media/tv/a_murder_at_the_end_of_the...


In [30]:
# Pull the SxxEyy tag out of the first path.
episode_pattern = re.compile(r'S[0-9]{2}E[0-9]{2}')
episode = episode_pattern.search(str(paths[0])).group(0)
episode

'S01E05'

In [34]:
# The series id is the text after the last '_' in the parent directory name.
parent_dir_name = Path(path).parent.parts[-1]
series_id = int(parent_dir_name.split('_')[-1])
series_id

15227418

In [35]:
# Split the episode tag into its two parts and stamp them, together with the
# series id, onto every row so `df` carries the columns used as merge keys.
s, e = episode_from_name(episode)
df = df.assign(season=s, episode=e, series_id=series_id)


In [36]:
# Left-join the face rows onto the encoding index so unmatched faces are kept.
merge_keys = ['series_id', 'season', 'episode', 'frame_num', 'face_num']
temp = df.merge(encoding_df, how='left', on=merge_keys)
temp.head()

Unnamed: 0,img_height,img_width,face_num,x1,x2,y1,y2,area,pct_of_frame,embedding,confidence,frame_num,video_src,season,episode,series_id,filepath,encoding
0,607,1080,0,428,83,338,428,31050,0.047,[[[0.5058824 0.49019608 0.47058824]\n [0.501...,5.363363,504,/home/amos/media/tv/a_murder_at_the_end_of_the...,1,5,15227418,,
1,607,1080,0,460,70,326,460,52260,0.08,[[[0.38039216 0.3882353 0.42745098]\n [0.384...,8.964681,528,/home/amos/media/tv/a_murder_at_the_end_of_the...,1,5,15227418,,
2,607,1080,0,144,179,643,144,17465,0.027,[[[0.29803923 0.32156864 0.3254902 ]\n [0.250...,7.455532,576,/home/amos/media/tv/a_murder_at_the_end_of_the...,1,5,15227418,,
3,607,1080,0,169,122,588,169,19693,0.03,[[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]\n .....,7.531884,624,/home/amos/media/tv/a_murder_at_the_end_of_the...,1,5,15227418,,
4,607,1080,1,131,283,323,131,29184,0.045,[[[0. 0. 0.]\n [0. 0. 0.]\n [0. 0. 0.]\n .....,8.216904,624,/home/amos/media/tv/a_murder_at_the_end_of_the...,1,5,15227418,,


In [27]:
# Show both dtype listings, separated by a blank line, to compare the key columns.
print(df.dtypes, encoding_df.dtypes, sep='\n\n')

img_height        int64
img_width         int64
face_num          int64
x1                int64
x2                int64
y1                int64
y2                int64
area              int64
pct_of_frame    float64
embedding        object
confidence      float64
frame_num         int64
video_src        object
season            int64
episode           int64
dtype: object

filepath     object
encoding     object
series_id     int64
season        int64
episode       int64
frame_num     int64
face_num      int64
dtype: object


In [14]:
# Inspect the raw 'embedding' cell of the first row.
# NOTE(review): the displayed value is a string containing '...' elisions, so
# it looks like a truncated array repr rather than recoverable data — confirm.
embedding = df.iloc[0]['embedding']
embedding

'[[[0.5058824  0.49019608 0.47058824]\n  [0.5019608  0.4862745  0.47058824]\n  [0.49803922 0.47843137 0.47058824]\n  ...\n  [0.45490196 0.4392157  0.43137255]\n  [0.49411765 0.47843137 0.45882353]\n  [0.         0.         0.        ]]\n\n [[0.5176471  0.5019608  0.47843137]\n  [0.5176471  0.49803922 0.47843137]\n  [0.5137255  0.49411765 0.48235294]\n  ...\n  [0.44705883 0.43529412 0.4392157 ]\n  [0.49803922 0.48235294 0.47058824]\n  [0.         0.         0.        ]]\n\n [[0.5294118  0.5058824  0.48235294]\n  [0.53333336 0.5058824  0.4862745 ]\n  [0.52156866 0.5058824  0.4862745 ]\n  ...\n  [0.43137255 0.42745098 0.4392157 ]\n  [0.48235294 0.4745098  0.47058824]\n  [0.         0.         0.        ]]\n\n ...\n\n [[0.12156863 0.10980392 0.1764706 ]\n  [0.13333334 0.13725491 0.23529412]\n  [0.1254902  0.14117648 0.23137255]\n  ...\n  [0.05882353 0.07450981 0.19607843]\n  [0.05490196 0.06666667 0.18039216]\n  [0.         0.         0.        ]]\n\n [[0.1254902  0.12156863 0.19215687]\n 

In [49]:
# NOTE(review): `files` is not assigned at notebook scope in any visible cell,
# so this line depends on leftover kernel state — confirm its origin or remove.
print(len(files))

81


In [92]:
# Delete every CSV under ./data/faces_test whose file name does not contain
# the series_id stored in its own rows.
paths = get_faces('./data/faces_test')
for path in tqdm(paths):
    # Only the first data row is needed to read the series id.
    df = pd.read_csv(str(path), index_col=0, nrows=1)
    if df.empty:
        # No row to compare against: report it instead of raising IndexError
        # or deleting the file blindly.
        print('no data', str(path))
        continue
    # Index the column first so the value keeps the column's own dtype;
    # selecting the row first can upcast an integer id to float ('123.0'),
    # which would never match the file name.
    series_id = df['series_id'].iloc[0]
    if str(series_id) not in path.stem:
        path.unlink()

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/1928 [00:00<?, ?it/s]

In [97]:
# Write the first 25 rows of one face CSV to ./data/test as a small sample file.
src = './data/faces/1796960/1796960_S01E01.csv'
dst = './data/test/1796960/1796960_S01E01.csv'
# to_csv does not create missing directories, so make sure the target exists.
Path(dst).parent.mkdir(parents=True, exist_ok=True)
df = pd.read_csv(src, index_col=0)
df = df.iloc[:25]
df.to_csv(dst)

In [98]:
import cv2

ModuleNotFoundError: No module named 'cv2'