## This notebook includes

1. use of the deepface library to extract visual features from frames extracted from a video segment
2. construction of a feature set for each segment in each episode of each season in the dataset
3. construction of a visual feature dataset by combining features and label information.

## Install libraries

In [2]:
#install deepface
!pip install deepface

Collecting deepface
  Downloading deepface-0.0.79-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting Flask>=1.1.2
  Downloading flask-3.0.0-py3-none-any.whl (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.7/99.7 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting mtcnn>=0.1.0
  Downloading mtcnn-0.1.1-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting fire>=0.4.0
  Downloading fire-0.5.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting retina-face>=0.0.1
  Downloading retina_face-0.0.13-py3-none-any.whl (16 kB)
Collecting gunicorn>=20.1.0
  Downloading gunicorn-21.2.0-py3-none-any.wh

## Import libraries

In [4]:
#import libraries
from deepface import DeepFace
import os
import numpy as np
import pandas as pd 

In [5]:
# Model names accepted by DeepFace, kept in a fixed order so that a model can
# be picked by position (e.g. models[3]). The trailing comment on each entry
# is the embedding size of that model.
models = [
    "VGG-Face",     # 2622
    "Facenet",      # 128
    "Facenet512",   # 512
    "OpenFace",     # 128
    "DeepFace",     # 4096
    "DeepID",       # 160
    "ArcFace",      # 512
    "Dlib",         # N/A
    "SFace",        # 128
]

In [6]:
# function to store extracted features in a csv format

def _visible_subdirs(path):
    '''Return the sorted names of the non-hidden sub-directories of `path`.'''
    return sorted(
        name for name in os.listdir(path)
        if not name.startswith('.') and os.path.isdir(os.path.join(path, name))
    )


def _embedding_vector(representation):
    '''
    Turn the value returned by DeepFace.represent into a flat float array.

    Depending on the installed deepface version the call returns either a plain
    list of floats, or a list of dicts (one per detected face) that each carry
    the vector under the 'embedding' key. Both shapes are accepted; when several
    faces are reported only the first one is used.
    '''
    if isinstance(representation, dict):
        representation = [representation]
    if len(representation) > 0 and isinstance(representation[0], dict):
        representation = representation[0]['embedding']
    return np.asarray(representation, dtype=float)


def get_features(root, season, feature_columns, model_name=None, output_dir='/notebooks'):

    '''
    Extract one mean embedding per segment of a season and save them as a csv.

    The directory layout walked is root/season/episode/segment/*.jpg. Every
    '.jpg' frame of a segment is embedded with DeepFace, the embeddings are
    averaged, and the result becomes one row (indexed by the segment folder
    name) of 'visual_features_<season>.csv' inside `output_dir`.

    parameters:
    root: directory that contains one sub-folder per season
    season: the corresponding season number (s01--s14), this dataset has 14 seasons
    feature_columns: column names for the features; their count must equal the
        embedding dimension of the chosen model
    model_name: DeepFace model to use; when None the notebook-level variable
        `model` is used (the behaviour of the original function)
    output_dir: directory in which the csv file is written

    outputs:
    the DataFrame that was written to disk (segments x features)
    '''
    if model_name is None:
        # backward compatibility: earlier versions always read the global `model`
        model_name = model

    season_dir = os.path.join(root, season)
    rows = {}   # segment id -> mean embedding of that segment

    # get each segment from each episode
    for episode in _visible_subdirs(season_dir):
        episode_dir = os.path.join(season_dir, episode)

        for seg_id in _visible_subdirs(episode_dir):
            segment_dir = os.path.join(episode_dir, seg_id)
            frames = sorted(f for f in os.listdir(segment_dir) if f.endswith('.jpg'))

            # apply deep face models to get visual features
            embeddings = []
            for frame in frames:
                representation = DeepFace.represent(
                    img_path = os.path.join(segment_dir, frame),
                    model_name = model_name,
                    enforce_detection = False,
                )
                vector = _embedding_vector(representation)
                if vector.size:
                    embeddings.append(vector)

            # a segment without usable frames has no mean; skip it instead of
            # failing on the mean of an empty array
            if not embeddings:
                continue

            # feature value for the whole segment is the mean over its frames
            rows[seg_id] = np.mean(embeddings, axis = 0)

    # build the dataframe once instead of growing it row by row
    values = np.vstack(list(rows.values())) if rows else None
    df = pd.DataFrame(values, index = list(rows.keys()), columns = feature_columns)

    #save dataframe of features
    df.to_csv(os.path.join(output_dir, 'visual_features_{}.csv'.format(season)))
    return df

## Example for extracting features using deepface library

In [None]:
# DeepFace model used for the extraction: index 3 of `models` is OpenFace
model = models[3]
# top-level folder holding the frames extracted from the videos
root = '/notebooks/frames/'

# one column name per embedding dimension; adjust the range when the chosen
# model has a different embedding size
feature_columns = list(map('feature_{}'.format, range(1, 129)))

# every non-hidden entry of the root folder is treated as a season
get_seasons = sorted(filter(lambda name: not name.startswith('.'), os.listdir(root)))
print(get_seasons)

# one csv file of features is generated per season
for season in get_seasons:
    print(season)

    # image directory, season number and feature column names drive the extraction
    get_features(root, season, feature_columns)

## Prepare dataset

In [14]:
#modifying feature csv for metadata
def prepare_csv(file, directory = '/notebooks/'):
    '''
    Load one per-season feature csv and expose the segment id as an 'id' column.

    parameters:
    file: name of the csv file corresponding to one season containing features
    directory: folder in which `file` is located

    outputs:
    features: DataFrame with a default integer index whose first column 'id'
        holds the segment ids (underscores replaced by hyphens so they match
        the ids used in the metadata), followed by the feature columns
    '''
    features = pd.read_csv(os.path.join(directory, file), index_col = [0])

    # this is to make data consistent with the ids used in metadata
    ids = [str(segment_id).replace('_', '-') for segment_id in features.index]

    # the original segment index is dropped and replaced by the 'id' column,
    # which is then moved to the front
    features = features.reset_index(drop = True)
    features['id'] = ids
    features.insert(0, 'id', features.pop('id'))

    return features
    

# concatenating all feature files to create a single dataset
# only csv files whose name starts with 'visual' are used; they are sorted so
# the row order of the result does not depend on the directory listing order
feature_files = sorted(
    f for f in os.listdir('/notebooks/')
    if f.startswith('visual') and f.endswith('.csv')
)
if not feature_files:
    raise FileNotFoundError("no 'visual*.csv' feature files found in /notebooks/")

# data contains feature information from all segments
data = pd.concat([prepare_csv(file) for file in feature_files], ignore_index = True)

# get labels from regular segments
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load this
# file from a trusted source
dataset = np.load("regularOnly.npy", allow_pickle = True)
# NOTE(review): two columns are removed by position; the second index is applied
# after the first deletion has already shifted the later columns -- confirm that
# this is intended. Neither deletion touches the first three columns used below.
dataset = np.delete(dataset, 3889, axis=1)
dataset = np.delete(dataset, 3894, axis=1)
labels = pd.DataFrame(dataset[:, 0:3], columns = ['id', 'speaker', 'label'])
#print(len(labels))

# merging labels and features on the segment id to create the dataset
# (default inner join: only segments present in both tables are kept)
final_dataset = pd.merge(data, labels, on = 'id')
final_dataset.insert(1, 'speaker', final_dataset.pop('speaker'))
final_dataset.insert(2, 'label', final_dataset.pop('label'))

# make sure the output folder exists before writing
os.makedirs('/notebooks/features', exist_ok = True)
final_dataset.to_csv('/notebooks/features/data_OpenFace.csv')
fin = pd.read_csv('/notebooks/features/data_OpenFace.csv', index_col = [0])
fin

356


Unnamed: 0,id,speaker,label,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128
0,s05e01-1,Nick Hewer,0,0.027420,0.435477,0.458087,-0.684026,-0.054316,0.393436,0.481086,...,-0.571833,-0.409458,-0.928966,0.243002,0.424839,-0.963481,-0.287715,-1.596323,-0.660836,0.378806
1,s05e01-2,Jack Whitehall,1,-0.678015,0.175186,1.047543,0.221749,-1.134207,0.926230,0.208095,...,-0.030976,0.010946,-1.202579,0.209565,-0.297772,-0.154416,0.307262,-0.894088,-0.823017,1.606982
2,s05e02-2,Kevin Bridges,0,-0.357251,-0.643094,-0.199917,-0.433913,0.916446,-0.459607,0.600942,...,0.199772,-0.336560,0.273521,-0.521398,0.256461,-1.316072,0.263519,0.766841,-0.936954,0.818776
3,s05e02-6,Terry Wogan,0,-0.788874,-0.679031,1.468686,-0.115818,-0.474161,-0.007507,0.576623,...,0.034445,0.522809,-0.236227,-0.652524,0.621462,-0.078983,0.715659,-0.465168,-0.097659,0.444707
4,s05e02-1,Terry Wogan,1,-0.949773,-1.430804,2.030989,0.133840,-1.649042,-0.737661,0.679606,...,0.463612,1.368573,0.129596,-0.619151,0.631930,-0.456874,1.188589,-1.172322,0.290454,0.984179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,s13e08-3,Simon Day,0,-0.104658,0.189366,1.554951,0.711743,0.178346,-0.274975,-0.262494,...,-0.765105,-0.214043,0.726446,-0.425659,0.182968,0.497382,-0.525471,-0.461153,-0.000286,-0.366477
345,s13e08-2,Henning Wehn,1,-0.355148,0.589976,-0.326708,-0.331890,0.715941,-0.807470,-0.247034,...,-0.393348,-0.447856,0.123068,-0.847245,0.259699,0.186877,0.192862,-0.204506,-0.342134,0.062896
346,s13e09-2,James Acaster,1,0.287122,-0.562303,-0.044363,-0.614713,0.453689,0.391049,-0.390755,...,0.370615,-0.324608,-0.622752,-0.699184,0.618566,-0.088922,-0.876691,-0.375804,0.110186,1.537807
347,s13e09-5,Fred Sirieix,0,-0.263636,0.175347,0.446453,-0.042046,1.738206,-0.676627,-0.397312,...,0.712693,-0.552854,0.470888,-0.869323,0.568179,-0.209020,0.348547,0.137631,-1.096667,-0.563966
