In [1]:
import shutil
from pathlib import Path 

import cv2
import pandas as pd
import plotly.graph_objs as go

In [2]:
def parse_dir(d):
    subdirs = [x for x in Path(d).iterdir()]
    data = []
    for subdir in subdirs:
        files = [x for x in subdir.iterdir()]
        for file in files:
            img = cv2.imread(str(file))
            h, w = img.shape[:2]
            datum = {'fp': str(file),
                     'height': h,
                     'width': w,
                     'pct_of_frame': ((h * w)/(1080 * 1920)) * 100,
                     'cluster': subdir.parts[-1],
                     'algo': d.parts[-1]}
            data.append(datum)
    return data 

In [4]:
data = []
for d in Path('./clusters').iterdir():
    datum = parse_dir(d)
    data.extend(datum)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,fp,cluster,algo
0,clusters/DBSCAN/4/S01E01_23688_1.png,4,DBSCAN
1,clusters/DBSCAN/4/S01E01_16584_1.png,4,DBSCAN
2,clusters/DBSCAN/4/S01E01_25704_0.png,4,DBSCAN
3,clusters/DBSCAN/4/S01E01_8928_0.png,4,DBSCAN
4,clusters/DBSCAN/4/S01E01_28824_3.png,4,DBSCAN


In [5]:
g = df.groupby('algo').count()
g

Unnamed: 0_level_0,fp,cluster
algo,Unnamed: 1_level_1,Unnamed: 2_level_1
DBSCAN,3427,3427
chinese_whispers,3091,3091


In [6]:
chinese_whispers = df[df['algo'] == 'chinese_whispers']
chinese_whispers.head()

Unnamed: 0,fp,cluster,algo
3427,clusters/chinese_whispers/37/S01E01_26688_0.png,37,chinese_whispers
3428,clusters/chinese_whispers/37/S01E01_48720_0.png,37,chinese_whispers
3429,clusters/chinese_whispers/37/S01E01_34032_0.png,37,chinese_whispers
3430,clusters/chinese_whispers/37/S01E01_42720_0.png,37,chinese_whispers
3431,clusters/chinese_whispers/37/S01E01_49608_1.png,37,chinese_whispers


In [7]:
sample = chinese_whispers.sample(n=342)
for idx, row in sample.iterrows():
    fp = Path(row['fp']).absolute()
    dst_dir = Path('sample').joinpath(row['algo'])
    name = f'{Path(row["fp"]).stem}_{row["cluster"]}.png'
    dst = dst_dir.joinpath(name)
    shutil.copy(str(fp), str(dst))


In [8]:
dbscan = df[df['algo'] == 'DBSCAN']
dbscan.head()

Unnamed: 0,fp,cluster,algo
0,clusters/DBSCAN/4/S01E01_23688_1.png,4,DBSCAN
1,clusters/DBSCAN/4/S01E01_16584_1.png,4,DBSCAN
2,clusters/DBSCAN/4/S01E01_25704_0.png,4,DBSCAN
3,clusters/DBSCAN/4/S01E01_8928_0.png,4,DBSCAN
4,clusters/DBSCAN/4/S01E01_28824_3.png,4,DBSCAN


In [9]:
sample = dbscan.sample(n=346)
for idx, row in sample.iterrows():
    fp = Path(row['fp']).absolute()
    dst_dir = Path('sample').joinpath(row['algo'])
    name = f'{Path(row["fp"]).stem}_{row["cluster"]}.png'
    dst = dst_dir.joinpath(name)
    shutil.copy(str(fp), str(dst))

## Validation

In [3]:
data = []
for d in Path('./validation').iterdir():
    datum = parse_dir(d)
    data.extend(datum)

validation_df = pd.DataFrame(data)
validation_df = validation_df.rename({'cluster': 'character'}, axis=1)
validation_df = validation_df.assign(cluster=validation_df['fp'].map(lambda x: Path(x).stem.split('_')[-1]))
validation_df.head()

Unnamed: 0,fp,height,width,pct_of_frame,character,algo,cluster
0,validation/DBSCAN/Nicholas Brody/S01E01_31992_...,169,170,1.385513,Nicholas Brody,DBSCAN,7
1,validation/DBSCAN/Nicholas Brody/S01E01_50400_...,352,352,5.975309,Nicholas Brody,DBSCAN,11
2,validation/DBSCAN/Nicholas Brody/S01E01_50904_...,507,508,12.420718,Nicholas Brody,DBSCAN,7
3,validation/DBSCAN/Nicholas Brody/S01E01_48984_...,353,353,6.009307,Nicholas Brody,DBSCAN,11
4,validation/DBSCAN/Nicholas Brody/S01E01_49872_...,352,352,5.975309,Nicholas Brody,DBSCAN,11


### DBSCAN

In [4]:
validation_dbscan = validation_df[validation_df['algo'] == 'DBSCAN']
validation_dbscan['character'].value_counts()

character
Other              80
Nicholas Brody     58
Carrie Matheson    58
Jessica Brody      47
Saul Berenson      22
Son Brody          20
David Estes        18
Virgil             12
Daughter Brody      9
Mike Faber          9
Guy_from_bar        4
Mrs Walker          4
Max                 4
Asian_Lady          3
Name: count, dtype: int64

In [5]:
data = []
for character in [x for x in validation_dbscan['character'].unique().tolist() if x != 'Other']:
    character_df = validation_dbscan[validation_dbscan['character'] == character]
    m = character_df['cluster'].value_counts().max()
    pct = m/character_df.shape[0]
    datum = {'character': character,
             'pct': pct,
             'count': character_df.shape[0]}
    data.append(datum)
cnt_df = pd.DataFrame(data)
cnt_df = cnt_df.sort_values(by='pct', ascending=False)
cnt_df

Unnamed: 0,character,pct,count
1,Saul Berenson,1.0,22
5,David Estes,1.0,18
7,Mrs Walker,1.0,4
8,Max,1.0,4
2,Virgil,0.916667,12
3,Guy_from_bar,0.75,4
6,Carrie Matheson,0.741379,58
9,Daughter Brody,0.666667,9
11,Mike Faber,0.666667,9
12,Asian_Lady,0.666667,3


In [6]:
temp_df = validation_dbscan.merge(cnt_df[['character', 'pct']],
                                  on='character',
                                  how='left')
temp_df.head()

Unnamed: 0,fp,height,width,pct_of_frame,character,algo,cluster,pct
0,validation/DBSCAN/Nicholas Brody/S01E01_31992_...,169,170,1.385513,Nicholas Brody,DBSCAN,7,0.37931
1,validation/DBSCAN/Nicholas Brody/S01E01_50400_...,352,352,5.975309,Nicholas Brody,DBSCAN,11,0.37931
2,validation/DBSCAN/Nicholas Brody/S01E01_50904_...,507,508,12.420718,Nicholas Brody,DBSCAN,7,0.37931
3,validation/DBSCAN/Nicholas Brody/S01E01_48984_...,353,353,6.009307,Nicholas Brody,DBSCAN,11,0.37931
4,validation/DBSCAN/Nicholas Brody/S01E01_49872_...,352,352,5.975309,Nicholas Brody,DBSCAN,11,0.37931


In [7]:
temp_df['pct'].mean()

0.6529850746268656

### Chinese Whispers

In [8]:
validation_chinese = validation_df[validation_df['algo'] == 'chinese_whispers']
validation_chinese['character'].value_counts()

character
Nicholas Brody      66
Carrie Matheson     64
Jessica Brody       51
other               41
Saul Berenson       23
David Estes         22
Daughter Brody      17
Virgil              12
Son Brody           12
Mike Faber          11
Max                  6
Random White Guy     5
Mrs. Walker          4
Random Arab Guy      4
Random Officer       3
14                   1
Name: count, dtype: int64

In [9]:
data = []
for character in [x for x in validation_chinese['character'].unique().tolist() if x != 'other']:
    character_df = validation_chinese[validation_chinese['character'] == character]
    m = character_df['cluster'].value_counts().max()
    pct = m/character_df.shape[0]
    datum = {'character': character,
             'pct': pct,
             'count': character_df.shape[0]}
    data.append(datum)
cnt_df = pd.DataFrame(data)
cnt_df = cnt_df.sort_values(by='pct', ascending=False)
cnt_df

Unnamed: 0,character,pct,count
1,Random Officer,1.0,3
4,Son Brody,1.0,12
7,Random White Guy,1.0,5
8,Mrs. Walker,1.0,4
11,Random Arab Guy,1.0,4
13,14,1.0,1
10,Daughter Brody,0.941176,17
3,Virgil,0.916667,12
2,Saul Berenson,0.913043,23
5,David Estes,0.818182,22


In [10]:
temp_df = validation_chinese.merge(cnt_df[['character', 'pct']],
                                   on='character',
                                   how='left')
temp_df.head()

Unnamed: 0,fp,height,width,pct_of_frame,character,algo,cluster,pct
0,validation/chinese_whispers/Nicholas Brody/S01...,245,245,2.894724,Nicholas Brody,chinese_whispers,37,0.772727
1,validation/chinese_whispers/Nicholas Brody/S01...,40,39,0.075231,Nicholas Brody,chinese_whispers,37,0.772727
2,validation/chinese_whispers/Nicholas Brody/S01...,170,170,1.393711,Nicholas Brody,chinese_whispers,37,0.772727
3,validation/chinese_whispers/Nicholas Brody/S01...,352,352,5.975309,Nicholas Brody,chinese_whispers,37,0.772727
4,validation/chinese_whispers/Nicholas Brody/S01...,352,352,5.975309,Nicholas Brody,chinese_whispers,37,0.772727


In [11]:
temp_df['pct'].mean()

0.8239202657807309

In [13]:
layout = go.Layout(title={'text': '',
                          'font': {'size': 22,
                                   'family': 'Raleway',
                                   'color': 'white'},
                          'x': 0.5,
                          'y': 0.9,
                          'xanchor': 'center',
                          'yanchor': 'top'},
                   xaxis={'title': '',
                          'tickfont': {'size': 14,
                                       'family': 'Roboto',
                                       'color': 'white'},
                          'titlefont': {'size': 18,
                                        'family': 'Raleway',
                                        'color': 'white'}},
                   yaxis={'title': '',
                          'tickfont': {'size': 14,
                                       'family': 'Roboto',
                                       'color': 'white'},
                          'titlefont': {'size': 18,
                                        'family': 'Raleway',
                                        'color': 'white'}},
                   font={'color': 'white'},
                   paper_bgcolor='#5e5e5e',
                   plot_bgcolor='rgba(61, 61, 61, 0)')

In [17]:
data = [go.Bar(x=['DBSCAN', 'Chinese Whispers'], 
                   y=[0.653, 0.824],
                   text=[0.653, 0.824])]
fig = go.Figure(data=data, layout=layout)
fig.show()