In [None]:
#default_exp wrangling

# Data wrangling

In [None]:
#hide

from fastcore.test import *
from nbdev.showdoc import *

In [None]:
#export
from pathlib import Path
import pandas as pd
from fastcore.basics import *
from fastcore.xtras import *

from vqa.utils import most_common

In [None]:
#export
from pathlib import Path
import pandas as pd
from fastcore.basics import *

from vqa.utils import most_common

In [None]:
#export 
import re

# A distortion tag is "_D" followed by one digit (e.g. "_D3"); the group keeps "D3".
_disto_pattern = re.compile(r'_(D\d)')
# A severity tag is "_" followed directly by one digit (e.g. "_2"); "_D3" does not match.
_sev_pattern = re.compile(r'_(\d)')
def parse_distortion_severity(video_name):
    """Extract sorted "<distortion>_<severity>" labels from a video name.

    Parameters
    ----------
    video_name : str
        Name such as 'Airport_D3_D1_2' or 'Airport_D3_2_D5_1'.

    Returns
    -------
    list of str
        One "D<k>_<severity>" entry per distortion tag, sorted
        lexicographically. A name without any distortion tag (a reference
        video) yields an empty list.

    Raises
    ------
    AssertionError
        If the number of severity tags is neither 1 nor equal to the number
        of distortion tags.
    """
    distortions = _disto_pattern.findall(video_name)
    if not distortions: # reference video
        return []
    sevs = _sev_pattern.findall(video_name)
    if len(sevs)==1:
        # A single severity applies to every distortion in the name.
        sevs = sevs*len(distortions)
    assert len(distortions)==len(sevs), (
        f"cannot pair distortions {distortions} with severities {sevs} in {video_name!r}"
    )
    sevs = [int(sev) for sev in sevs]
    return sorted(f"{disto}_{sev}" for disto, sev in zip(distortions, sevs))

def parse_scene(video_name):
    """Return, lower-cased, the part of `video_name` that precedes the first "_D".

    When the name contains no "_D", the whole name is lower-cased and returned.
    """
    scene, _separator, _tags = video_name.partition("_D")
    return scene.lower()

In [None]:
#hide
# A name without any distortion tag (reference video) parses to an empty list.
test_eq([], parse_distortion_severity('Airport'))
# One distortion followed by its severity.
test_eq(['D9_1'], parse_distortion_severity('Airport_D9_1'))
# A single trailing severity is applied to every distortion; the result is sorted.
test_eq(['D1_2', 'D3_2'], parse_distortion_severity('Airport_D3_D1_2'))
# Each distortion carries its own severity.
test_eq(['D3_2', 'D5_1'], parse_distortion_severity('Airport_D3_2_D5_1'))
# Three distortions sharing one severity.
test_eq(['D2_4', 'D7_4', 'D9_4'], parse_distortion_severity('Airport_D2_D7_D9_4'))

In [None]:
#hide
# The scene is the lower-cased text before the first "_D", whatever tags follow.
test_eq('airport', parse_scene('Airport'))
test_eq('airport', parse_scene('Airport_D2_D7_D9_4'))
test_eq('airport', parse_scene('Airport_D2_3_D9_4'))

In [None]:
#export
def label_dataframe(df):
    """Add `scene`, `label`, `distortion` and `severity` columns derived from `video_name`.

    The frame is modified in place and also returned, so the function can be
    used with `DataFrame.pipe`.

    * `scene`      - result of `parse_scene` on the video name.
    * `label`      - comma-joined output of `parse_distortion_severity`, or
                     'R_0' when that output is empty.
    * `distortion` - the part before '_' of every label entry, joined with '_'.
    * `severity`   - `most_common` applied to the part after '_' of every
                     label entry.
    """
    def _join_labels(labels):
        # An empty parse result marks a reference video.
        return ','.join(labels) if len(labels) != 0 else 'R_0'

    def _distortion_codes(label):
        return '_'.join(entry.split('_')[0] for entry in label.split(','))

    def _dominant_severity(label):
        return most_common(entry.split('_')[1] for entry in label.split(','))

    df['scene'] = df['video_name'].apply(parse_scene)
    parsed = df['video_name'].apply(parse_distortion_severity)
    df['label'] = parsed.apply(_join_labels)
    df['distortion'] = df['label'].apply(_distortion_codes)
    df['severity'] = df['label'].apply(_dominant_severity)
    return df

In [None]:
#hide
# Four sample names: a reference video, a shared severity, per-distortion
# severities, and a multi-word scene with three distortions.
df = pd.DataFrame(['Airport', 'Landing_plane_D2_D3_1',  'Airport_D2_3_D9_1', 'Bus_Stop_D2_D5_D9_2'], columns=['video_name'])
# label_dataframe adds its columns to `df` itself and returns that same frame.
ldf = df.pipe(label_dataframe)
test_eq(ldf['scene'].values, ['airport', 'landing_plane', 'airport', 'bus_stop'])
# Reference videos are labelled 'R_0'.
test_eq(ldf['label'].values, ['R_0', 'D2_1,D3_1', 'D2_3,D9_1', 'D2_2,D5_2,D9_2'])
test_eq(ldf['distortion'].values, ['R', 'D2_D3', 'D2_D9', 'D2_D5_D9'])
# Severities are strings; for 'D2_3,D9_1' the expected '3' relies on how
# most_common resolves a tie (presumably first seen - confirm in vqa.utils).
test_eq(ldf['severity'].values, ['0', '1', '3', '2'])

In [None]:
#export

from sklearn.model_selection import train_test_split

def make_dataframe_splitter(valid_pct, strata='label', random_state=None):
    """Build a function that marks a stratified validation subset of a dataframe.

    Parameters
    ----------
    valid_pct : passed to `train_test_split` as `test_size`.
    strata : name of the column whose values are used for stratification.
    random_state : passed to `train_test_split`; give an int to make the split
        reproducible. The default `None` keeps the previous unseeded behaviour.

    Returns
    -------
    A function `stratified_split(df)` that adds a boolean `is_valid` column to
    `df` in place (True for the rows chosen for validation) and returns `df`.
    """
    def stratified_split(df):
        # Only the validation index labels are needed; the training part is discarded.
        _, val_index = train_test_split(
            df.index,
            test_size=valid_pct,
            stratify=df[strata],
            random_state=random_state,
        )
        df['is_valid'] = False
        df.loc[val_index, 'is_valid'] = True
        return df
    return stratified_split

In [None]:
#export

def populate_frames(df, frame_indices_list):
    """Yield one copy of `df` per entry of `frame_indices_list`.

    Every yielded frame has two columns set:
    `frames`, holding that entry's frame indices on every row, and
    `frame_paths`, holding `<video_path>/<index>.jpg` for each of those indices.
    The frame passed in is not modified; the first copy is taken from it and
    each later copy from the previously yielded frame.
    """
    current = df
    for indices in frame_indices_list:
        def _frame_files(video_dir):
            # One JPEG path per requested index, inside the video's folder.
            folder = Path(video_dir)
            return [str(folder / f"{i}.jpg") for i in indices]

        current = current.copy()
        # The same `indices` object is stored on every row.
        current['frames'] = [indices for _ in range(len(current))]
        current['frame_paths'] = current['video_path'].apply(_frame_files)
        yield current

def make_framer(frame_indices_list):
    """Build a function that stacks the frames produced by `populate_frames`.

    The returned function takes a dataframe and returns the row-wise
    concatenation (`axis=0`) of one copy per entry of `frame_indices_list`.
    """
    def framer(dataf):
        per_entry_frames = list(populate_frames(dataf, frame_indices_list))
        return pd.concat(per_entry_frames, axis=0)
    return framer

In [None]:
#export

def remove_corrupt_video_frames(df, video_names=None, max_frame_index=33):
    """Drop rows of the listed videos whose frame selection goes past `max_frame_index`.

    A row is removed when its `video_name` is one of `video_names` AND the
    largest value in its `frames` sequence is greater than `max_frame_index`.
    All other rows are kept. The input frame is not modified; a copy of the
    surviving rows is returned.

    Parameters
    ----------
    df : DataFrame with `video_name` and `frames` columns.
    video_names : optional list-like of video names to check. Defaults to the
        eight `Concorde_place_*` videos listed in the body.
    max_frame_index : largest frame index still accepted for those videos
        (default 33).
    """
    if video_names is None:
        video_names = [
         'Concorde_place_D1_D5_1',
         'Concorde_place_D6_1',
         'Concorde_place_D1_D5_4',
         'Concorde_place_D1_D5_3',
         'Concorde_place_D1_D5_2',
         'Concorde_place_D6_2',
         'Concorde_place_D6_3',
         'Concorde_place_D6_4'
        ]

    def _exceeds_limit(seq):
        # An empty selection has no frame past the limit, so the row is kept
        # instead of letting max() raise ValueError.
        return max(seq, default=max_frame_index) > max_frame_index

    tbd_idx = df['video_name'].isin(video_names) & df['frames'].apply(_exceeds_limit)
    return df[~tbd_idx].copy()

In [None]:
#export

def make_dataframe(root):
    """List the entries of `root` as a dataframe of video paths and names.

    `root` must provide an `ls()` method returning path objects. Entries whose
    name starts with '.' are skipped. The result has a `video_path` column
    (string paths, sorted) and a `video_name` column (final path component).
    """
    def _final_component(path_str):
        return Path(path_str).name

    visible_entries = [entry for entry in root.ls() if not entry.name.startswith('.')]
    video_paths = sorted(str(entry) for entry in visible_entries)
    videos = pd.DataFrame(data={'video_path': video_paths})
    videos['video_name'] = videos['video_path'].apply(_final_component)
    return videos

In [None]:
#export

def make_train_dataframe(root, valid_pct, frame_indices_list):
    """Build the training dataframe for the videos found under `root`.

    Steps, in order: list the videos, add the label columns, mark a stratified
    validation subset of size `valid_pct`, expand every video into one row per
    entry of `frame_indices_list`, then drop the rows flagged by
    `remove_corrupt_video_frames`.
    """
    frame = make_dataframe(root)
    frame = label_dataframe(frame)
    # The split happens before frame expansion, so all rows of one video
    # share the same `is_valid` value.
    frame = make_dataframe_splitter(valid_pct)(frame)
    frame = make_framer(frame_indices_list)(frame)
    return remove_corrupt_video_frames(frame)

In [None]:
#export

def assert_stratied_split(df, label_col):
    """Assert that the train/validation split of `df` is roughly stratified on `label_col`.

    The overall val/train row ratio is compared with the mean of the per-label
    val/train count ratios; the mean must lie strictly within 0.02 of the
    overall ratio. On failure the AssertionError carries the smallest
    per-label ratio. Labels present in only one of the two parts produce NaN
    ratios.
    """
    valid_mask = df['is_valid']
    train_df = df[~valid_mask]
    val_df = df[valid_mask]
    overall_ratio = len(val_df) / len(train_df)

    per_label_counts = [train_df[label_col].value_counts(), val_df[label_col].value_counts()]
    label_freqs = pd.concat(per_label_counts, axis=1, join='outer')
    label_freqs.columns = ['train', 'val']
    label_freqs['ratio'] = (label_freqs['val'] / label_freqs['train']).abs()

    lower_bound = overall_ratio - 0.02
    upper_bound = overall_ratio + 0.02
    mean_ratio = label_freqs['ratio'].mean()
    assert lower_bound < mean_ratio < upper_bound, label_freqs['ratio'].min()

In [None]:
#export

def make_test_dataframe(root, frame_indices_list):
    """Build the test dataframe for the videos found under `root`.

    Lists the videos, adds the label columns and expands every video into one
    row per entry of `frame_indices_list`. No validation split is made and no
    rows are filtered out.
    """
    frame = make_dataframe(root)
    frame = label_dataframe(frame)
    return make_framer(frame_indices_list)(frame)