In [1]:
import pandas as pd
import os
import numpy as np
import pickle

# Directory that holds the input frame lists; the sampled lists are written
# back into this same directory.
experiment_dir = '/afs/cs.stanford.edu/u/anenberg/scr/snrThesis/data/frames_recognition_20/lists'

# List files (relative to experiment_dir) processed one after another.
filenames = [
    'fullpath_train_list.txt',
    'fullpath_test_list.txt',
]

# Number of frames drawn from each video.
num_samples = 100

# Number of trailing frames of a video that are excluded from sampling
# (0 disables the exclusion).
optical_flow_offset = 0


In [2]:
# Sample frames per video from every list file and write the sampled list.
#
# Each input file has one row per frame: "<frame path> <class_id>", separated
# by a single space. The video a frame belongs to is the second-to-last path
# component. Rows of the same video are treated as one contiguous run: a new
# video starts wherever the video name differs from the previous row.
#
# For every video:
#   * fewer than `num_samples` frames        -> keep every frame;
#   * shot detection enabled and the name starts with 'video_test' /
#     'video_valid'                          -> sample via the pickled scene
#     transitions, falling back to uniform sampling if that fails;
#   * otherwise                              -> draw `num_samples` frames
#     uniformly without replacement, excluding the last
#     `optical_flow_offset` frames.
#
# NOTE: sampling uses numpy's global RNG and no seed is set here, so the
# output differs between runs.

is_shuffle = True
is_use_shot_detection = False
output_file_number = 1

for filename in filenames:
    output_file = 'sampled_t%d_%d_%s' % (output_file_number, num_samples, filename)
    if is_use_shot_detection:
        output_file = 'shotdetect_sampled_t%d_%d_%s' % (output_file_number, num_samples, filename)
    if is_shuffle:
        output_file = '%s_%s' % ('shuffle', output_file)
    print('writing %s' % output_file)
    df = pd.read_csv(os.path.join(experiment_dir, filename), delimiter=' ', header=None, names=['filename', 'class_id'])
    print('Loaded %s' % filename)

    df['video_name'] = df.filename.apply(lambda x: x.split('/')[-2])
    df['frame_id'] = df.filename.apply(lambda x: x.split('/')[-1])

    # Row positions where a new video starts (row 0 always qualifies because
    # the shifted series holds NaN there).
    flags = df.video_name != df.video_name.shift(1)
    breaks = np.flatnonzero(flags.values)
    # Each video ends where the next one starts; the last video ends at
    # len(df). The previous code discarded this closing sentinel
    # (Series.append returns a new object), so the last video of every file
    # was never sampled.
    boundaries = list(zip(breaks, np.append(breaks[1:], len(df))))
    print('Computed Boundaries')

    sub_indices = []

    print('Finding video samples')
    num_failures = 0
    for boundary in boundaries:
        start, end = int(boundary[0]), int(boundary[1])
        small_df = df.iloc[start:end]
        if end - start < num_samples:
            # Too short to subsample: keep every frame of the video.
            sub_indices.append(range(start, end))
            continue

        video_name = small_df.video_name.iloc[0]

        if is_use_shot_detection and video_name.startswith(('video_test', 'video_valid')):
            try:
                # SECURITY: pickle.load executes arbitrary code; only use
                # transition files from a trusted source.
                with open(os.path.join('/afs/cs.stanford.edu/u/anenberg/scr/CS231N/allFrames/sceneTransitions/', '%s.pkl' % video_name), 'rb') as f:
                    peaks = pickle.load(f)
                # NOTE(review): cut_and_randomly_select is not defined in the
                # visible cells -- confirm it is in scope before enabling
                # is_use_shot_detection.
                sub_indices.append(cut_and_randomly_select(peaks, num_samples=num_samples, optical_flow_offset=optical_flow_offset, offset=start))
            except Exception as err:
                # Best-effort fallback, but report the failure instead of
                # hiding it (and let KeyboardInterrupt propagate).
                print('shot-detection sampling failed for %s (%s); using uniform sampling' % (video_name, err))
                sub_indices.append(np.random.choice(small_df.index, num_samples, replace=False))
                num_failures += 1
        else:
            # Not enough frames left once the trailing offset is removed:
            # the video is skipped entirely.
            if len(small_df.index) < optical_flow_offset + num_samples:
                continue
            # Don't sample from any frame whose optical flow pair is
            # guaranteed to be out of range.
            if optical_flow_offset > 0:
                sample_range = small_df.index[:-optical_flow_offset]
            else:
                sample_range = small_df.index
            sub_indices.append(np.random.choice(sample_range, num_samples, replace=False))
    print('Done video samples')

    # Turn the list of per-video index lists into one flat list.
    flat = [x for sublist in sub_indices for x in sublist]
    # Select the sampled rows by index label (.loc replaces the deprecated .ix).
    sampled_df = df.loc[flat]
    if is_shuffle:
        sampled_df = sampled_df.loc[np.random.permutation(sampled_df.index)]
    # Write the sampled list in the same "<filename> <class_id>" format.
    sampled_df[['filename', 'class_id']].to_csv(os.path.join(experiment_dir, output_file), header=False, index=False, sep=' ')
    # Number of videos for which shot-detection sampling fell back to uniform.
    print(num_failures)

writing shuffle_sampled_t1_100_fullpath_train_list.txt
Loaded fullpath_train_list.txt
Computed Boundaries
Finding video samples
Done video samples
0
writing shuffle_sampled_t1_100_fullpath_test_list.txt
Loaded fullpath_test_list.txt
Computed Boundaries
Finding video samples
Done video samples
0


In [7]:
# Reload the second list file (the test list) into `df` for inspection.
df = pd.read_csv(
    os.path.join(experiment_dir, filenames[1]),
    delimiter=' ',
    header=None,
    names=['filename', 'class_id'],
)