In [1]:
import pandas as pd
import os
import numpy as np
import pickle

# Directory that holds the frame-list files; the sampled lists are written back into it.
experiment_dir = '/afs/cs.stanford.edu/u/anenberg/scr/snrThesis/data/frames_detection/lists'
# Frame lists to sample from; each is read as space-separated "filename class_id" rows.
filenames = ['fullpath_train_list.txt', 'fullpath_test_list.txt']
# Number of frames (or frame stacks) to draw from each video.
num_samples = 5
# Distance from a frame to its optical-flow partner frame; 0 turns the partner check off.
optical_flow_offset = 0


In [3]:
def cut_and_randomly_select(peaks, num_samples = 5, optical_flow_offset = 30,offset = 0):
    """
    Randomly pick frame indices that lie inside the "good" scenes of a video.

    Consecutive entries of peaks delimit scenes, e.g. [10, 19, 30, 80] gives
    the scenes [10, 19), [19, 30) and [30, 80).  A scene is good when its
    length is at least 20% and less than 80% of the longest scene's length.

        peaks - scene boundaries, e.g. [10, 19, 30, 80]
        num_samples - upper bound on the number of indices drawn (without replacement)
        optical_flow_offset - keep a frame only if frame + optical_flow_offset
            also lies in a good scene; with 0 every frame of a good scene is kept
        offset - a number to add to all the returned indices

    Returns an empty list when no frame qualifies, otherwise a numpy array with
    min(num_samples, number of qualifying frames) indices.
    """
    # Fewer than two peaks delimit no scene at all.
    if len(peaks) < 2:
        return []

    # zip the peaks together to get the bounds e.g. [10, 19, 30, 80] --> [(10, 19), (19, 30), (30, 80)]
    df = pd.DataFrame(list(zip(peaks[:-1], peaks[1:])), columns = ['left', 'right'])
    df['scene_length'] = df.right - df.left

    # a scene is valid only if its length is within [20%, 80%) of the longest scene
    longest = df.scene_length.max()
    df['valid'] = np.logical_and(df.scene_length >= 0.2*longest, df.scene_length < 0.8*longest)
    valid_df = df[df.valid]

    valid_indices = []
    for l, r in zip(valid_df.left.values, valid_df.right.values):
        valid_indices.extend(range(l, r))

    # keep only frames whose optical flow partner is valid too; the set makes
    # each membership test constant time instead of a scan over the whole list
    valid_lookup = set(valid_indices)
    valid_indices = [i for i in valid_indices if i + optical_flow_offset in valid_lookup]

    if len(valid_indices) == 0:
        return []

    # select num_samples at random
    return np.random.choice(valid_indices, min(num_samples, len(valid_indices)), replace=False) + offset
    
# Quick demo: draw five frame indices from the good scenes of a toy peak list.
# Written as a single-argument print call so it runs on Python 2 and 3 alike.
print(cut_and_randomly_select([10, 18, 30, 80, 90], num_samples=5, optical_flow_offset=0, offset = 0))

[18 25 22 21 23]


In [47]:

def KLsamples(distribution,k,l):
    """
    Draw up to k non-overlapping runs of l consecutive elements.

    distribution is cut into len(distribution) // l back-to-back bins of l
    elements each (a trailing remainder is ignored) and bins are drawn at
    random without replacement.

    distribution: sliceable sequence to draw from
    k: number of sample/sets to draw
    l: number of consecutive elements in a sample/set

    returns a list of min(k, number of bins) slices of distribution, each of
    length l; the list is empty when nothing can be drawn.
    """
    # Floor division keeps the bin count an integer on Python 2 and 3.
    num_bins = len(distribution) // l
    k_to_sample = min(num_bins, k)
    # Nothing to draw: skip the random draw altogether.
    if k_to_sample <= 0:
        return []
    sampled_bins = np.random.choice(num_bins, k_to_sample, replace=False)

    returnLists = []
    for b in sampled_bins:
        segment = distribution[b*l:(b+1)*l]
        assert len(segment) == l
        returnLists.append(segment)
    return returnLists

def cut_and_randomly_select_stacked(peaks, num_samples = 5, stacked=1, offset = 0):
    """
    Randomly pick stacks of consecutive frame indices from the "good" scenes of a video.

    Consecutive entries of peaks delimit scenes, e.g. [10, 19, 30, 80] gives
    the scenes [10, 19), [19, 30) and [30, 80).  A scene is good when its
    length is at least 20% and less than 80% of the longest scene's length.
    Every good scene is cut, from its left bound, into back-to-back windows of
    stacked frames; an incomplete window at the end of a scene is discarded, so
    a stack never mixes frames of two scenes.

        peaks - scene boundaries, e.g. [10, 19, 30, 80]
        num_samples - upper bound on the number of stacks drawn (without replacement)
        stacked - number of consecutive frames to include in one sample set (1 = just the single frame)
        offset - a number to add to all the returned indices

    Returns a list of min(num_samples, number of windows) lists, each holding
    stacked consecutive indices; the list is empty when no window exists.
    Raises ValueError when stacked is smaller than 1.
    """
    if stacked < 1:
        raise ValueError('stacked must be at least 1')
    # Fewer than two peaks delimit no scene at all.
    if len(peaks) < 2:
        return []

    # zip the peaks together to get the bounds e.g. [10, 19, 30, 80] --> [(10, 19), (19, 30), (30, 80)]
    df = pd.DataFrame(list(zip(peaks[:-1], peaks[1:])), columns = ['left', 'right'])
    df['scene_length'] = df.right - df.left

    # a scene is valid only if its length is within [20%, 80%) of the longest scene
    longest = df.scene_length.max()
    df['valid'] = np.logical_and(df.scene_length >= 0.2*longest, df.scene_length < 0.8*longest)
    valid_df = df[df.valid]

    # build the candidate windows scene by scene so none crosses a scene bound;
    # the offset is applied here so callers get indices in their own numbering
    windows = []
    for l, r in zip(valid_df.left.values, valid_df.right.values):
        for first in range(l, r - stacked + 1, stacked):
            windows.append([first + step + offset for step in range(stacked)])

    if len(windows) == 0:
        return []

    # select num_samples windows at random
    picked = np.random.choice(len(windows), min(num_samples, len(windows)), replace=False)
    return [windows[i] for i in picked]

# Quick demo: draw five two-frame stacks from the good scenes of a toy peak list.
# Written as a single-argument print call so it runs on Python 2 and 3 alike.
print(cut_and_randomly_select_stacked([10, 18, 31, 80, 90], num_samples=5, stacked=2, offset = 0))

[[86, 87], [80, 81], [24, 25], [26, 27], [18, 19]]


In [30]:
# Scratch: table of scene bounds (left/right) and scene lengths for a toy peak list.
peaks = [10, 19, 30, 80]
peaks_full = peaks
df = pd.DataFrame(
    {'left': peaks_full[:-1], 'right': peaks_full[1:]},
    columns = ['left', 'right'],
)
df['scene_length'] = df['right'] - df['left']

In [6]:
# Read the first frame list: space-separated rows, no header line.
df = pd.read_csv(
    os.path.join(experiment_dir, filenames[0]),
    sep = ' ',
    header = None,
    names = ['filename', 'class_id'],
)


In [19]:
# Number of rows left after dropping the last optical_flow_offset rows.
# The slice end is computed explicitly because df.index[:-0] is empty rather
# than the whole index; max() keeps an oversized offset from wrapping around.
len(df.index[:max(len(df.index) - optical_flow_offset, 0)])

8691191

In [2]:
# Sample num_samples frames from every video of each frame list and write the
# (optionally shuffled) selection into experiment_dir as a new list file.

is_shuffle = True
is_use_shot_detection = False
output_file_number = 1

for filename in filenames:
    output_file = 'sampled_t%d_%d_%s'%(output_file_number,num_samples, filename)
    if is_use_shot_detection:
        output_file = 'shotdetect_sampled_t%d_%d_%s'%(output_file_number,num_samples, filename)
    if is_shuffle:
        output_file = '%s_%s'%('shuffle', output_file)
    print('writing %s'%output_file)
    df = pd.read_csv(os.path.join(experiment_dir, filename), delimiter = ' ', header = None, names = ['filename', 'class_id'])
    print('Loaded %s'%filename)

    # second-to-last path component is the video, the last one the frame file
    df['video_name'] = df.filename.apply(lambda x: x.split('/')[-2])
    df['frame_id'] = df.filename.apply(lambda x: x.split('/')[-1])

    # a row starts a new video when its video name differs from the row above
    flags = df.video_name != df.video_name.shift(1)
    breaks = np.where(flags.values)[0]
    # close the last video with an explicit end marker so it is sampled too
    edges = list(breaks) + [len(df)]
    boundaries = list(zip(edges[:-1], edges[1:]))
    print('Computed Boundaries')

    sub_indices = []

    print('Finding video samples')
    num_failures = 0
    for boundary in boundaries:
        start, end = int(boundary[0]), int(boundary[1])
        small_df = df[start:end]
        if (end - start < num_samples):
            # too few frames to choose from: keep them all
            sub_indices.append(range(start, end))
        else:
            video_name = df.video_name[start]

            if is_use_shot_detection and (video_name.startswith('video_test') or video_name.startswith('video_valid')):
                try:
                    # NOTE(review): pickle.load runs arbitrary code from the file; only use trusted transition files.
                    with open(os.path.join('/afs/cs.stanford.edu/u/anenberg/scr/CS231N/allFrames/sceneTransitions/', '%s.pkl'%video_name), 'rb') as f:
                        peaks = pickle.load(f)
                    sub_indices.append(cut_and_randomly_select(peaks, num_samples=num_samples, optical_flow_offset =optical_flow_offset, offset = start))
                except Exception:
                    # best effort: without usable scene transitions fall back to uniform sampling
                    sub_indices.append(np.random.choice(small_df.index, num_samples, replace=False))
                    num_failures += 1
            else:
                if len(small_df.index) < optical_flow_offset+num_samples:
                    continue
                #don't sample from any frame whose optical flow pair is guaranteed to be out of range
                if optical_flow_offset>0:
                    sample_range = small_df.index[:-optical_flow_offset]
                else:
                    sample_range = small_df.index
                sub_indices.append(np.random.choice(sample_range, num_samples, replace=False))
    print('Done video samples')


    # turn a list of lists into a flat list
    flat = [x for sublist in sub_indices for x in sublist]
    # generate the sampled df from the indices
    sampled_df = df.loc[flat]
    if is_shuffle:
        sampled_df = sampled_df.loc[np.random.permutation(sampled_df.index)]
    # write the fields to csv
    sampled_df[['filename', 'class_id']].to_csv(os.path.join(experiment_dir, output_file), header = False, index = False, sep = ' ')
    print(num_failures)

writing shuffle_sampled_t1_5_fullpath_train_list.txt
Loaded fullpath_train_list.txt
Computed Boundaries
Finding video samples
Done video samples
0
writing shuffle_sampled_t1_5_full_path_test_list.txt


IOError: File /afs/cs.stanford.edu/u/anenberg/scr/snrThesis/data/frames_detection/lists/full_path_test_list.txt does not exist

In [49]:
# Sample up to num_samples stacks of `stacked` consecutive frames from every
# video of each frame list and write the (optionally shuffled) selection into
# experiment_dir as a new list file.  Frames of one stack stay adjacent in the output.

is_shuffle = True
is_use_shot_detection = True
output_file_number = 1
stacked = 5

for filename in filenames:
    output_file = 'stacked_%d_sampled_t%d_%d_%s'%(stacked,output_file_number,num_samples, filename)
    if is_use_shot_detection:
        output_file = 'stacked_%d_shotdetect_sampled_t%d_%d_%s'%(stacked,output_file_number,num_samples, filename)
    if is_shuffle:
        output_file = '%s_%s'%('shuffle', output_file)
    print('writing %s'%output_file)
    df = pd.read_csv(os.path.join(experiment_dir, filename), delimiter = ' ', header = None, names = ['filename', 'class_id'])
    print('Loaded %s'%filename)

    # first path component is the video, the second one the frame file
    df['video_name'] = df.filename.apply(lambda x: x.split('/')[0])
    df['frame_id'] = df.filename.apply(lambda x: x.split('/')[1])

    # a row starts a new video when its video name differs from the row above
    flags = df.video_name != df.video_name.shift(1)
    breaks = np.where(flags.values)[0]
    # close the last video with an explicit end marker so it is sampled too
    edges = list(breaks) + [len(df)]
    boundaries = list(zip(edges[:-1], edges[1:]))
    print('Computed Boundaries')

    sub_indices = []

    print('Finding video samples')
    num_failures = 0
    for boundary in boundaries:
        start, end = int(boundary[0]), int(boundary[1])
        small_df = df[start:end]
        # largest number of whole stacks the video can supply, capped at num_samples
        tmp_num_samples = min(num_samples, (end - start) // stacked)
        if tmp_num_samples <= 0:
            # video is shorter than a single stack
            continue
        video_name = df.video_name[start]

        if is_use_shot_detection and (video_name.startswith('video_test') or video_name.startswith('video_valid')):
            try:
                # NOTE(review): pickle.load runs arbitrary code from the file; only use trusted transition files.
                with open(os.path.join('./data/allFrames/sceneTransitions/', '%s.pkl'%video_name), 'rb') as f:
                    peaks = pickle.load(f)
                #appends a list of lists
                sub_indices.append(cut_and_randomly_select_stacked(peaks, num_samples = tmp_num_samples, stacked=stacked, offset = start))
            except Exception:
                # deliberately no random fallback here: the video is skipped and only counted
                num_failures += 1
        else:
            # no shot detection for this video: draw stacks from all of its rows
            sub_indices.append(KLsamples(small_df.index,tmp_num_samples,stacked))
    print('Done video samples')

    #turn list of lists of lists into list of lists (one entry per stack)
    flat = [x for sublist in sub_indices for x in sublist]
    if is_shuffle:
        # shuffle whole stacks by position so the frames inside a stack stay in order
        flat = [flat[i] for i in np.random.permutation(len(flat))]
    # turn a list of lists into a flat list
    flat = [x for sublist in flat for x in sublist]

    # generate the sampled df from the indices
    sampled_df = df.loc[flat]
    # write the fields to csv
    sampled_df[['filename', 'class_id']].to_csv(os.path.join(experiment_dir, output_file), header = False, index = False, sep = ' ')
    print(num_failures)

writing shuffle_stacked_5_shotdetect_sampled_t1_5_train_list.txt
Loaded train_list.txt
Computed Boundaries
Finding video samples
Done video samples
272
writing shuffle_stacked_5_shotdetect_sampled_t1_5_test_list.txt
Loaded test_list.txt
Computed Boundaries
Finding video samples
Done video samples
481


In [64]:
# First ten sampled rows ordered by class id.  sort_values replaces
# DataFrame.sort, which current pandas no longer provides.
A = sampled_df[:10].sort_values('class_id')

In [45]:
# Scratch check of the flatten-then-shuffle step used for stacked samples.
A = [[1,2,3,4],[5,6]]
B = [[7,8,9],[10,11]]
C = [A, B]
# list of lists of lists -> list of lists
flat = [x for sublist in C for x in sublist]
print(flat)
# Box every sub-list in a 1-D object array before shuffling: the sub-lists have
# different lengths, so numpy cannot turn the nested list into a regular array.
boxed = np.empty(len(flat), dtype=object)
for position, item in enumerate(flat):
    boxed[position] = item
np.random.permutation(boxed)

[[1, 2, 3, 4], [5, 6], [7, 8, 9], [10, 11]]


array([[5, 6], [1, 2, 3, 4], [10, 11], [7, 8, 9]], dtype=object)

In [111]:
# Shell escape: show the first lines of sampled_training.txt with `head`.
!head sampled_training.txt

v_ApplyEyeMakeup_g08_c01/00000089.jpg 0
v_ApplyEyeMakeup_g08_c01/00000105.jpg 0
v_ApplyEyeMakeup_g08_c01/00000073.jpg 0
v_ApplyEyeMakeup_g08_c01/00000024.jpg 0
v_ApplyEyeMakeup_g08_c01/00000083.jpg 0
v_ApplyEyeMakeup_g08_c02/00000083.jpg 0
v_ApplyEyeMakeup_g08_c02/00000049.jpg 0
v_ApplyEyeMakeup_g08_c02/00000073.jpg 0
v_ApplyEyeMakeup_g08_c02/00000101.jpg 0
v_ApplyEyeMakeup_g08_c02/00000094.jpg 0


In [6]:
# Shell escape: line, word and byte counts of the shuffled shot-detection train list via `wc`.
!wc ../CS231N/data/allFrames/lists/shuffle_shotdetect_sampled_5_train_list.txt

  71530  143060 2709965 ../CS231N/data/allFrames/lists/shuffle_shotdetect_sampled_5_train_list.txt


In [26]:
import random
# Scratch: draw a single element of X at random.
X = [1, 2, 3]
random.sample(X, 1).pop()

3

In [71]:
#df['valid'] = np.logical_and(df.scene_length >= 0.2*df.scene_length.max(),df.scene_length < 0.8*df.scene_length.max())

# Scratch: adjacent pairs inside consecutive, non-overlapping groups of three.
# range and the single-argument print call run on Python 2 and 3 alike.
A = [1,2,3,4,5,6,7,8,9]
pairs = []
for i in range(0,len(A),3):
    # j walks the second and third slot of the group that starts at i
    for j in range(1,3):
        pairs.append((A[i+j-1],A[i+j]))
print(pairs)


[(1, 2), (2, 3), (4, 5), (5, 6), (7, 8), (8, 9)]


In [70]:
# NOTE(review): stores A under its own 'valid' key. An assignment displays
# nothing, yet the output saved under this cell is a filename Series - confirm
# which statement actually produced it and what A was at the time.
A['valid'] = A

1032    video_test_0000001/00001033.jpg
1033    video_test_0000001/00001034.jpg
1034    video_test_0000001/00001035.jpg
1035    video_test_0000001/00001036.jpg
1036    video_test_0000001/00001037.jpg
2393    video_test_0000002/00000326.jpg
2394    video_test_0000002/00000327.jpg
2395    video_test_0000002/00000328.jpg
2396    video_test_0000002/00000329.jpg
2397    video_test_0000002/00000330.jpg
Name: filename, dtype: object

In [74]:
# Scratch: flag every third position of A (0, 3, 6, ...).
# range and the single-argument print call run on Python 2 and 3 alike.
B = [i % 3 == 0 for i in range(len(A))]
print(B)

[True, False, False, True, False, False, True, False, False]
