# Sort Existing Files 

This notebook generates a shell script of `cp` commands, based on the new C2 positive and negative event lists. The script takes existing files from the original '600' dataset, which had some 'event corruption' (CME events in the non-CME/neg directory and vice versa), and re-maps them to the appropriate bins.

## Definitions

In [1]:
import shlex

import pandas as pd

# Inputs: CSV tables of the new C2 positive / negative events.
pos_dataset_name = '../C2_positive_training_dataset.csv'
neg_dataset_name = '../C2_negative_training_dataset.csv'

# Input: text listing of the files in the original '600' dataset.
orig_dataset_name = '../orig_600_files.txt'

# Output: shell script that this notebook generates.
new_dataset_script = '../create_new_dataset.sh'

## Load data

In [2]:
# Event tables for the positive and the negative training sets.
pos_data = pd.read_csv(pos_dataset_name)
neg_data = pd.read_csv(neg_dataset_name)

# Listing of the original files: surrounding whitespace (including the
# trailing newline) is stripped from every line as it is read.
with open(orig_dataset_name, 'r') as f:
    orig_files = [line.strip() for line in f]

In [3]:
# sanity check: show the first three entries of the cleaned file list
orig_files[:3]

['cme/max/20000315195100.png',
 'cme/max/20000315215000.png',
 'cme/max/20000315222600.png']

## Build out the list of files which belong in the cme and noncme categories

In [4]:
# map datetimes to files
# Map each file's timestamp (the filename stem, e.g. '20000315195100')
# to its path in the original dataset listing.
file_exists = {}
for f in orig_files:
    if not f:
        # blank line in the listing: nothing to index
        continue

    # last path component, e.g. 'cme/max/20000315195100.png' -> '20000315195100.png'
    filename = f.rsplit('/', 1)[-1]

    # Deliberately NOT named `datetime`: that name is bound to the datetime
    # class by `from datetime import datetime` in a later cell, and re-running
    # this cell would otherwise replace the class with a string.
    timestamp = filename.split('.')[0]

    file_exists[timestamp] = f
    

In [5]:
from datetime import datetime

def write_event(evnt, f, file_exists, type="cme"):
    """Write a shell ``cp`` command for one event, if a matching file exists.

    The event time is converted to a ``YYYYmmddHHMM00`` key (the seconds
    field is always written as ``00``, whatever seconds ``evnt`` carries) and
    looked up in ``file_exists``. When the key is present, one line of the
    form ``cp <existing path> 600_new/<type>/<key>.png`` is written to ``f``;
    otherwise nothing is written.

    Both paths are passed through ``shlex.quote`` so that a path containing
    spaces or shell metacharacters still yields a valid command; paths made
    only of shell-safe characters are emitted unchanged.

    Parameters
    ----------
    evnt : str
        Event time formatted as ``"%Y-%m-%d %H:%M:%S"``.
    f : writable text file-like object
        Receives the generated command via ``f.write``.
    file_exists : dict
        Maps timestamp keys (e.g. ``"20000315195100"``) to existing file paths.
    type : str, optional
        Sub-directory of ``600_new/`` to copy into (default ``"cme"``).
        The name shadows the builtin but is kept for caller compatibility.

    Raises
    ------
    ValueError
        If ``evnt`` does not match the expected time format.
    """
    dt = datetime.strptime(evnt, "%Y-%m-%d %H:%M:%S")
    f_dt = dt.strftime("%Y%m%d%H%M00")

    if f_dt not in file_exists:
        # no file for this event in the original dataset: skip silently
        return

    source = shlex.quote(file_exists[f_dt])
    target = shlex.quote(f"600_new/{type}/{f_dt}.png")
    f.write(f"cp {source} {target}\n")
    
with open(new_dataset_script, 'w') as f:
    # script preamble: shebang, then make sure both destination bins exist
    f.writelines([
        "#!/bin/sh\n",
        "mkdir -p 600_new/cme/\n",
        "mkdir -p 600_new/noncme/\n",
    ])

    # positive events go to the default "cme" bin
    for evnt in pos_data['datetime']:
        write_event(evnt, f, file_exists)

    # negative events go to the "noncme" bin
    for evnt in neg_data['datetime']:
        write_event(evnt, f, file_exists, type="noncme")
