In [19]:
# Hard-coded settings used in place of the command line arguments. The keys
# match the argparse option names; a value of None means "use the default".
args = {
    'in_file': '3.Allen2014_DMSO2_3-19_bidirectionals.sorted.fullConScores.bedgraph',
    'in_dir': '/media/ab/data/Research_Data/Enhancers/ConservationScores/fullConScores/new/',
    'chr': None,
    'start': None,
    'end': None,
    'val': None,
    'out_dir': 'formatted',
}

In [None]:
#!/usr/bin/env python
#
#######################
#
#
##################


### Import modules
import sys
import numpy as np
import math
import csv
import os
from operator import add
import argparse

# Command line interface. The four column options are zero-based indices into
# the tab-separated fields of each input line; options that are not supplied
# are left as None.
parser = argparse.ArgumentParser(description='Reformat the bedgraph file')
parser.add_argument('--in_file',  type=str, help='Input file')
# The chromosome column is used as a list index, so it has to be a single
# integer like the other column options (it used to be parsed as a list of
# strings, which cannot index a list).
parser.add_argument('--chr', type=int, help='Column for chromosome name')
parser.add_argument('--start',   type=int, help='Column for start position')
parser.add_argument('--end',type=int, help='Column for end position')
parser.add_argument('--val',    type=int, help='Column for value position')
parser.add_argument('--in_dir',    type=str, help='Input directory (if other than current)')
parser.add_argument('--out_dir',    type=str, help='Output directory (if other than current)')

def get_args():
    '''
    Parse the command line with the module-level parser and return the
    resulting options as a plain dictionary keyed by option name.

    '''
    namespace = parser.parse_args()
    return vars(namespace)

def main():
    '''
    Run the whole conversion: build the settings, read and expand the input
    bedgraph, write the result, then exit the interpreter.

    Exits with status 0 on success and with status 1 when the input file
    could not be opened.
    '''

    # NOTE: the settings come from the module-level ``args`` dictionary.
    # Uncomment the next line to read them from the command line instead.
    #args = get_args()

    # Set the configuration settings dictionary
    input_settings = configure_settings(args)

    # Open the input file. os.path.join copes with an input directory that
    # does or does not end in a path separator.
    input_path = os.path.join(input_settings['input_dir'],
                              input_settings['input_file'])
    raw_line_data = import_data_from_file(input_path)
    if raw_line_data is None:
        # import_data_from_file has already reported the problem
        sys.exit(1)

    # Expand the intervals, making sure the input handle is released even if
    # a line cannot be parsed
    try:
        line_data = parse_data_from_lines(input_settings, raw_line_data)
    finally:
        raw_line_data.close()

    # Create output directory (if necessary). The trailing '' component makes
    # the joined path end in a separator, which write_to_output relies on when
    # it appends the file name.
    output_path = os.path.join(input_settings['input_dir'],
                               input_settings['output_dir'], '')
    if not output_path:
        # Both directories empty: write to the current directory rather than
        # to the filesystem root
        output_path = os.path.join(os.curdir, '')
    output_folder = create_output_folder(output_path)

    # Write the data to an output file
    write_to_output(input_settings, output_folder, line_data)

    # Exit
    sys.exit(0)


def parse_data_from_lines(input_settings,datafile):
    '''
    Expand every interval of the input into one row per position.

    input_settings supplies the zero-based column indices under the keys
    'chr', 'start', 'end' and 'val'. datafile is any iterable of
    tab-separated text lines (for example an open file).

    Returns a list of rows, each of the form
    [chromosome, position, position + 1, value], where the positions are
    integers and chromosome and value are the strings read from the line.
    An interval whose end is not greater than its start produces no rows.
    '''

    # Copy these values from input settings to make the next section easier to read
    chr_col = input_settings['chr']
    start_col = input_settings['start']
    end_col = input_settings['end']
    value_col = input_settings['val']

    output = []

    for line in datafile:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no interval and would otherwise fail on the column lookup
        if not line.strip():
            continue

        # Split the line by tabs and remove the line ending; '\r' is removed
        # as well so that CRLF files do not leave it on the last field
        split_line = line.rstrip('\r\n').split('\t')

        chrom = split_line[chr_col]
        value = split_line[value_col]
        start = int(split_line[start_col])
        end = int(split_line[end_col])

        # Every interval, including those that already span a single
        # position, is written in the same four-column layout
        for position in range(start, end):
            output.append([chrom, position, position + 1, value])

    return output


def create_output_folder(new_folder):
    '''
    Make sure the directory new_folder exists, creating it together with any
    missing parent directories, and return new_folder unchanged.

    Raises OSError when the directory cannot be created.
    '''
    try:
        os.makedirs(new_folder)
    except OSError:
        # makedirs also fails when the directory is already there; only a
        # path that still is not a directory is a real error
        if not os.path.isdir(new_folder):
            raise
    return new_folder

def configure_settings(args):
    '''
    Build the input settings dictionary from the argument dictionary.

    args must contain the keys 'in_file', 'in_dir', 'out_dir', 'chr',
    'start', 'end' and 'val'. Entries that are None are replaced by their
    defaults: empty strings for the two directories and the column indices
    0, 1, 2 and 3 for chromosome, start, end and value.

    Returns a dictionary with the keys 'input_file', 'input_dir',
    'output_dir', 'chr', 'start', 'end' and 'val'.
    '''

    # Echo the raw arguments. The single-argument call form prints the same
    # text under Python 2 and Python 3.
    print(args)

    ########## Do not change these!
    input_settings = {
        'input_file': args['in_file'],
        'input_dir': args['in_dir'],
        'output_dir': args['out_dir'],
        'chr': args['chr'],
        'start': args['start'],
        'end': args['end'],
        'val': args['val'],
    }

    # main() appends output_dir to input_dir, so the default output_dir is
    # the empty string (write next to the input file); defaulting it to
    # input_dir would repeat the input path inside itself.
    defaults = (
        ('input_dir', ''),
        ('output_dir', ''),
        ('chr', 0),
        ('start', 1),
        ('end', 2),
        ('val', 3),
    )
    for key, fallback in defaults:
        # Compare against None explicitly: column index 0 is a valid setting
        if input_settings[key] is None:
            input_settings[key] = fallback

    return input_settings


def import_data_from_file(filename):
    '''
    Open filename for reading and return the open file object.

    When the file cannot be opened a message is printed and None is
    returned, so callers have to check the result before using it. Closing
    the returned file is the responsibility of the caller.
    '''
    try:
        return open(filename)
    except IOError:
        # The single-argument call form prints the same text under Python 2
        # and Python 3
        print("Unable to open file: %s" %(filename))
        return None


def write_to_output(input_settings, output_folder, data):
    '''
    Write data as tab-separated rows to
    output_folder + input_settings['input_file'] + '_new.bedgraph'.

    output_folder is concatenated with the file name as-is, so it has to end
    in a path separator. data is an iterable of rows, each row an iterable
    of fields. Returns None.
    '''
    out_path = output_folder+input_settings['input_file']+'_new.bedgraph'

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3
    if sys.version_info[0] < 3:
        ofile = open(out_path, "wb")
    else:
        ofile = open(out_path, "w", newline='')

    # The with block closes the file even when a row cannot be written
    with ofile:
        writer = csv.writer(ofile, delimiter='\t')
        for row in data:
            writer.writerow(row)
    return


# Run the conversion only when the file is executed as a script; whatever
# main() returns is handed to sys.exit as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)

{'end': None, 'val': None, 'out_dir': 'formatted', 'start': None, 'chr': None, 'in_file': '3.Allen2014_DMSO2_3-19_bidirectionals.sorted.fullConScores.bedgraph', 'in_dir': '/media/ab/data/Research_Data/Enhancers/ConservationScores/fullConScores/new/'}
