In [None]:
%pylab inline
import pandas as pd

In [None]:
# Simulate segmented log_2 coverage data: each segment has its own mean and a
# Poisson-distributed number of targets, and a small fraction of targets are outliers.
num_segments = 100
num_targets_mean = 25

# `random` is numpy.random (provided by %pylab); seeding its global generator makes
# a re-run of the notebook regenerate exactly the same data and truth files.
random_seed = 1
random.seed(random_seed)

# the number of targets in each segment is drawn from Poisson(num_targets_mean)
num_targets = random.poisson(num_targets_mean, num_segments)

# segment-level means are drawn from Uniform(-5, 5)
means = random.uniform(-5, 5, num_segments)

# target-level log_2 coverages are drawn from
# (1 - outlier_prob) * Norm(mean, sqrt(variance)) + outlier_prob * Uniform(-10, 10)
variance = 1.
outlier_prob = 0.025
coverages = []
outlier_indicators = []  # 1 where the coverage at the same position is an outlier, else 0
for s in range(num_segments):
    for t in range(num_targets[s]):
        if random.choice([True, False], p=[outlier_prob, 1 - outlier_prob]):
            # outlier: ignores the segment mean entirely
            coverages.append(random.uniform(-10, 10))
            outlier_indicators.append(1)
        else:
            coverages.append(random.normal(means[s], sqrt(variance)))
            outlier_indicators.append(0)
coverages = array(coverages)

# total number of simulated targets (kept under its original name)
target = len(coverages)

In [None]:
# Scatter plot of every simulated coverage against its target index, on a wide and
# short canvas so the full run of targets fits on one row.
fig, ax = plt.subplots(figsize=(20, 4))
ax.scatter(range(len(coverages)), coverages)
ax.set_xlim([0, len(coverages)])
# Label the figure so it can be understood without reading the code.
ax.set_xlabel('target index')
ax.set_ylabel('log_2 coverage')
ax.set_title('Simulated target coverages')
plt.show()

In [None]:
# Directory that receives the generated files. It is a relative path, and the file
# names are appended with `+`, so the trailing slash is required.
test_dir = '../src/test/resources/org/broadinstitute/tools/exome/'

In [None]:
# Convert num_targets into a segment file and coverages into a target-coverage file.
# Every target spans target_length positions, every chromosome holds
# segments_per_chromosome consecutive segments, and coordinates restart at 1 on
# each chromosome.
sample = 'test'
name_prefix = 't'
target_length = 10
segments_per_chromosome = 10

# Ceiling division, so a trailing partially filled chromosome still gets a name when
# num_segments is not a multiple of segments_per_chromosome.
num_chromosomes = -(-num_segments // segments_per_chromosome)
# Names are sorted as strings ('1', '10', '2', ...), so consecutive groups of segments
# are assigned to chromosomes in that lexicographic order.
chromosome_names = sorted(str(i) for i in range(1, num_chromosomes + 1))

# Rows are collected in plain lists and converted to DataFrames once at the end;
# inserting rows into a DataFrame one at a time is much slower.
segment_rows = []
coverage_rows = []

end = 0
target_index = 0
for segment_index in range(num_segments):
    if segment_index % segments_per_chromosome == 0:
        end = 0  # first segment of a new chromosome: restart the coordinates
    chromosome = chromosome_names[segment_index // segments_per_chromosome]
#     chromosome = str(segment_index // segments_per_chromosome + 1)   #for non-genomic order
    start = end + 1
    end = end + target_length * int(num_targets[segment_index])
    segment_rows.append((sample, chromosome, start, end))
    # end + 1 makes the range yield exactly num_targets[segment_index] starts for any
    # target_length >= 1 (the last target ends exactly at `end`).
    for target_start in range(start, end + 1, target_length):
        name = name_prefix + str(target_index + 1)
        coverage_rows.append((name,
                              chromosome, target_start, target_start + target_length - 1,
                              around(coverages[target_index], decimals=2)))
        target_index += 1

# Every simulated coverage must have been consumed, otherwise the written files
# would not line up with the truth files below.
assert target_index == len(coverages), 'num_targets and coverages are inconsistent'

segment_pd = pd.DataFrame(segment_rows, columns=['Sample', 'Chromosome', 'Start', 'End'])
coverages_pd = pd.DataFrame(coverage_rows, columns=['name', 'contig', 'start', 'stop', sample])

# Make the coordinate columns explicitly integer-typed before writing.
segment_pd = segment_pd.astype({'Start': int, 'End': int})
coverages_pd = coverages_pd.astype({'start': int, 'stop': int})

# Tab-separated inputs, plus the ground truth used to generate them.
segment_pd.to_csv(test_dir + 'segments-for-copy-ratio-modeller-test.tsv', sep='\t', index=False)
coverages_pd.to_csv(test_dir + 'coverages-for-copy-ratio-modeller-test.tsv', sep='\t', index=False)
savetxt(test_dir + 'means-truth-for-copy-ratio-modeller-test.txt', means, fmt='%.3f')
savetxt(test_dir + 'outlier-indicators-truth-for-copy-ratio-modeller-test.txt', outlier_indicators, fmt='%d')