-
Notifications
You must be signed in to change notification settings - Fork 0
/
parallel_sim.sh
executable file
·238 lines (198 loc) · 9.89 KB
/
parallel_sim.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/bin/bash
source ~/miniconda3/etc/profile.d/conda.sh
parallel_sim_name="simEvoCompareBulkMutrates_FlexibleTP1bpEndMismatch_7_17_23"
running_machina=false ### SET TO TRUE IF RUNNING MACHINA
# Set the mutation rates to explore
# Empirical mutation rates
#mr1=(0.3173305,0.0009823,0.00000008,0.00000162,0.00000335,0.18609872,0.00000254,0,0,0)
# Easy alignment mode mutation rates
#mr1=(0.1264,0.1264,0.1264,0.1264,0.1264,0.1264,0.1264,0.1264,0.1264,0.1264)
#mr1=(0,0,0,0,0.1,0,0,0,0,0)
#mr_array=("$mr1")
# mr1=(0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001)
# mr2=(0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005)
mr1=(0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01)
mr2=(0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02)
mr3=(0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03)
mr4=(0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04)
mr5=(0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05)
mr6=(0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06,0.06)
mr7=(0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07)
mr8=(0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08)
mr9=(0.09,0.09,0.09,0.09,0.09,0.09,0.09,0.09,0.09,0.09)
mr10=(0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1)
mr11=(0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15,0.15)
mr12=(0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2)
mr13=(0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3)
mr14=(0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4)
mr15=(0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5)
# mr18=(0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6)
# mr19=(0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.7)
# mr20=(0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8)
# mr21=(0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9,0.9)
# mr22=(1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0)
mr_array=("$mr1" "$mr2" "$mr3" "$mr4" "$mr5" "$mr6" "$mr7" "$mr8" "$mr9" "$mr10" "$mr11" "$mr12" "$mr13" "$mr14" "$mr15")
# Set the sample sizes to explore
ss_array=(100)
# Set the migration matrix values that will be tested
# ultra_rare_mm="data/ultrarare_migration_prob_matrix.csv"
# rare_mm="data/rare_migration_prob_matrix.csv"
#moderate_mm="data/moderate_migration_prob_matrix.csv"
# true_mm="data/true_migration_prob_matrix.csv"
# high_mm="data/high_migration_prob_matrix.csv"
#equal_mm="data/equal_migration_prob_matrix.csv"
# mm_array=(${rare_mm} ${true_mm} ${high_mm})
# mm_array=(${true_mm})
mm_1="data/migration_matrices/migration_matrix_001.csv" ## 0.001 migration rate split between all tissues
mm_2="data/migration_matrices/migration_matrix_005.csv" ## 0.005 migration rate split between all tissues
mm_3="data/migration_matrices/migration_matrix_01.csv" ## 0.01 migration rate split between all tissues
mm_4="data/migration_matrices/migration_matrix_025.csv" ## 0.025 migration rate split between all tissues
mm_5="data/migration_matrices/migration_matrix_05.csv" ## 0.05 migration rate split between all tissues
mm_6="data/migration_matrices/migration_matrix_075.csv" ## 0.075 migration rate split between all tissues
mm_7="data/migration_matrices/migration_matrix_1.csv" ## 0.1 migration rate split between all tissues
mm_8="data/migration_matrices/migration_matrix_25.csv" ## 0.25 migration rate split between all tissues
mm_9="data/migration_matrices/migration_matrix_50.csv" ## 0.50 migration rate split between all tissues
mm_10="data/migration_matrices/migration_matrix_75.csv" ## 0.75 migration rate split between all tissues
mm_11="data/migration_matrices/migration_matrix_99.csv" ## 0.99 migration rate split between all tissues
mm_array=(${mm_1} ${mm_2} ${mm_3} ${mm_4} ${mm_5} ${mm_6} ${mm_7} ${mm_8} ${mm_9} ${mm_10} ${mm_11})
if [[ "$running_machina" = false ]]; then
# Setup headers for recording the input parameters in a csv
mr_header="mutation_rate"
ss_header="sample_size"
# Determine the maximum length of the arrays for input parameter csv writing
max_len=${#mm_array[@]}
if [ ${#mr_array[@]} -gt $max_len ]; then
max_len=${#mr_array[@]}
fi
if [ ${#ss_array[@]} -gt $max_len ]; then
max_len=${#ss_array[@]}
fi
# Write the array values to a CSV file
echo "${mr_header},${ss_header}" > input_ranges_${parallel_sim_name}.csv
for ((i=0; i<$max_len; i++));
do
mr=${mr_array[$i]:-}
mr=${mr//,/ }
ss=${ss_array[$i]:-}
echo "${mr},${ss}" >> input_ranges_${parallel_sim_name}.csv
done
# Create an array to store all the commands
commands=()
echo "Setting up all parameter combinations..."
# Loop over all combinations of the three arrays and add the command to the commands array
j=0
for mr in ${mr_array[@]}
do
for ss in ${ss_array[@]}
do
for ((rep=0; rep<100; rep++)) ### Can set the amount of repeat simulations with the same parameters
do
simmid="sim${j}"
cmd="./sim_full_pipeline.sh --evotracer --out ${simmid} --mutrate ${mr} --samples ${ss}"
commands+=("$cmd")
j=$((j+1))
done
done
done
fi
if [[ "$running_machina" = true ]]; then
# Setup headers for recording the input parameters in a csv
mr_header="mutation_rate"
ss_header="sample_size"
mm_header="migration_matrix"
# Determine the maximum length of the arrays for input parameter csv writing
max_len=${#mm_array[@]}
if [ ${#mr_array[@]} -gt $max_len ]; then
max_len=${#mr_array[@]}
fi
if [ ${#ss_array[@]} -gt $max_len ]; then
max_len=${#ss_array[@]}
fi
# Write the array values to a CSV file
echo "${mr_header},${ss_header},${mm_header}" > input_ranges_${parallel_sim_name}.csv
for ((i=0; i<$max_len; i++));
do
mr=${mr_array[$i]:-}
mr=${mr//,/ }
ss=${ss_array[$i]:-}
mm=${mm_array[$i]:-}
echo "${mr},${ss},${mm}" >> input_ranges_${parallel_sim_name}.csv
done
# Create an array to store all the commands
commands=()
echo "Setting up all parameter combinations..."
# Loop over all combinations of the three arrays and add the command to the commands array
j=0
for mm in ${mm_array[@]}
do
for mr in ${mr_array[@]}
do
for ss in ${ss_array[@]}
do
for ((rep=0; rep<100; rep++)) ### Can set the amount of repeat simulations with the same parameters
do
simmid="sim${j}"
cmd="./sim_full_pipeline.sh --machina --out ${simmid} --mutrate ${mr} --samples ${ss} --migration ${mm}"
commands+=("$cmd")
j=$((j+1))
done
done
done
done
fi
echo "There are ${#commands[@]} commands to be submitted."
# Create the main output files with the headers. These can be commented out if not running specific packages in the sim_full_pipeline.sh command line optional flag inputs.
echo "name,mutrate,num_samples,num_uniq_mutations,num_uniq_barcodes,num_ASVs,proportion_ASVs_barcodes,avg_alignment_error,num_barcodes_missing,proportion_barcodes_missing,proportion_barcodes_found,num_barcodes_found,proportion_ASVs_barcodes_found" > output_sim_evotracer_${parallel_sim_name}.csv
echo "sim_name,mutrate,num_samples,sim_mutations,count_sim_mutations,evotracer_mutations,count_evotracer_mutations,evo_precision,evo_recall,evo_f1,amplican_mutations,count_amplican_mutations,amp_precision,amp_recall,amp_f1,crispresso2_mutations,count_crispresso2_mutations,crispresso2_precision,crispresso2_recall,crispresso2_f1" > output_simEvoAmpCressoCompare_${parallel_sim_name}.csv
if [["$running_machina" = true]]; then
echo "name,mutrate,num_samples,migration_matrix,num_uniq_mutations,uniq_mut_per_site,total_mut_sites,avg_mut_sites_per_sample,avg_proportion_mut_sites,total_dropout,dropout_per_sample,avg_mutation_age,true_migrations,inferred_migrations,proportion" > output_machina_all_${parallel_sim_name}.csv
echo "machina_sim_name,migration_matrix,true_positives,false_positives,false_negatives,precision,recall,f1_score" > output_statistics_machina_all_${parallel_sim_name}.csv
fi
par_results="${parallel_sim_name}_parallel_sim_results"
mkdir ${par_results}
par_results_data="${par_results}/data"
mkdir ${par_results_data}
# Submit the commands in batches of 20 using ParaFly
batch_size=20
num_batches=$(((${#commands[@]} / $batch_size) + 1))
echo "Beginning submission of all combinations in parallel batches of ${batch_size}..."
conda activate machina
for ((i=0; i<$num_batches; i++))
do
echo "Submitting batch $((i+1))/$((num_batches)):"
# Calculate the range of commands to submit in this batch
start_index=$((i * batch_size))
end_index=$((start_index + batch_size - 1))
# Check if the end index is greater than or equal to the length of the commands array
if [ $end_index -ge ${#commands[@]} ]; then
batch_commands=("${commands[@]:${start_index}}") # Slice from start index to end of commands array
else
batch_commands=("${commands[@]:${start_index}:${batch_size}}") # Slice from start index to batch_size
fi
for command in "${batch_commands[@]}"
do
echo "${command}" >> "${i}.cmd"
echo "${command}"
done
# Print the ParaFly command to the console
ParaFly -CPU ${batch_size} -c ${i}.cmd
rm ${i}.cmd
rm ${i}.cmd.completed
# Append the second line of each csv file to the main files
for f in sim_results*/comparison_sim*; do tail -n +2 $f >> output_sim_evotracer_${parallel_sim_name}.csv; done
for f in sim_results*/compare_*; do tail -n +2 $f >> output_simEvoAmpCressoCompare_${parallel_sim_name}.csv; done
if [["$running_machina" = true]]; then
for f in sim_results*/comparison_inferred*; do tail -n +2 $f >> output_machina_all_${parallel_sim_name}.csv; done
for f in sim_results*/statistics*; do tail -n +2 $f >> output_statistics_machina_all_${parallel_sim_name}.csv; done
fi
mv sim_results_sim* ${par_results_data}/
#rm -r sim_results*
done
echo "All batches are done."
conda deactivate
mv *${parallel_sim_name}.csv ${par_results}/
### Use below to print all proportions data and remove sim data for use with scipy optimize
### Remove this if not using scipy since this deletes all output and prints to the terminal only what scipy needs
#proportions_scipy=$(tail -n +2 ${par_results}/output_all_${parallel_sim_name}.csv | cut -d ',' -f 15 | sed ':a;N;$!ba;s/\n/,/g')
#echo "proportions = [$proportions_scipy]"
#rm -r ${par_results}/