# Analysis of PMAG results

The following document provides the details of the output from PMAG. PMAG outputs the gene orders of ancestral genomes. Here we compute the following for each run of PMAG:
1. The gene content of ancestral genomes.
2. The adjacency content of ancestral genomes.
We then compute the precision and recall statistics for each run, and compare the performance against the ILP results with $\alpha \in \{0, 0.25, 0.5, 0.75, 1\}$.

The PMAG results can be found at "../PMAG_results/mode/MLGOresult_XX/geneorder.out", where mode $\in \{$with_L, without_LT, without_LT_high_rearr$\}$ and XX $\in \{1, \dots, 20\}$.

In order to compute precision and recall, these results are compared against the gene orders of the ZOMBI genomes. The ZOMBI gene orders can be found at "../../../sim/mode/Run_XX/G/Genomes/". 

In [1]:
from IPython.display import Image
from IPython.display import HTML

# Render a button that toggles the visibility of the notebook's code-input
# cells (elements matching the jQuery selector 'div.input'). code_toggle is
# also registered on document-ready, so the inputs start out hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [1]:
import os
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.switch_backend('agg')
%matplotlib inline

In [2]:
def read_file(filename):
    """Return the meaningful lines of a text file.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of str
        The file's lines (split on "\\n", newline removed), keeping only
        lines that are non-empty and do not start with '#'.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # The context manager closes the handle deterministically; the previous
    # `open(...).read()` left that to the garbage collector.
    with open(filename, "r") as handle:
        lines = handle.read().split("\n")
    # Keep a line only if it is non-empty and not a comment.
    return [line for line in lines if line and line[0] != '#']

In [3]:
def get_gene_orders(gene_order_list, orders):
    """Group gene-order lines by the species header that precedes them.

    Parameters
    ----------
    gene_order_list : list of str
        Non-empty lines. A line starting with '>' opens a new species whose
        name is the rest of that line; every other line is one gene order
        belonging to the most recent species.
    orders : dict
        Mapping that is filled in place: species name -> list of gene lists.

    Returns
    -------
    dict
        The same ``orders`` object.

    Notes
    -----
    Each gene line is split on single spaces and its LAST token is dropped
    (presumably an end-of-chromosome marker or the blank left by a trailing
    space -- confirm against the PMAG output format).
    """
    current_species = None
    for record in gene_order_list:
        if record[0] != '>':
            tokens = record.split(" ")
            orders[current_species].append(tokens[:-1])
            continue
        # Header line: start a fresh list of chromosomes for this species.
        current_species = record[1:]
        orders[current_species] = []
    return orders

def get_gene_content(genome):
    """Count how many times each gene occurs in a genome, ignoring orientation.

    Parameters
    ----------
    genome : list of list of str
        One inner list per chromosome; each element is a gene identifier,
        prefixed with '-' when the gene is reversed. A flat list of strings
        is NOT equivalent: its strings would be iterated character by
        character.

    Returns
    -------
    dict
        Gene identifier (without any leading '-') -> number of occurrences.
    """
    counts = {}
    for chromosome in genome:
        for signed_gene in chromosome:
            # Strip the orientation sign so both strands count as the same gene.
            family = signed_gene[1:] if signed_gene[0] == '-' else signed_gene
            counts[family] = counts.get(family, 0) + 1
    return counts
          
def get_adjacency_content(genome):
    """Count the gene-extremity adjacencies present in a genome.

    Parameters
    ----------
    genome : list of list of str
        One inner list per chromosome; genes prefixed with '-' are reversed.

    Returns
    -------
    dict
        ``((gene, ext), (gene, ext))`` -> count, where ``ext`` is 'h' or 't'.
        For consecutive genes ``l, r`` the left extremity is 'h' for a
        forward gene and 't' for a reversed one; the right extremity is 't'
        for a forward gene and 'h' for a reversed one. An adjacency and its
        mirror image are counted under whichever orientation was seen first.
    """
    counts = {}
    for chromosome in genome:
        # Walk every pair of neighbouring genes on this chromosome.
        for left, right in zip(chromosome, chromosome[1:]):
            left_ext = (left, 'h') if left[0] != '-' else (left[1:], 't')
            right_ext = (right, 't') if right[0] != '-' else (right[1:], 'h')
            adjacency = (left_ext, right_ext)
            mirrored = adjacency[::-1]
            if adjacency in counts:
                counts[adjacency] += 1
            elif mirrored in counts:
                counts[mirrored] += 1
            else:
                counts[adjacency] = 1
    return counts

In [12]:
# Load the PMAG gene orders for every (mode, run) pair and derive the gene and
# adjacency content of each reconstructed genome.
PMAG_results = "../PMAG_results"
modes = ["with_L", "without_LT", "without_LT_high_rearr"]
PMAG_overall = {}

failed_runs = []   # (mode, run) pairs whose output file was too short to use
gene_orders, gene_content, adjacency_content = {}, {}, {}
for mode in modes:
    gene_orders[mode], gene_content[mode], adjacency_content[mode] = {}, {}, {}
    for run in range(1, 21):
        # Every run gets an entry, which stays empty when the run is rejected.
        run_orders = gene_orders[mode][run] = {}
        run_genes = gene_content[mode][run] = {}
        run_adjacencies = adjacency_content[mode][run] = {}

        gene_order_file = os.path.join(PMAG_results, mode, "MLGOresult_" + str(run), "geneorder.out")
        gene_order_list = read_file(gene_order_file)

        # NOTE(review): runs with at most 8 usable lines are treated as failed;
        # presumably such files lack the ancestral genomes -- confirm.
        if len(gene_order_list) > 8:
            gene_orders[mode][run] = get_gene_orders(gene_order_list, run_orders)
            for sp in gene_orders[mode][run]:
                run_genes[sp] = get_gene_content(gene_orders[mode][run][sp])
                run_adjacencies[sp] = get_adjacency_content(gene_orders[mode][run][sp])
        else:
            failed_runs.append((mode, run))
        


In [13]:
def get_ZOMBI_orders(order_list, order):
    """Convert tab-separated genome rows into a list of signed genes.

    Parameters
    ----------
    order_list : list of str
        Tab-separated rows. Field 1 is used as the gene identifier and
        field 2 as the orientation (presumably gene family and strand of a
        ZOMBI ``*_GENOME.tsv`` file -- confirm against the file header).
    order : any
        Ignored; a new list is always created. Kept for call compatibility.

    Returns
    -------
    list of str
        One entry per row: the identifier, prefixed with '-' when the
        orientation field equals "-".
    """
    order = []
    for row in order_list:
        fields = row.split("\t")
        family, orientation = fields[1], fields[2]
        order.append("-" + family if orientation == "-" else family)
    return order

In [14]:
# Load the true (ZOMBI) gene order of every species that PMAG reconstructed,
# and derive its gene and adjacency content for comparison.
ZOMBI_folder = "../../../sim"

ZOMBI_gene_orders = {}
ZOMBI_gene_content = {}
ZOMBI_adjacency_content = {}
for mode in modes:
    ZOMBI_gene_orders[mode] = {}
    ZOMBI_gene_content[mode] = {}
    ZOMBI_adjacency_content[mode] = {}
    for run in range(1, 21):
        # Runs rejected while loading the PMAG output have nothing to compare.
        if (mode, run) in failed_runs:
            continue
        ZOMBI_gene_orders[mode][run] = {}
        ZOMBI_gene_content[mode][run] = {}
        ZOMBI_adjacency_content[mode][run] = {}
        for sp in gene_content[mode][run]:
            gene_order_file = os.path.join(ZOMBI_folder, mode, "Run_" + str(run), "G", "Genomes", sp + "_GENOME.tsv")
            gene_order_list = read_file(gene_order_file)
            # The first remaining line is skipped (presumably the column header).
            zombi_order = get_ZOMBI_orders(gene_order_list[1:], gene_orders[mode][run][sp])
            ZOMBI_gene_orders[mode][run][sp] = zombi_order
            # get_ZOMBI_orders returns ONE flat list of genes, while the content
            # helpers expect a list of chromosomes. Wrap it for both calls:
            # passing the flat list made get_gene_content iterate over the
            # characters of each gene string and count digits, not genes.
            ZOMBI_gene_content[mode][run][sp] = get_gene_content([zombi_order])
            ZOMBI_adjacency_content[mode][run][sp] = get_adjacency_content([zombi_order])



In [15]:
# Adjacency precision / recall / F1 of PMAG against the ZOMBI genomes, per run
# and averaged per mode.
def _ratio_or_zero(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


precision, recall, F1_score = {}, {}, {}
avg_prec, avg_rec, avg_F1 = {}, {}, {}
stats = {}
for mode in modes:
    avg_prec[mode], avg_rec[mode], avg_F1[mode] = 0, 0, 0
    stats[mode] = []
    valid_runs = 0
    precision[mode], recall[mode], F1_score[mode] = {}, {}, {}
    for run in range(1, 21):
        # Failed runs keep None so that they show up as missing in the table.
        precision[mode][run], recall[mode][run], F1_score[mode][run] = None, None, None
        if (mode, run) not in failed_runs:
            valid_runs += 1
            count = 0        # adjacencies found by PMAG that are also in ZOMBI
            total_count = 0  # all adjacencies reported by PMAG
            ZOMBI_count = 0  # all adjacencies in the ZOMBI genomes
            for sp, predicted in adjacency_content[mode][run].items():
                truth = ZOMBI_adjacency_content[mode][run][sp]
                for adj, n_predicted in predicted.items():
                    total_count += n_predicted
                    # An adjacency may be stored in either orientation.
                    if adj in truth:
                        count += min(n_predicted, truth[adj])
                    elif adj[::-1] in truth:
                        count += min(n_predicted, truth[adj[::-1]])
                ZOMBI_count += sum(truth.values())

            # Guard the ratios: a run with no shared adjacencies used to raise
            # ZeroDivisionError in the F1 computation (precision + recall == 0).
            precision[mode][run] = _ratio_or_zero(count, total_count)
            recall[mode][run] = _ratio_or_zero(count, ZOMBI_count)
            F1_score[mode][run] = _ratio_or_zero(
                2 * precision[mode][run] * recall[mode][run],
                precision[mode][run] + recall[mode][run],
            )

            avg_prec[mode] += precision[mode][run]
            avg_rec[mode] += recall[mode][run]
            avg_F1[mode] += F1_score[mode][run]

        stats[mode].append([run, precision[mode][run], recall[mode][run], F1_score[mode][run]])

    # Average over completed runs only; NaN when every run of the mode failed.
    if valid_runs:
        avg_prec[mode] = avg_prec[mode] / valid_runs
        avg_rec[mode] = avg_rec[mode] / valid_runs
        avg_F1[mode] = avg_F1[mode] / valid_runs
    else:
        avg_prec[mode] = avg_rec[mode] = avg_F1[mode] = float("nan")
    stats[mode].append(["Overall", avg_prec[mode], avg_rec[mode], avg_F1[mode]])
    PMAG_overall[mode] = ["PMAG", avg_prec[mode], avg_rec[mode], avg_F1[mode]]
    stats[mode] = pd.DataFrame(stats[mode], columns=['Run', 'Precision', 'Recall', 'F1 score'])

### With losses

PMAG/MLGO was implemented to completion on 18 of the 20 inputs. It performs better than the SPP ILP for the most part with a recall of at least 0.7 consistently and a precision of at least 0.75 for most of the runs. The SPP ILP outperforms PMAG in precision, having a precision of at least 0.9 for lower values of $\alpha$. Considering the F1 score, only the case $\alpha=0.5$ (F1 score $= 0.78$) is able to provide some competition for PMAG when gene losses are considered.

In [16]:
# Per-run PMAG precision/recall/F1 for the with_L simulations; failed runs are
# empty and the last row ("Overall") is the mean over completed runs.
stats['with_L']

Unnamed: 0,Run,Precision,Recall,F1 score
0,1,0.077626,0.083641,0.080521
1,2,0.090566,0.093023,0.091778
2,3,0.088028,0.102041,0.094518
3,4,0.125436,0.134663,0.129886
4,5,0.135405,0.137698,0.136542
5,6,0.881563,0.888069,0.884804
6,7,0.829268,0.826087,0.827675
7,8,0.864407,0.867558,0.865979
8,9,0.898876,0.8933,0.89608
9,10,0.884041,0.873016,0.878494


### Without losses

Once again, PMAG/MLGO ran to completion for 18 out of 20 inputs. PMAG consistently provided a precision of 0.74 and a recall of 0.8. However, in this case, the SPP ILP was able to provide better statistics for $\alpha=0.5$ and $0.75$. While the recall for $\alpha=0.75$ was better than for the other choices of the parameter, the average F1 scores for $\alpha \in \{0.25, 0.5, 0.75\}$ were all better than that of PMAG.

In [17]:
# Per-run PMAG precision/recall/F1 for the without_LT simulations; failed runs
# are empty and the last row ("Overall") is the mean over completed runs.
stats['without_LT']

Unnamed: 0,Run,Precision,Recall,F1 score
0,1,,,
1,2,0.764548,0.816444,0.789644
2,3,0.792982,0.832413,0.812219
3,4,0.805629,0.841912,0.823371
4,5,0.801105,0.83815,0.819209
5,6,0.788445,0.831541,0.80942
6,7,0.793814,0.847706,0.819876
7,8,0.747547,0.80888,0.777005
8,9,0.795477,0.829744,0.812249
9,10,0.805683,0.836346,0.820728


In [18]:
# Per-run PMAG precision/recall/F1 for the without_LT_high_rearr simulations;
# the last row ("Overall") is the mean over completed runs.
stats['without_LT_high_rearr']

Unnamed: 0,Run,Precision,Recall,F1 score
0,1,0.008264,0.008604,0.008431
1,2,0.010516,0.010628,0.010572
2,3,0.009634,0.010121,0.009872
3,4,0.008547,0.008555,0.008551
4,5,0.016791,0.017946,0.017349
5,6,0.007,0.007431,0.007209
6,7,0.005666,0.00565,0.005658
7,8,0.013672,0.013527,0.013599
8,9,0.007851,0.008016,0.007933
9,10,0.013035,0.014403,0.013685


# Weighted adjacencies ILP results

In this experiment, we will use the following as input.
1. Species tree (generated by ZOMBI)
2. Gene order for extant genomes (generated by ZOMBI)
3. Gene content for ancestral genomes (obtained through ancestral gene orders generated by ZOMBI)
4. Reconciled gene trees for each gene family (generated by ZOMBI)

Here, we use DeClone for sampling and obtaining adjacency weights. This experiment adds some noise to the reconstruction by using a larger set of potential adjacencies with non-uniform weights.

In this experiment, the species tree topology is the same for all runs of the ILP.

### Experiment set up

#### Simulations

We use two sets of simulations:
1. Without loss and low rates of rearrangement (../../../sim/without_LT)
2. With gene loss as an event and low rates of rearrangement (../../../sim/with_L)

For each set of simulations, we have 10 extant genomes. The Root genome contains 100 gene families. Each set has 200 duplications over all branches of the species tree. The rate of rearrangement for set 1 is 100 inversions and 100 translocations over all branches of the species tree. For set 2, the rate of gene loss is 100 genes over all branches. The exact parameters for the sets can be found under the directory "../../../code/ZOMBI_old" in the folders "no_loss_params" and "with_loss_params" respectively.  

While generating the species tree, the option for extinction of species has been muted. The option for horizontal gene transfer has also been muted for this experiment.

The adjacencies have been obtained through two runs of DeClone, one with temperature 0.1 and another with temperature 1, in order to observe the effect of the number of candidate adjacencies.

### ILP

The ILP is run with the linearization parameter $\alpha \in \{0, 0.25, 0.5, 0.75, 1\}$ for each of the 20 runs. For each combination (Run, $\alpha$), we compare the adjacency sets provided by the ILP to the true adjacencies from ZOMBI genomes. We generate precision-recall statistics and compare them for each $\alpha$ value. For one solution selected by the ILP, we also output the gene order for each species and the cuts and joins involved for the solution.

In [43]:
# ILP output directories, one per (simulation set, DeClone temperature).
with_L_01 = "../output/with_L/temp=0.1"
with_L_1 = "../output/with_L/temp=1"
no_LT_01 = "../output/without_LT/temp=0.1"
no_LT_1 = "../output/without_LT/temp=1"
no_LT_hr_01 = "../output/without_LT_high_rearr/temp=0.1"
no_LT_hr_1 = "../output/without_LT_high_rearr/temp=1"

# Module-level accumulators filled by get_stats. Re-running this cell empties
# them, discarding every result collected so far.
# Precision / recall / F1 per alpha.
prec = {}
rec = {}
F1 = {}
# Values read from the "Overall" lines of the stats files.
dist = {}
cuts = {}
joins = {}
dups = {}
obs = {}
# Values read from the per-branch lines of the stats files.
b_dist = {}
b_cuts = {}
b_joins = {}
b_dups = {}
b_obs = {}

# Per-branch averages (nested dicts) and the same data as DataFrames.
d_per_branch = {}
dist_scores = {}

# Mean precision/recall/F1 per alpha (nested dicts) and as DataFrames.
mean = {}
mean_scores = {}

In [20]:
def update_dict(line, stat_dict, alpha):
    """Append the number at the end of ``line`` to ``stat_dict[alpha]``.

    Parameters
    ----------
    line : str
        Text whose last space-separated token is parsed with ``float``.
    stat_dict : mapping
        ``alpha -> list``; ``stat_dict[alpha]`` must already exist or be
        created on access (e.g. a ``defaultdict(list)``).
    alpha : hashable
        Key selecting the list to extend.

    Returns
    -------
    mapping
        The same ``stat_dict`` object.
    """
    value = float(line.rsplit(" ", 1)[-1])
    stat_dict[alpha].append(value)
    return stat_dict

def append_dist(line, d1, d2, d3, d4, d5, alpha):
    """Append tab-separated fields 1-3 of ``line`` to ``d1``-``d3`` under ``alpha``.

    Parameters
    ----------
    line : str
        Tab-separated text with at least four fields; fields 1, 2 and 3 are
        parsed with ``float``.
    d1, d2, d3 : mapping
        ``alpha -> list`` accumulators receiving field 1, 2 and 3.
    d4, d5 : mapping
        Accepted and returned untouched (fields 4 and 5 are not read).
    alpha : hashable
        Key selecting the lists to extend.

    Returns
    -------
    tuple
        ``(d1, d2, d3, d4, d5)``, the same objects that were passed in.
    """
    fields = line.split("\t")
    # Index all three fields first so a short line fails before any append.
    raw_values = (fields[1], fields[2], fields[3])
    for target, raw in zip((d1, d2, d3), raw_values):
        target[alpha].append(float(raw))
    return d1, d2, d3, d4, d5

def append_b_dist(line, d1, d2, d3, d4, d5, alpha):
    """Append the five per-branch values of ``line`` to ``d1``-``d5``.

    Parameters
    ----------
    line : str
        Tab-separated text with at least six fields: field 0 is the branch
        label, fields 1-5 are parsed with ``float``.
    d1, d2, d3, d4, d5 : mapping
        ``alpha -> {branch -> list}`` accumulators receiving fields 1-5;
        ``d[alpha]`` must exist or be created on access (e.g. a
        ``defaultdict(dict)``).
    alpha : hashable
        Key selecting the per-branch dictionaries to extend.

    Returns
    -------
    tuple
        ``(d1, d2, d3, d4, d5)``, the same objects that were passed in.
    """
    fields = line.split("\t")
    branch = fields[0]
    # Index all five fields first so a short line fails before any mutation.
    raw_values = (fields[1], fields[2], fields[3], fields[4], fields[5])
    targets = (d1, d2, d3, d4, d5)

    # First time this branch is seen for this alpha: open an empty list in
    # every accumulator.
    if branch not in d1[alpha]:
        for target in targets:
            target[alpha][branch] = []
    for target, raw in zip(targets, raw_values):
        target[alpha][branch].append(float(raw))
    return d1, d2, d3, d4, d5


In [21]:
def get_stats(case):
    """Collect the ILP statistics found under ``case`` into the module-level tables.

    Every file below ``case`` whose name contains "stats" is parsed. The
    alpha value of a file is the text after the last "_" in the name of the
    first path component below ``case``.

    Parameters
    ----------
    case : str
        Output directory, e.g. "../output/with_L/temp=1". Its last
        "/"-separated component (``idx``) is the key under which all results
        are stored, so two directories ending in the same component
        overwrite each other's results.

    Side effects
    ------------
    Prints ``idx`` and (re)initialises, for that key, the module-level
    dictionaries ``prec``, ``rec``, ``F1``, ``dist``, ``cuts``, ``joins``,
    ``dups``, ``obs``, ``b_dist``, ``b_cuts``, ``b_joins``, ``b_dups``,
    ``b_obs``, ``d_per_branch``, ``dist_scores``, ``mean`` and
    ``mean_scores``.

    * ``dist_scores[idx][alpha]``: DataFrame of per-branch averages.
    * ``mean_scores[idx]``: DataFrame of mean precision/recall/F1 per alpha,
      sorted by alpha (alpha is kept as a string).
    """
    def _mean(values):
        # NaN rather than ZeroDivisionError when a statistic was never seen.
        return sum(values) / len(values) if values else float("nan")

    idx = case.split("/")[-1]
    print(idx)
    prec[idx], rec[idx], F1[idx] = defaultdict(list), defaultdict(list), defaultdict(list)
    dist[idx], cuts[idx], joins[idx], dups[idx], obs[idx] = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)
    b_dist[idx], b_cuts[idx], b_joins[idx], b_dups[idx], b_obs[idx] = defaultdict(dict), defaultdict(dict), defaultdict(dict), defaultdict(dict), defaultdict(dict)

    files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(case) for f in filenames if "stats" in f]
    for file in files:
        # Take the first path component below `case`. The previous
        # `file.split("/")[4]` only worked when `case` had exactly four
        # "/"-separated components and the OS separator was "/".
        run_dir = os.path.relpath(file, case).split(os.sep)[0]
        alpha = run_dir.split("_")[-1]
        with open(file, 'r') as f:
            for line in f:
                if "Precision" in line:
                    prec[idx] = update_dict(line, prec[idx], alpha)
                elif "Recall" in line:
                    rec[idx] = update_dict(line, rec[idx], alpha)
                elif "F1_score:" in line:
                    F1[idx] = update_dict(line, F1[idx], alpha)
                elif "(" in line and "None" not in line:
                    # append_b_dist reads fields 0-5, so require six fields
                    # (the old `> 3` guard let 4-5 field lines crash there).
                    if len(line.split("\t")) >= 6:
                        b_dist[idx], b_cuts[idx], b_joins[idx], b_dups[idx], b_obs[idx] = append_b_dist(line, b_dist[idx], b_cuts[idx], b_joins[idx], b_dups[idx], b_obs[idx], alpha)
                elif "Overall" in line:
                    dist[idx], cuts[idx], joins[idx], dups[idx], obs[idx] = append_dist(line, dist[idx], cuts[idx], joins[idx], dups[idx], obs[idx], alpha)

    # Average every per-branch quantity over the files seen for each alpha.
    d_per_branch[idx] = {}
    dist_scores[idx] = {}
    for alpha in b_dist[idx]:
        d_per_branch[idx][alpha] = {}
        rows = []
        for branch in b_dist[idx][alpha]:
            averages = {
                'dist': _mean(b_dist[idx][alpha][branch]),
                'cuts': _mean(b_cuts[idx][alpha][branch]),
                'joins': _mean(b_joins[idx][alpha][branch]),
                'dups': _mean(b_dups[idx][alpha][branch]),
                'observed_dups': _mean(b_obs[idx][alpha][branch]),
            }
            d_per_branch[idx][alpha][branch] = averages
            rows.append([branch, averages['dist'], averages['cuts'], averages['joins'], averages['dups'], averages['observed_dups']])
        dist_scores[idx][alpha] = pd.DataFrame(rows, columns=['branch', 'SCJTDFD', 'Cuts', 'Joins', 'Dups', 'Observed'])

    # Mean precision / recall / F1 per alpha.
    mean[idx] = {}
    rows = []
    for alpha in prec[idx]:
        mean[idx][alpha] = {
            'precision': _mean(prec[idx][alpha]),
            'recall': _mean(rec[idx][alpha]),
            'f1_score': _mean(F1[idx][alpha]),
        }
        rows.append([alpha, mean[idx][alpha]['precision'], mean[idx][alpha]['recall'], mean[idx][alpha]['f1_score']])

    # Naming the columns up front keeps the sort valid even when no stats
    # file was found (an unnamed empty frame has no 'alpha' column).
    mean_scores[idx] = pd.DataFrame(rows, columns=['alpha', 'Precision', 'Recall', 'F1 score']).sort_values(by=['alpha'])

## No Loss and Transfer

### Mean statistics for temp = 1

The following table lists the average precision, recall and F1 score over 20 runs. The precision is best for lower values of $\alpha$. On the other hand the recall is progressively lower for lower values of $\alpha$. The best performance occurs at $\alpha=0.5$ with an F1-score of 0.87 followed closely by $\alpha=0.75$ with an F1-score of 0.86. With the exception of $\alpha=1$ the precision is consistently above 0.9.

In [22]:
# Load the ILP statistics of the without_LT simulations at temperature 1.
# idx is the key get_stats stored them under (the last path component); the
# cells below read dist_scores[idx] / mean_scores[idx], so they must run
# before another get_stats call replaces idx or that key.
get_stats(no_LT_1)
idx = no_LT_1.split("/")[-1]

temp=1


In [23]:
# Mean precision / recall / F1 per alpha for the directory loaded just above.
mean_scores[idx]

Unnamed: 0,alpha,Precision,Recall,F1 score
3,0.0,0.963329,0.625302,0.757818
1,0.25,0.959467,0.741959,0.836541
2,0.5,0.959418,0.799441,0.871881
4,0.75,0.901772,0.834631,0.866761
0,1.0,0.749914,0.72994,0.739777


In [24]:
# Show the ILP means with PMAG's overall without_LT scores as an extra row.
# PMAG_overall[mode] is ["PMAG", precision, recall, F1].
prec_1, rec_1, F1_1 = PMAG_overall["without_LT"][1:4]
pmag_row = pd.DataFrame([{"alpha": 'PMAG', "Precision": prec_1, "Recall": rec_1, "F1 score": F1_1}])
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table
# without modifying mean_scores[idx].
pd.concat([mean_scores[idx], pmag_row], ignore_index=True)

Unnamed: 0,alpha,Precision,Recall,F1 score
0,0,0.963329,0.625302,0.757818
1,0.25,0.959467,0.741959,0.836541
2,0.5,0.959418,0.799441,0.871881
3,0.75,0.901772,0.834631,0.866761
4,1,0.749914,0.72994,0.739777
5,PMAG,0.790318,0.837004,0.812954


### Branch wise distance for $\alpha=0$ and temp $=1$ for without_LT runs

In [25]:
# Per-branch averages for alpha "0" (alpha keys are strings), sorted by branch label.
dist_scores[idx][str(0)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",50.05,39.3,7.85,2.9,1.45
13,"(Root,n2)",172.0,0.0,111.0,58.6,28.1
8,"(n1,n3)",19.5,6.45,8.95,3.6,1.55
16,"(n1,n4)",57.85,5.1,30.35,22.45,11.25
7,"(n12,n15)",66.7,4.5,57.4,6.05,3.65
11,"(n12,n16)",67.15,4.3,58.75,6.2,4.15
9,"(n3,n5)",201.45,7.05,140.8,50.7,23.9
10,"(n3,n6)",117.05,5.95,78.9,33.9,17.8
15,"(n4,n7)",73.95,4.5,49.15,18.5,8.35
3,"(n4,n8)",22.5,2.3,18.0,2.25,1.15


### Branch wise distance for $\alpha=0.25$ and temp $=1$ for without_LT runs

In [26]:
# Per-branch averages for alpha "0.25", sorted by branch label.
dist_scores[idx][str(0.25)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",51.7,39.45,10.25,2.9,1.9
13,"(Root,n2)",172.15,10.55,100.6,58.6,28.1
8,"(n1,n3)",20.75,7.45,9.3,3.6,1.6
16,"(n1,n4)",72.7,10.5,42.3,22.45,12.5
7,"(n12,n15)",66.55,8.15,53.6,6.05,3.65
11,"(n12,n16)",67.2,8.05,55.05,6.2,4.15
9,"(n3,n5)",201.5,18.35,129.55,50.7,23.9
10,"(n3,n6)",118.2,14.9,71.1,33.9,17.8
15,"(n4,n7)",71.15,10.05,41.1,18.5,8.5
3,"(n4,n8)",17.9,4.85,10.85,2.25,1.15


### Branch wise distance for $\alpha=0.5$ and temp $=1$ for without_LT runs

In [27]:
# Per-branch averages for alpha "0.5", sorted by branch label.
dist_scores[idx][str(0.5)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",27.55,17.5,8.05,2.9,1.9
13,"(Root,n2)",216.65,35.15,120.5,58.6,28.1
8,"(n1,n3)",31.0,19.4,7.6,3.6,1.6
16,"(n1,n4)",72.5,15.85,37.05,22.45,12.65
7,"(n12,n15)",66.7,8.6,53.3,6.05,3.65
11,"(n12,n16)",67.05,8.35,54.6,6.2,4.15
9,"(n3,n5)",207.7,26.85,127.25,50.7,23.9
10,"(n3,n6)",115.45,17.2,65.95,33.9,17.75
15,"(n4,n7)",76.15,17.1,39.05,18.5,8.5
3,"(n4,n8)",20.15,9.05,8.9,2.25,1.15


### Branch wise distance for $\alpha=0.75$ and temp $=1$ for without_LT runs

In [28]:
# Per-branch averages for alpha "0.75", sorted by branch label.
dist_scores[idx][str(0.75)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",18.55,8.05,9.1,2.9,2.2
13,"(Root,n2)",240.55,48.9,130.65,58.6,28.1
8,"(n1,n3)",39.35,17.9,18.25,3.6,2.0
16,"(n1,n4)",81.9,20.65,43.45,22.45,13.55
7,"(n12,n15)",67.7,10.8,52.1,6.05,3.65
11,"(n12,n16)",68.65,10.85,53.7,6.2,4.15
9,"(n3,n5)",229.4,50.8,125.0,50.7,23.9
10,"(n3,n6)",132.35,31.9,68.75,33.9,18.05
15,"(n4,n7)",84.65,23.6,41.25,18.5,8.6
3,"(n4,n8)",30.4,13.85,14.45,2.25,1.2


### Branch wise distance for $\alpha=1$ and temp $=1$ for without_LT runs

In [29]:
# Per-branch averages for alpha "1", sorted by branch label.
dist_scores[idx][str(1)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",668.15,628.65,38.2,2.9,2.25
13,"(Root,n2)",848.9,650.2,137.7,58.6,28.1
8,"(n1,n3)",738.0,690.9,43.6,3.6,1.85
16,"(n1,n4)",764.95,691.45,54.9,22.45,13.15
7,"(n12,n15)",1875.4,1813.9,56.7,6.05,3.65
11,"(n12,n16)",1880.15,1815.35,60.7,6.2,4.15
9,"(n3,n5)",959.55,775.95,130.0,50.7,23.9
10,"(n3,n6)",863.2,757.85,74.95,33.9,18.7
15,"(n4,n7)",1237.75,1163.85,54.9,18.5,9.0
3,"(n4,n8)",1212.75,1163.4,47.35,2.25,1.25


### Mean statistics for temp=0.1

The performance in the presence of limited candidate adjacencies is very poor. Although the precision is always above 0.9, the recall is very low. The performance increases gradually as we increase $\alpha$, albeit by a very small margin.

In [30]:
# Load the ILP statistics of the without_LT simulations at temperature 0.1;
# idx becomes the key ("temp=0.1") used by the cells below.
get_stats(no_LT_01)
idx = no_LT_01.split("/")[-1]

temp=0.1


In [31]:
# Mean precision / recall / F1 per alpha for the directory loaded just above.
mean_scores[idx]

Unnamed: 0,alpha,Precision,Recall,F1 score
3,0.0,0.967348,0.308896,0.467231
2,0.25,0.90434,0.369917,0.524011
1,0.5,0.904552,0.371745,0.525847
4,0.75,0.905128,0.373733,0.52793
0,1.0,0.901608,0.377297,0.530899


In [32]:
# Show the ILP means with PMAG's overall without_LT scores as an extra row.
# PMAG_overall[mode] is ["PMAG", precision, recall, F1].
prec_01, rec_01, F1_01 = PMAG_overall["without_LT"][1:4]
pmag_row = pd.DataFrame([{"alpha": 'PMAG', "Precision": prec_01, "Recall": rec_01, "F1 score": F1_01}])
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table
# without modifying mean_scores[idx].
pd.concat([mean_scores[idx], pmag_row], ignore_index=True)

Unnamed: 0,alpha,Precision,Recall,F1 score
0,0,0.967348,0.308896,0.467231
1,0.25,0.90434,0.369917,0.524011
2,0.5,0.904552,0.371745,0.525847
3,0.75,0.905128,0.373733,0.52793
4,1,0.901608,0.377297,0.530899
5,PMAG,0.790318,0.837004,0.812954


### Branch wise distance for $\alpha=0$ and temp $=0.1$ for without_LT runs

In [33]:
# Per-branch averages for alpha "0" (alpha keys are strings), sorted by branch label.
dist_scores[idx][str(0)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",28.65,22.45,0.5,2.9,0.05
13,"(Root,n2)",199.4,0.0,138.4,58.6,28.1
8,"(n1,n3)",17.5,1.05,10.15,3.6,0.45
16,"(n1,n4)",48.05,2.15,7.0,22.45,3.0
7,"(n12,n15)",94.2,0.55,88.85,6.05,3.65
11,"(n12,n16)",93.4,0.6,88.7,6.2,4.15
9,"(n3,n5)",202.85,0.55,148.7,50.7,23.9
10,"(n3,n6)",112.25,1.0,61.15,33.9,8.85
15,"(n4,n7)",70.65,0.55,40.5,18.5,3.7
3,"(n4,n8)",23.55,0.8,18.85,2.25,0.3


### Branch wise distance for $\alpha=0.25$ and temp $=0.1$ for without_LT runs

In [34]:
# Per-branch averages for alpha "0.25", sorted by branch label.
dist_scores[idx][str(0.25)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",62.9,56.95,0.45,2.9,0.15
13,"(Root,n2)",158.3,2.0,95.3,58.6,28.1
8,"(n1,n3)",19.7,1.95,11.45,3.6,0.45
16,"(n1,n4)",63.05,8.6,15.55,22.45,3.0
7,"(n12,n15)",93.65,3.25,85.6,6.05,3.65
11,"(n12,n16)",93.95,3.85,86.0,6.2,4.15
9,"(n3,n5)",197.2,3.2,140.4,50.7,23.9
10,"(n3,n6)",116.2,7.35,58.75,33.9,8.85
15,"(n4,n7)",71.05,2.25,39.2,18.5,3.7
3,"(n4,n8)",20.8,1.75,15.15,2.25,0.3


### Branch wise distance for $\alpha=0.5$ and temp $=0.1$ for without_LT runs

In [35]:
# Per-branch averages for alpha "0.5", sorted by branch label.
dist_scores[idx][str(0.5)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",62.9,56.95,0.45,2.9,0.15
13,"(Root,n2)",158.35,2.05,95.3,58.6,28.1
8,"(n1,n3)",19.9,1.95,11.65,3.6,0.45
16,"(n1,n4)",63.2,8.65,15.65,22.45,3.0
7,"(n12,n15)",93.9,3.6,85.5,6.05,3.65
11,"(n12,n16)",94.0,4.1,85.8,6.2,4.15
9,"(n3,n5)",197.15,3.3,140.25,50.7,23.9
10,"(n3,n6)",116.65,7.45,59.1,33.9,8.85
15,"(n4,n7)",71.5,2.25,39.65,18.5,3.7
3,"(n4,n8)",20.95,1.8,15.25,2.25,0.3


### Branch wise distance for $\alpha=0.75$ and temp $=0.1$ for without_LT runs

In [36]:
# Per-branch averages for alpha "0.75", sorted by branch label.
dist_scores[idx][str(0.75)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",62.9,56.95,0.45,2.9,0.15
13,"(Root,n2)",158.35,2.05,95.3,58.6,28.1
8,"(n1,n3)",19.95,1.95,11.7,3.6,0.45
16,"(n1,n4)",63.25,8.65,15.7,22.45,3.0
7,"(n12,n15)",93.95,3.65,85.5,6.05,3.65
11,"(n12,n16)",94.05,4.15,85.8,6.2,4.15
9,"(n3,n5)",197.1,3.3,140.2,50.7,23.9
10,"(n3,n6)",117.5,7.4,60.0,33.9,8.85
15,"(n4,n7)",72.0,2.2,40.2,18.5,3.7
3,"(n4,n8)",20.95,1.8,15.25,2.25,0.3


### Branch wise distance for $\alpha=1$ and temp $=0.1$ for without_LT runs

In [37]:
# Per-branch averages for alpha "1", sorted by branch label.
dist_scores[idx][str(1)].sort_values(by=['branch'])

Unnamed: 0,branch,SCJTDFD,Cuts,Joins,Dups,Observed
5,"(Root,n1)",128.7,121.7,1.5,2.9,0.15
13,"(Root,n2)",308.5,152.1,95.4,58.6,28.1
8,"(n1,n3)",55.4,36.9,12.2,3.6,0.45
16,"(n1,n4)",92.1,37.05,16.15,22.45,3.0
7,"(n12,n15)",258.8,168.7,85.3,6.05,3.65
11,"(n12,n16)",259.2,169.35,85.75,6.2,4.15
9,"(n3,n5)",348.9,155.35,139.95,50.7,23.9
10,"(n3,n6)",198.75,88.65,60.0,33.9,8.85
15,"(n4,n7)",134.95,64.7,40.65,18.5,3.7
3,"(n4,n8)",62.0,42.5,15.6,2.25,0.3


## With loss

When gene loss is allowed as an event, the recall drops significantly compared to the previous case. The precision, however, is still above 90% for lower values of $\alpha$. The F1 score is the highest for $\alpha=0.5$ and gets progressively lower in either direction away from $\alpha=0.5$.

In [38]:
# Load the ILP statistics of the with_L simulations at temperature 1.
# NOTE: idx is again "temp=1", so this replaces the without_LT tables that
# were stored under the same key.
get_stats(with_L_1)
idx = with_L_1.split("/")[-1]

temp=1


In [39]:
# Show the ILP means with PMAG's overall with_L scores as an extra row.
# PMAG_overall[mode] is ["PMAG", precision, recall, F1].
prec_1, rec_1, F1_1 = PMAG_overall["with_L"][1:4]
pmag_row = pd.DataFrame([{"alpha": 'PMAG', "Precision": prec_1, "Recall": rec_1, "F1 score": F1_1}])
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table
# without modifying mean_scores[idx].
pd.concat([mean_scores[idx], pmag_row], ignore_index=True)

Unnamed: 0,alpha,Precision,Recall,F1 score
0,0,0.987533,0.280466,0.40438
1,0.25,0.977136,0.333979,0.450028
2,0.5,0.945311,0.362714,0.4698
3,0.75,0.802328,0.386527,0.47525
4,1,0.489535,0.337849,0.386053
5,PMAG,0.62039,0.612789,0.616466


In [44]:
# Load the ILP statistics of the with_L simulations at temperature 0.1.
# NOTE: idx is again "temp=0.1", so this replaces the without_LT tables that
# were stored under the same key.
get_stats(with_L_01)
idx = with_L_01.split("/")[-1]

temp=0.1


In [45]:
# Show the ILP means with PMAG's overall with_L scores as an extra row.
# PMAG_overall[mode] is ["PMAG", precision, recall, F1].
prec_01, rec_01, F1_01 = PMAG_overall["with_L"][1:4]
pmag_row = pd.DataFrame([{"alpha": 'PMAG', "Precision": prec_01, "Recall": rec_01, "F1 score": F1_01}])
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table
# without modifying mean_scores[idx].
pd.concat([mean_scores[idx], pmag_row], ignore_index=True)

Unnamed: 0,alpha,Precision,Recall,F1 score
0,0,0.977972,0.358735,0.522054
1,0.25,0.949324,0.432777,0.592571
2,0.5,0.906467,0.481115,0.625898
3,0.75,0.85944,0.509042,0.637059
4,1,0.839318,0.508381,0.630779
5,PMAG,0.62039,0.612789,0.616466


## Without gene loss but high rearrangement

In [24]:
# Load the ILP statistics of the without_LT_high_rearr simulations at
# temperature 1; idx is "temp=1", replacing earlier tables under that key.
get_stats(no_LT_hr_1)
idx = no_LT_hr_1.split("/")[-1]

temp=1


In [25]:
# Mean precision / recall / F1 per alpha for the directory loaded just above.
mean_scores[idx]

Unnamed: 0,alpha,Precision,Recall,F1 score
4,0.0,0.989347,0.149871,0.259944
3,0.25,0.940316,0.278344,0.429051
1,0.5,0.782642,0.357705,0.490809
2,0.75,0.518967,0.453326,0.483919
0,1.0,0.436054,0.431968,0.434001


In [22]:
# Load the ILP statistics of the without_LT_high_rearr simulations at
# temperature 0.1; idx is "temp=0.1", replacing earlier tables under that key.
get_stats(no_LT_hr_01)
idx = no_LT_hr_01.split("/")[-1]

temp=0.1


In [23]:
# Mean precision / recall / F1 per alpha for the directory loaded just above.
mean_scores[idx]

Unnamed: 0,alpha,Precision,Recall,F1 score
2,0.0,0.990465,0.112789,0.202195
4,0.25,0.982778,0.179904,0.303835
0,0.5,0.983039,0.182442,0.307446
3,0.75,0.98312,0.183244,0.308591
1,1.0,0.982175,0.183694,0.309193
