# Testing Notebook: Building the FPR/TPR matrices for Baum-Welch performance across multiple simulated reps

---
## Purpose of Notebook
(Predone) Locally download a few sample rep ids and construct two tables for each of them measuring the True Positive Rate and False Positive Rate across iterations of Baum-Welch, facilitating averaging.

In [77]:
# Import packages.
import sys
import numpy as np
import gzip
import math

# Print versions of our libraries.
print('numpy', np.__version__)
print('gzip', np.__version__)
print('math', np.__version__)

numpy 1.22.3
gzip 1.22.3
math 1.22.3


---
## Function Descriptions

---
### `post_convergence_nan()`
#### Purpose:
`post_convergence_nan()` converts all values in a results file after Baum-Welch has converged to np.nan values to faciliate averaging later by starting at the rightmost column and moving left until it encounters a column with a nonzero value.
#### Input:
- `rep_filepath`: a filepath to a rep that needs to have its post-convergence values converted from 0 to np.nan
#### Output:
- `nan_results`: a nparray of the exact same results, but with all post-convergnece 0 values converted to np.nan
- `convergence_index`: the first column in the array containing nan values


# NOTE: Long-term fix is to actually convert the values to Nan in the Hmm function itself

In [78]:
def post_convergence_nan(rep_filepath):
    rep = np.genfromtxt(rep_filepath, 
                    delimiter='\t', 
                   )
    
    convergence_index = -1
    # iterates over each column of the array in reverse order (right to left)
    for c in range (len(rep[0])-1, -1, -1):
        # print(rep[:, c])
        # print(convergence_index)  
        # print(not np.all(rep[:, c]==0))
        if not np.all(rep[:, c]==0) and convergence_index == -1:
            # TODO: set the convergence index as the column to the right of where the first nonzero values are found
            convergence_index = c + 1
    
    # go through the matrix from the convergence_index column to the right and fill with nans
    for n in range(convergence_index, len(rep[0])):
        rep[:, n] = np.nan

    # TODO: convert all values in the array from that column on to np.nan
    return rep

---
### `cross_rep_performance()`
#### Purpose:
`cross_rep_performance()` creates a table measuring the total True Positive Rate of a given iteration of Baum-Welch across simulations.
#### Input:
- `rep_array`: an array of nparrays representing the reps (assuming they have already been converted to nan
#### Output:
- `tpr_array`: an nparray with dimensions `(R X B)`, where `R` is the number of reps being measured and `B` is the Baum-Welch optimization limit set when the HMM was run (in this case 100). The value at each position corresponds to the overall True Positive Rate given by $\gamma$ after `B` many iterations of Baum-Welch. TPR values in columns after convergence will be equivalent to `np.nan`.
- `fpr_array`: same as above, but for the False Positive Rate instead.

In [83]:
def cross_rep_performance(rep_array):
    # NOTE: Should threshold be an input?
    threshold = .9
    
    # Tpr array has dimensions (# of reps X # of max baum-welch iterations)
    num_reps = len(rep_array)
    max_iter = len(rep_array[0][0])-4
    tpr_array = np.zeros((num_reps, max_iter))
    fpr_array = np.zeros((num_reps, max_iter))
    
    for rep in range(0, num_reps):
        for gamma in range(4, max_iter+4):
            # set up the false positives and false negatives for this gamma
            fp = 0  # number of false positives
            fn = 0  # number of false negatives
            tp = 0  # number of true postivies
            tn = 0  # number of true negatives
            # for each window in the total number (should be 40k)
            for w in range(0, len(rep_array[0])):
                # the percentage of true introgression at that window
                true_val = rep_array[rep][w][2]
                # the percentage introgression guessed by the model at that gamma
                gamma_val = rep_array[rep][w][gamma]
                
                # we have to check to see if we're post-convergence, in which case the performance values are not applicable
                post_convergence = False
                if math.isnan(gamma_val):
                    post_convergence = True
                    break

                # Underlying window is partially or completely introgressed
                if 0 < true_val <= 1.:
                    # true positive
                    if gamma_val >= threshold:
                        tp += 1
                    # false negative
                    else:
                        fn += 1
                # Underlying window is not 100% introgressed
                elif true_val == 0:
                    # false positive
                    if gamma_val >= threshold:
                        fp += 1
                    # true negative
                    else:
                        tn += 1
                else:
                    print("Error in eval: window shows introgression percentage below zero or above 1")
            
            # once the loop is over, calculate the TPR/FPR
            # no relevant information for this loop, populate with nans.
            if post_convergence:
                fpr_array[rep][gamma-4] = np.nan
                tpr_array[rep][gamma-4] = np.nan
            # normal case
            else:
                fpr = fp / (fp + tn)
                tpr = tp / (tp + fn)
                fpr_array[rep][gamma-4] = fpr
                tpr_array[rep][gamma-4] = tpr

    return fpr_array, tpr_array

---
### `average_performances()`
#### Purpose:
`average_performances()` will take the average of all True Positive Rates or False Positive Rates of different Baum-Welch iteration steps across reps in order to evaluate the accuracy of the model. It is `np.nan`-agnostic, meaning it will simply ignore the performances of those that have already converged into its averages.
#### Input:
- `performance_rates()`: an nparray with dimensions `(R X B)`, where `R` is the number of reps being measured and `B` is the Baum-Welch optimization limit set when the HMM was run (in this case 100). The value at each position corresponds to the overall False/True Positive Rate given by $\gamma$ after `B` many iterations of Baum-Welch. FPR/TPR values in columns after convergence will be equivalent to `np.nan`.
#### Output:
- `averaged_performances`: an nparray with dimensions `(1 X B)`, where `B` is the Baum-Welch optimization limit set when the HMM was run (in this case 100). The value at each position corresponse to the average False/True Postive Rate measured across all calculated reps.

In [None]:
def average_performances(performance_rates):
    
    
    return averaged_performances

---
## Walkthrough

In [60]:
# (Hardcoded) download the necessary files - reps are represented in nparray form after post_convergence_nan()
rep1 = post_convergence_nan('/Users/briankirz/Documents/GitHub/mentee_research/kirz/site_pattern_hmm/results_testing/local_test_reps/prufer_results_rep_id_1.csv.gz')
rep2 = post_convergence_nan('/Users/briankirz/Documents/GitHub/mentee_research/kirz/site_pattern_hmm/results_testing/local_test_reps/prufer_results_rep_id_2.csv.gz')
rep3 = post_convergence_nan('/Users/briankirz/Documents/GitHub/mentee_research/kirz/site_pattern_hmm/results_testing/local_test_reps/prufer_results_rep_id_3.csv.gz')
rep4 = post_convergence_nan('/Users/briankirz/Documents/GitHub/mentee_research/kirz/site_pattern_hmm/results_testing/local_test_reps/prufer_results_rep_id_4.csv.gz')
# Assemble the test reps into an array
reps = np.array([rep1, rep2, rep3, rep4])


In [84]:
    rep = np.genfromtxt('/Users/briankirz/Documents/GitHub/mentee_research/kirz/site_pattern_hmm/results_testing/local_test_reps/prufer_results_rep_id_4.csv.gz', delimiter='\t')

In [65]:
# print(len(reps[0]))
# print(reps[0].shape)

40000
(40000, 104)


In [68]:
# print(reps[0][0][64])

nan


In [81]:
cross_rep_performance(reps)[0]

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 5.04286435e-05, 5.04286435e-05,
        5.04286435e-05, 5.04286435e-05, 1.00857287e-04, 1.00857287e-04,
        1.00857287e-04, 1.26071609e-04, 7.81643974e-04, 8.82501261e-04,
        1.13464448e-03, 1.71457388e-03, 2.11800303e-03, 2.39536056e-03,
        3.05093293e-03, 3.70650530e-03, 4.10993444e-03, 4.38729198e-03,
        4.56379223e-03, 4.81593545e-03, 5.01765003e-03, 5.54715078e-03,
        6.07665154e-03, 6.35400908e-03, 6.40443772e-03, 6.50529501e-03,
        6.75743822e-03, 6.98436712e-03, 7.03479576e-03, 7.26172466e-03,
        7.53908220e-03, 7.69036813e-03, 7.71558245e-03, 7.84165406e-03,
        7.91729702e-03, 8.06858296e-03, 8.16944024e-03, 8.19465456e-03,
        8.24508321e-03, 8.29551185e-03, 8.39636914e-03, 8.44679778e-03,
        8.64851236e-03, 8.69894100e-03, 8.72415532e-03, 8.74936964e-03,
        8.74936964e-03, 8.74936964e-03, 8.95108422e-03, 9.026727

(4, 40000, 104)