In [1]:
import sys
import numpy as np
import gzip
import matplotlib.pyplot as plt

In [2]:
# Define a function to split a genotype matrix into non-overlapping windows.
def genotype_matrix_windows(
        variant_positions,
        polarized_genotype_matrix,
        window_size=500,
        sequence_length=20000000,
):
    """Split a sequence into non-overlapping windows and bin variants into them.

    Parameters
    ----------
    variant_positions : sequence or array of numbers
        Position of each variant; element ``i`` is the position of variant ``i``.
        May be empty. Order does not matter.
    polarized_genotype_matrix : any
        Not used by this function; kept in the signature for existing callers.
    window_size : int, default 500
        Width of every window (converted with ``int()``).
    sequence_length : int, default 20000000
        A window is created for every start ``0, window_size, 2 * window_size, ...``
        that is strictly below this value (converted with ``int()``).

    Returns
    -------
    dict
        ``{window number (1-based): [start, stop, variant_index, ...]}``.
        A variant at position ``p`` is listed in the window with
        ``start <= p < stop``. Windows without variants hold only
        ``[start, stop]``. Positions outside every window are ignored.
    """
    # Normalise once so the range() step and the stored stop agree.
    window_size = int(window_size)
    sequence_length = int(sequence_length)

    # Window number -> [start, stop]; numbering starts at 1.
    windows = {
        key: [window_start, window_start + window_size]
        for key, window_start in enumerate(
            range(0, sequence_length, window_size), start=1
        )
    }
    # Exclusive upper bound of the last window (0 when there are no windows).
    upper_bound = len(windows) * window_size

    # atleast_1d: a single position loaded from file arrives as a 0-d array.
    positions = np.atleast_1d(np.asarray(variant_positions)).ravel()

    # Bin every variant independently so that one out-of-order or
    # out-of-range position cannot hide the variants that follow it.
    for index, pos in enumerate(positions):
        # Written as a negated chain so NaN is skipped as well.
        if not (0 <= pos < upper_bound):
            continue
        key = int(pos // window_size) + 1
        if key in windows:
            windows[key].append(index)
    return windows

In [3]:
def calc_window_intro_percent(Binned_windows, true_introgression_positions):
    """Compute the fraction of every window covered by introgressed segments.

    Parameters
    ----------
    Binned_windows : dict
        ``{window number (1-based): [start, stop, ...]}``. The windows are
        assumed to be equally wide, contiguous, and to start at position 0
        with window 1 (the width is read from the first entry).
    true_introgression_positions : array-like
        One row per introgressed segment with the start in column 0 and the
        stop in column 1. A single segment given as a 1-D ``[start, stop]``
        array and an empty array are both accepted.

    Returns
    -------
    dict
        ``{window number: covered fraction}`` for every key of
        ``Binned_windows``; windows untouched by any segment map to ``0.``.

    Raises
    ------
    KeyError
        If a segment starts in a window that is not in ``Binned_windows``.
    """
    Windows = Binned_windows

    # Every window starts at 0 coverage.
    Win_intro_percent = {key: 0. for key in Windows}

    # np.loadtxt squeezes a one-row file to 1-D and yields an empty array for
    # an empty file; normalise both so the column indexing below is valid.
    true_intro_pos = np.asarray(true_introgression_positions)
    if true_intro_pos.size == 0:
        return Win_intro_percent
    true_intro_pos = np.atleast_2d(true_intro_pos)

    # Take the window width from the windows themselves instead of assuming
    # 500, so the fractions stay correct for any window size.
    if Windows:
        first_window = next(iter(Windows.values()))
        window_size = first_window[1] - first_window[0]
    else:
        window_size = 500
    n_windows = len(Windows)

    # Round, then sort starts and stops independently; element t of each
    # array is treated as one segment.
    intro_starts = np.sort(np.round(true_intro_pos[:, 0]))
    intro_stops = np.sort(np.round(true_intro_pos[:, 1]))
    intro_sizes = intro_stops - intro_starts
    # 1-based number of the window containing each start / stop.
    start_windows = intro_starts // window_size + 1
    stop_windows = intro_stops // window_size + 1

    for t in range(intro_sizes.shape[0]):
        start_win = int(start_windows[t])
        stop_win = int(stop_windows[t])

        if start_win == stop_win:
            # Segment lies inside a single window.
            Win_intro_percent[start_win] += intro_sizes[t] / window_size
        else:
            # Windows strictly between the first and last are fully covered.
            # Clamp to the last real window so a segment running past the
            # end of the sequence does not create keys that are not windows.
            for f in range(start_win + 1, min(stop_win, n_windows + 1)):
                Win_intro_percent[f] = 1.

            # Partial coverage of the window where the segment starts.
            Win_intro_percent[start_win] += (Windows[start_win][1] - intro_starts[t]) / window_size
            # Partial coverage of the window where the segment stops; a stop
            # exactly at the sequence end maps to a window that does not exist.
            if stop_win <= n_windows:
                Win_intro_percent[stop_win] += (intro_stops[t] - Windows[stop_win][0]) / window_size

    return Win_intro_percent


In [4]:
# Extracts the observed sequence (binned)
def extract_O(variable_positions, polarized_genotype_matrix, true_introgression_positions, w_threshold, pattern, dxy, verbose=True):
    """Build the binned observed sequence ('C' / 'N' per window) for one replicate.

    Parameters
    ----------
    variable_positions : file path readable by ``np.loadtxt``
        Comma-delimited variant positions.
    polarized_genotype_matrix : file path readable by ``np.loadtxt``
        Comma-delimited integer genotype matrix; rows are indexed by variant.
    true_introgression_positions : file path readable by ``np.loadtxt``
        Comma-delimited introgressed segments (start in column 0, stop in
        column 1).
    w_threshold : float or str
        Minimum proportion of a window's variant sites that must match the
        chosen pattern for the window to be labelled 'C'.
    pattern : {"patterna", "patternb", "patternc"}
        Site pattern(s) counted as consistent: ``0011``, ``1100``, or either.
    dxy : any
        Accepted for interface compatibility; not used yet (see TODO below).
    verbose : bool, default True
        When true, print the label, consistent-site proportion and genotype
        rows of every window that contains variants.

    Returns
    -------
    tuple
        ``(obs_seq_array, Wip, Windows)``: the array of 'C'/'N' labels (one
        per window, in window order), the per-window introgressed fraction
        from ``calc_window_intro_percent``, and the windows dictionary from
        ``genotype_matrix_windows``.

    Raises
    ------
    ValueError
        If ``pattern`` is not one of the three supported names.
    """
    # Site patterns consistent with introgression, built once for all windows.
    c_pattern_a = np.array([0, 0, 1, 1])
    c_pattern_b = np.array([1, 1, 0, 0])
    patterns_by_name = {
        "patterna": (c_pattern_a,),              # 0011
        "patternb": (c_pattern_b,),              # 1100
        "patternc": (c_pattern_a, c_pattern_b),  # 0011 or 1100
    }
    # Fail fast: an unknown pattern would otherwise label every window 'N'
    # and still return a plausible-looking result.
    if pattern not in patterns_by_name:
        raise ValueError(
            "Invalid Pattern: {0!r} (expected one of {1})".format(
                pattern, sorted(patterns_by_name)
            )
        )
    consistent_patterns = patterns_by_name[pattern]

    # ndmin keeps the expected dimensionality when a file holds a single
    # value / single row, which np.loadtxt would otherwise squeeze away.
    # Load the variant positions.
    var_pos = np.loadtxt(variable_positions, delimiter=',', ndmin=1)
    # Load the genotype matrix.
    pol_geno_mat = np.loadtxt(polarized_genotype_matrix, dtype=int, delimiter=',', ndmin=2)
    # Load the introgressed segments.
    true_intro_pos = np.loadtxt(true_introgression_positions, delimiter=',', ndmin=2)
    # Proportion of consistent sites required to label a window 'C'.
    window_threshold = float(w_threshold)

    # Windows maps window number (1-40,000 for 500 bp windows over 20 Mb) to
    # [start position, stop position, (optional) variant indices...].
    Windows = genotype_matrix_windows(var_pos, pol_geno_mat, window_size=500, sequence_length=20_000_000)
    Wip = calc_window_intro_percent(Windows, true_intro_pos)

    # TODO: dxy is accepted but not used yet.
    # TODO IF TIME: plot the distribution of the number of variant sites per window.

    # EXTRACTING OBSERVED SEQUENCE
    obs_seq = []
    for key in Windows:
        # Everything after [start, stop] is a variant index in this window.
        variants = np.asarray(Windows[key][2:], dtype=np.int32)

        # A window without variants is declared non-consistent.
        if variants.size == 0:
            obs_seq.append('N')
            continue

        # Genotype rows of the variants that fall in this window.
        window_geno_mat = pol_geno_mat[variants, :]
        # Number of sites whose genotype row equals an accepted pattern.
        c_sites_tally = sum(
            any(np.array_equal(candidate, site) for candidate in consistent_patterns)
            for site in window_geno_mat
        )

        # Label the window by comparing the proportion with the threshold.
        c_site_proportion = c_sites_tally / len(variants)
        label = 'C' if c_site_proportion >= window_threshold else 'N'
        obs_seq.append(label)

        if verbose:
            print(label)
            print('C site proportion: ' + str(c_site_proportion*100) + '%')
            print(window_geno_mat)
            print('---------------')

    # Convert the observation sequence list to an array.
    obs_seq_array = np.asarray(obs_seq)

    return obs_seq_array, Wip, Windows

In [6]:
# Input files for replicate 1 (gzipped CSVs; extract_O reads them with np.loadtxt).
# Variant positions.
var_pos = './sim_data/rep_id_1_var_pos.csv.gz'
# Polarized genotype matrix.
geno_mat = './sim_data/rep_id_1_geno_mat.csv.gz'
# Start/stop positions of the introgressed segments.
intro_pos = './sim_data/rep_id_1_intro_pos.csv.gz'
# Proportion of a window's variant sites that must match the pattern for the
# window to be labelled 'C'; 1. means every site in the window must match.
window_threshold = 1.
# Consistent site pattern: "patterna" = 0011, "patternb" = 1100, "patternc" = either.
pattern = "patternc"
# Passed through to extract_O, which does not use it yet.
dxy = False

In [7]:
# Run the extraction for this replicate; the cell displays the returned
# (observed sequence, per-window introgressed fraction, windows) tuple.
extract_O(
    variable_positions=var_pos,
    polarized_genotype_matrix=geno_mat,
    true_introgression_positions=intro_pos,
    w_threshold=window_threshold,
    pattern=pattern,
    dxy=dxy,
)

N
C site proportion: 0.0%
[[0 0 0 1]]
---------------
N
C site proportion: 0.0%
[[1 0 0 0]]
---------------
N
C site proportion: 0.0%
[[1 1 1 0]]
---------------
N
C site proportion: 0.0%
[[0 0 0 1]]
---------------
N
C site proportion: 0.0%
[[1 0 0 0]]
---------------
N
C site proportion: 0.0%
[[1 0 0 0]]
---------------
N
C site proportion: 0.0%
[[1 0 0 0]
 [1 0 0 0]]
---------------
N
C site proportion: 0.0%
[[0 0 0 1]
 [1 0 0 0]
 [0 1 1 0]]
---------------
N
C site proportion: 0.0%
[[0 0 0 1]
 [0 0 1 0]]
---------------
N
C site proportion: 0.0%
[[0 1 0 0]
 [0 1 1 0]]
---------------
N
C site proportion: 0.0%
[[0 0 0 1]
 [0 0 1 0]]
---------------
N
C site proportion: 0.0%
[[1 1 1 0]
 [0 1 1 0]]
---------------
N
C site proportion: 0.0%
[[0 1 0 0]]
---------------
N
C site proportion: 0.0%
[[0 0 0 1]]
---------------
N
C site proportion: 0.0%
[[0 0 1 0]]
---------------
N
C site proportion: 0.0%
[[1 1 1 0]]
---------------
N
C site proportion: 0.0%
[[0 1 0 0]]
---------------
N
C s

(array(['N', 'N', 'N', ..., 'N', 'N', 'N'], dtype='<U1'),
 {1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.0,
  27: 0.0,
  28: 0.0,
  29: 0.0,
  30: 0.0,
  31: 0.0,
  32: 0.0,
  33: 0.0,
  34: 0.0,
  35: 0.0,
  36: 0.0,
  37: 0.0,
  38: 0.0,
  39: 0.0,
  40: 0.0,
  41: 0.0,
  42: 0.0,
  43: 0.0,
  44: 0.0,
  45: 0.0,
  46: 0.0,
  47: 0.0,
  48: 0.0,
  49: 0.0,
  50: 0.0,
  51: 0.0,
  52: 0.0,
  53: 0.0,
  54: 0.0,
  55: 0.0,
  56: 0.0,
  57: 0.0,
  58: 0.0,
  59: 0.0,
  60: 0.0,
  61: 0.0,
  62: 0.0,
  63: 0.0,
  64: 0.0,
  65: 0.0,
  66: 0.0,
  67: 0.0,
  68: 0.0,
  69: 0.0,
  70: 0.0,
  71: 0.0,
  72: 0.0,
  73: 0.0,
  74: 0.0,
  75: 0.0,
  76: 0.0,
  77: 0.0,
  78: 0.0,
  79: 0.0,
  80: 0.0,
  81: 0.0,
  82: 0.0,
  83: 0.0,
  84: 0.0,
  85: 0.0,
  86: 0.0,
  87: