In [1]:
#%matplotlib notebook
import pandas as pd
import numpy as np
from numpy.linalg import eig
import numpy.linalg as l
import csv
import pylab as pl
import matplotlib.pyplot as plt
import copy
import scipy as stats
from scipy import stats
from scipy.stats import linregress
from scipy.stats import t
from scipy.stats import chi2
from scipy.stats import sem
from scipy import optimize
from scipy.optimize import curve_fit
from scipy.linalg import hadamard
from scipy.special import binom
from scipy.stats import ttest_ind_from_stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind_from_stats
from sklearn.manifold import TSNE
from collections import OrderedDict 
import seaborn as sns
import time
from random import random
from random import randint
from random import randrange
from random import gauss
from random import sample
from matplotlib.lines import Line2D
from textwrap import wrap
import os
from os import listdir
from os.path import isfile, join
from collections import OrderedDict
import gzip
from itertools import compress 
from itertools import groupby
from matplotlib.ticker import FormatStrFormatter
import matplotlib.ticker as mtick
import matplotlib.gridspec as gridspec
from matplotlib.ticker import PercentFormatter
import matplotlib.colors as mcolors
import matplotlib.cm as cm
from statsmodels.stats import diagnostic
import statistics
from scipy.stats import gaussian_kde
import mpl_scatter_density
# see https://github.com/astrofrog/mpl-scatter-density
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import Normalize 
from scipy.interpolate import interpn
from sklearn.preprocessing import MinMaxScaler

# Figure typography shared by every plot in this notebook, applied in one call.
plt.rcParams.update({
    # Arial is the preferred sans-serif face ...
    'font.sans-serif': "Arial",
    # ... and sans-serif is the family used for all text
    'font.family': "sans-serif",

    # Legend title and legend entries
    'legend.title_fontsize': 7,
    'legend.fontsize': 7,

    # Tick labels and axis labels
    'xtick.labelsize': 7,
    'ytick.labelsize': 7,
    'axes.labelsize': 7,
})

In [2]:
# All thirteen locus names used in the notebook
LOCUS_LIST = [
    'BUL2', 'FAS1', 'MKT1', 'NCS2', 'PMA1', 'RHO5', 'SCH9', 'WHI2',
    'AKL1', 'RPI1', 'HSL7', 'SPT7', 'FRS1',
]

# The ten focal loci (note RPI1 comes before AKL1 here, unlike in LOCUS_LIST)
floci = [
    'BUL2', 'FAS1', 'MKT1', 'NCS2', 'PMA1',
    'RHO5', 'SCH9', 'WHI2', 'RPI1', 'AKL1',
]
ploidies = ['hap', 'hom']
envts = ['37C', '4NQO', 'gu', 'salt', 'suloc', 'YPDA']
tps = [7, 14, 28, 42, 49]

# Number of focal loci
flocnum = len(floci)

# NOTE(review): presumably the wild-type and mutant residue at each focal locus,
# listed in the same order as floci - confirm.
fwt_res = [
    'L883', 'G588', 'D30', 'H71', 'S234',
    'G10', 'P220', 'L262', 'E102', 'S176',
]
fmut_res = [
    'F883', 'A588', 'G30', 'L71', 'C234',
    'S10', 'S220', 'S262', 'D102', 'P176',
]

# The \xa0 escapes are non-breaking spaces that are part of this label
bell = 'chr07_584201_+T_YGR045C_CDS\xa0←_coding\xa0(96/363\xa0nt)'

In [3]:
# Define York regression without correlation function

# Some notes from a while back:
# Using York et al 2004, we can come to a more accurate estimate of the line of best fit that allows for error
# in both Xi and Yi.
# A key assumption of this estimation is that, were it not for error, all points would line up exactly linearly.
# Our data doesn't really allow this assumption, but let's just let it go as a starting point.
# We will term the observed points Xi and Yi respectively, and the least-squares-adjusted points xi and yi, which is
# their expected values.

# The model uses the equation y = a + bx, so a is the y-intercept and b is the slope.
# a and b also will have errors on them, sigma_a^2 and sigma_b^2.
# The general process is this:
# 1) Use known errors in Xi and Yi to properly weight points for the calculations, along with finding any ri for
# correlated error.
# 2) Choose an approximate initial value of b (e.g., what normal linear regression spits out for slope).
# 3) Use these values to find intermediate terms that are useful, like Bi, Ui, Vi, Wi, X_bar, and Y_bar.
# 4) Iterate to find the actual b within some threshold (and the corresponding intermediate terms).
# 5) Calculate a, then figure out xi and yi adjusted values, then figure out the errors, etc.

# This may be the appropriate ri calc? Though note this was used for a ∆s plot in a past life, so not sure.
# # Get the ri values now, which cannot be assumed to be zero in this plot
#for i in np.arange(len(est_table)):
#    est_table['ri'][i] = -1*np.sqrt(est_table['Xerror'][i]**2/(est_table['Xerror'][i]**2+fitness_table['1stderr'][i]**2))


def yorkreg_nocorr(x,y,xerr,yerr,n):
    """Fit y = a + b*x with errors in both x and y (York et al. 2004), no correlation.

    The errors on x and y are assumed to be uncorrelated (r_i = 0), so the
    correlation terms of the York equations drop out.

    Parameters
    ----------
    x, y : array-like
        Observed points. A point that is NaN in either coordinate is ignored.
    xerr, yerr : array-like or scalar
        Standard errors of x and y; each point is weighted by 1/error**2.
        If the x errors and the y errors both sum to zero, every point gets the
        same large weight (1e8) in both coordinates instead.
    n : int
        Number of slope-update iterations (a fixed count; there is no
        convergence test). Must be at least 1.

    Returns
    -------
    b : float
        Slope after n iterations.
    a : float
        Intercept, Y_bar - b*X_bar, using the weighted means of the last iteration.
    sigma2_b : float
        Variance of the slope, 1/sum(W_i * u_i**2).
    S : float
        Weighted sum of squared residuals, sum(W_i * (Y_i - b*X_i - a)**2).

    Raises
    ------
    ValueError
        If n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1 (number of slope iterations)")

    # Collect the inputs in one table so that pandas inputs are aligned on a common
    # index and scalar errors are broadcast to every point.
    est_table = pd.DataFrame(columns=['Xi','Yi','Xerror','Yerror'])
    est_table['Xi'] = x
    est_table['Yi'] = y
    est_table['Xerror'] = xerr
    est_table['Yerror'] = yerr

    X = est_table['Xi'].to_numpy(dtype=float)
    Y = est_table['Yi'].to_numpy(dtype=float)
    x_sd = est_table['Xerror'].to_numpy(dtype=float)
    y_sd = est_table['Yerror'].to_numpy(dtype=float)

    # Use consistent data: a point missing in x is treated as missing in y and vice versa
    missing = np.isnan(X) | np.isnan(Y)
    X = np.where(missing, np.nan, X)
    Y = np.where(missing, np.nan, Y)

    # Starting slope from ordinary least squares on the complete pairs only;
    # a single NaN would otherwise turn the slope (and every result) into NaN.
    b = linregress(X[~missing], Y[~missing]).slope

    # Zero or missing errors give inf/NaN weights that nansum skips below, so keep
    # numpy quiet about the intermediate divisions.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Now determine the weights of all these Xi and Yi
        if np.nansum(x_sd) == 0 and np.nansum(y_sd) == 0:
            wX = np.full(X.shape, 100000000.0)
            wY = np.full(X.shape, 100000000.0)
        else:
            wX = 1/(x_sd**2)
            wY = 1/(y_sd**2)

        # Iterate the slope calculation a fixed number of times
        for _ in np.arange(n):
            # Wi with r_i = 0; points that are not on the plot carry no weight
            W = np.where(missing, np.nan, (wX*wY)/(wX + (b**2)*wY))

            # Weighted means of the observations
            W_total = np.nansum(W)
            X_bar = np.nansum(W*X)/W_total
            Y_bar = np.nansum(W*Y)/W_total

            # Deviations from the means and the Bi term (again with r_i = 0)
            U = X - X_bar
            V = Y - Y_bar
            B = W*((U/wY) + (b*V/wX))

            b = np.nansum(W*B*V)/np.nansum(W*B*U)

        # get the y-intercept now
        a = Y_bar - b*X_bar

        # adjusted values xi = X_bar + Bi, their weighted mean, and the slope variance
        x_adj = X_bar + B
        x_adj_bar = np.nansum(W*x_adj)/np.nansum(W)
        sigma2_b = 1/np.nansum(W*(x_adj - x_adj_bar)**2)

        # get the total sum of squares to report
        S = np.nansum(W*(Y - b*X - a)**2)

    return b, a, sigma2_b, S

In [4]:
# Create a version of the above function that returns a and S for a slope set to 1
# Again, assumes no correlation

def york_slope1(x,y,xerr,yerr,n):
    """Best intercept and weighted residual sum for a line of slope 1 (York weights).

    This is the slope = 1 case of york_slopeanyb. Errors on x and y are assumed
    uncorrelated (r_i = 0).

    Parameters
    ----------
    x, y : array-like
        Observed points. A point that is NaN in either coordinate is ignored.
    xerr, yerr : array-like or scalar
        Standard errors of x and y; each point is weighted by 1/error**2.
        If the x errors and the y errors both sum to zero, every point gets the
        same large weight (1e8) in both coordinates instead.
    n : int
        Unused; kept so the signature matches yorkreg_nocorr.

    Returns
    -------
    best_a : float
        Intercept minimising S, i.e. Y_bar - X_bar with York-weighted means.
    best_S : float
        sum(W_i * (Y_i - X_i - best_a)**2).
    """
    # set slope = 1
    b = 1

    # Collect the inputs in one table so that pandas inputs are aligned on a common
    # index and scalar errors are broadcast to every point.
    est_table = pd.DataFrame(columns=['Xi','Yi','Xerror','Yerror'])
    est_table['Xi'] = x
    est_table['Yi'] = y
    est_table['Xerror'] = xerr
    est_table['Yerror'] = yerr

    X = est_table['Xi'].to_numpy(dtype=float)
    Y = est_table['Yi'].to_numpy(dtype=float)
    x_sd = est_table['Xerror'].to_numpy(dtype=float)
    y_sd = est_table['Yerror'].to_numpy(dtype=float)

    # Use consistent data: a point missing in x is treated as missing in y and vice versa
    missing = np.isnan(X) | np.isnan(Y)
    X = np.where(missing, np.nan, X)
    Y = np.where(missing, np.nan, Y)

    # Zero or missing errors give inf/NaN weights that nansum skips below, so keep
    # numpy quiet about the intermediate divisions.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Now determine the weights of all these Xi and Yi
        if np.nansum(x_sd) == 0 and np.nansum(y_sd) == 0:
            wX = np.full(X.shape, 100000000.0)
            wY = np.full(X.shape, 100000000.0)
        else:
            wX = 1/(x_sd**2)
            wY = 1/(y_sd**2)

        # Wi with r_i = 0; points that are not on the plot carry no weight
        W = np.where(missing, np.nan, (wX*wY)/(wX + (b**2)*wY))

        # Weighted means of the observations
        W_total = np.nansum(W)
        X_bar = np.nansum(W*X)/W_total
        Y_bar = np.nansum(W*Y)/W_total

        # With b fixed, S(a) = sum(W_i*(Y_i - b*X_i - a)**2) is a parabola in a whose
        # minimum is exactly Y_bar - b*X_bar, so no scan over candidate intercepts
        # is needed to find the best one.
        best_a = Y_bar - b*X_bar

        # get the sum of total least squares, S
        best_S = np.nansum(W*(Y - b*X - best_a)**2)

    return best_a, best_S


In [5]:
# Create a version of the above function that returns a and S for an arbitrary fixed slope (myb)
# Again, assumes no correlation

def york_slopeanyb(x,y,xerr,yerr,n,myb):
    """Best intercept and weighted residual sum for a line of fixed slope myb (York weights).

    Errors on x and y are assumed uncorrelated (r_i = 0).

    Parameters
    ----------
    x, y : array-like
        Observed points. A point that is NaN in either coordinate is ignored.
    xerr, yerr : array-like or scalar
        Standard errors of x and y; each point is weighted by 1/error**2.
        If the x errors and the y errors both sum to zero, every point gets the
        same large weight (1e8) in both coordinates instead.
    n : int
        Unused; kept so the signature matches yorkreg_nocorr.
    myb : float
        Slope to hold fixed.

    Returns
    -------
    best_a : float
        Intercept minimising S, i.e. Y_bar - myb*X_bar with York-weighted means.
    best_S : float
        sum(W_i * (Y_i - myb*X_i - best_a)**2).
    """
    # set slope = myb
    b = myb

    # Collect the inputs in one table so that pandas inputs are aligned on a common
    # index and scalar errors are broadcast to every point.
    est_table = pd.DataFrame(columns=['Xi','Yi','Xerror','Yerror'])
    est_table['Xi'] = x
    est_table['Yi'] = y
    est_table['Xerror'] = xerr
    est_table['Yerror'] = yerr

    X = est_table['Xi'].to_numpy(dtype=float)
    Y = est_table['Yi'].to_numpy(dtype=float)
    x_sd = est_table['Xerror'].to_numpy(dtype=float)
    y_sd = est_table['Yerror'].to_numpy(dtype=float)

    # Use consistent data: a point missing in x is treated as missing in y and vice versa
    missing = np.isnan(X) | np.isnan(Y)
    X = np.where(missing, np.nan, X)
    Y = np.where(missing, np.nan, Y)

    # Zero or missing errors give inf/NaN weights that nansum skips below, so keep
    # numpy quiet about the intermediate divisions.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Now determine the weights of all these Xi and Yi
        if np.nansum(x_sd) == 0 and np.nansum(y_sd) == 0:
            wX = np.full(X.shape, 100000000.0)
            wY = np.full(X.shape, 100000000.0)
        else:
            wX = 1/(x_sd**2)
            wY = 1/(y_sd**2)

        # Wi with r_i = 0; points that are not on the plot carry no weight
        W = np.where(missing, np.nan, (wX*wY)/(wX + (b**2)*wY))

        # Weighted means of the observations
        W_total = np.nansum(W)
        X_bar = np.nansum(W*X)/W_total
        Y_bar = np.nansum(W*Y)/W_total

        # With b fixed, S(a) = sum(W_i*(Y_i - b*X_i - a)**2) is a parabola in a whose
        # minimum is exactly Y_bar - b*X_bar, so no scan over candidate intercepts
        # is needed to find the best one.
        best_a = Y_bar - b*X_bar

        # get the sum of total least squares, S
        best_S = np.nansum(W*(Y - b*X - best_a)**2)

    return best_a, best_S


In [6]:
# First, create a comprehensive list of barcodes

# Folder with the selection-coefficient tables, relative to the notebook's working
# directory. The directory listing and every read below go through this one path
# (the listing used to point at an absolute, machine-specific copy of the same folder).
mypath = os.path.join('CRISPR_10xmer_BFA_data','20210117_6_selection_coefficients')
filenames = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# The last 8 rows of every selection-coefficient table are dropped on import.
# NOTE(review): presumably a non-lineage footer written by the upstream tool - confirm.
N_FOOTER_ROWS = 8


def _read_selection_table(filename, value_columns):
    """Read one tab-separated selection-coefficient table from mypath, keyed by 'BC'.

    The file's own header row is replaced by ['BC'] + value_columns, the trailing
    N_FOOTER_ROWS rows are dropped, and the value columns (which come in as text)
    are converted to float.
    """
    table = pd.read_csv(join(mypath, filename), header=0, sep='\t', names=['BC']+value_columns)
    table = table.drop(table.tail(N_FOOTER_ROWS).index)
    table[value_columns] = table[value_columns].astype(float)
    return table


def _blank_noisy_estimates(table, s_col, err_col, threshold):
    """Set s and its standard error to NaN (in place) where the standard error exceeds threshold.

    Returns the number of rows that were blanked.
    """
    noisy = table[err_col] > threshold
    table.loc[noisy, [s_col, err_col]] = np.nan
    return int(noisy.sum())


# create a comprehensive list of barcodes, using the tech replicate-separated files
bc_list = []
for fname in filenames:
    # tech-replicate files have '1' or '2' right before the 4-character extension
    if fname[-5:-4] in ('1','2'):
        templ = pd.read_csv(join(mypath, fname),sep='\t')
        templ = templ.drop(templ.tail(N_FOOTER_ROWS).index)
        bc_list.extend(templ['Lineage'].tolist())

# de-duplicate while keeping first-seen order
bc_list = list(OrderedDict.fromkeys(bc_list))

bigt = pd.DataFrame()
bigt['BC'] = bc_list

# Map wells onto BCs

bcwmap = pd.read_csv('20200901_simplemap_BCtorealwell.csv').drop(columns=['BC','BC-rc'])
bcwmap = bcwmap.rename(columns={'BC_noATs':'BC','real_well':'Well'})

bigt = pd.merge(bigt,bcwmap,on='BC',how='left')

# Map on the full genotype info we have
ginfo = pd.read_csv('20200107_gt4.csv')
ginfo = ginfo.drop(columns=['Unnamed: 0'])

# The binary genotype strings lose their leading zeros on import; pad them back
ginfo['full_g_bin_10'] = ginfo['full_g_bin_10'].astype(str).str.zfill(10)
ginfo['full_g_bin_13'] = ginfo['full_g_bin_13'].astype(str).str.zfill(13)
ginfo['full_g_bin_10_exp'] = ginfo['full_g_bin_10_exp'].astype(str).str.zfill(10)

bigt = pd.merge(bigt,ginfo,how='left',on='Well')

# Add in blanks column
blankslist = pd.read_csv("20201104_blankslist.csv")
bigt = pd.merge(bigt,blankslist,on="Well",how='left')

# The s values are NOT normalized to the WT (all psWT) fitness any more: that step
# (subtracting the s of barcode `mywt` from every s) was switched off on 2021.04.17.
# Standard errors were never touched by it.
mywt = 'AGTCCTGGTAATTGTT' #corresponds to one of the two all psWT barcodes

# An estimate is discarded (s and stderr set to NaN) when its standard error exceeds this
s_outlier_threshold = 1

# Map on fitness info for each barcode (calc across tech reps) and drop noisy estimates
numoutliers_pooled = []
for ploidy in ploidies:
    for envt in envts:
        tag = ploidy+'-'+envt
        imptab = _read_selection_table('selection_fa_'+ploidy+'_'+envt+'.txt',
                                       ['s_'+tag,'stderr(s)_'+tag,'f0_array_'+tag,'?_'+tag])
        bigt = pd.merge(bigt,imptab,on='BC',how='left')
        numoutliers_pooled.append(
            _blank_noisy_estimates(bigt,'s_'+tag,'stderr(s)_'+tag,s_outlier_threshold))

# Now do the same import and filtering for different tech reps of the same barcode (split out)
# numoutliers2 holds one count per ploidy x environment x tech rep, in loop order
numoutliers2 = []
for ploidy in ploidies:
    for envt in envts:
        for rep in (1,2):
            tag = ploidy+'-'+envt+'-'+str(rep)
            imptab = _read_selection_table('selection_fa_'+ploidy+'_'+envt+'_'+str(rep)+'.txt',
                                           ['s_'+tag,'stderr(s)_'+tag,'f0_array_'+tag])
            bigt = pd.merge(bigt,imptab,on='BC',how='left')
            numoutliers2.append(
                _blank_noisy_estimates(bigt,'s_'+tag,'stderr(s)_'+tag,s_outlier_threshold))

In [None]:
# Let's now get stats on what genotypes we do and do not have after excluding some
# HEADS UP: TAKES A WHILE TO RUN DEPENDING ON HOW DEEP YOU GO!
from itertools import combinations


def _summarize_locus_combos(genotypes, locus_combos, patterns):
    """Count the genotypes matching every binary pattern for every locus combination.

    genotypes    -- table with one '<locus>_g-update_bin' column per locus
    locus_combos -- list of locus-name lists, all of the same length
    patterns     -- list of '0'/'1' strings of that same length

    Returns a table with one row per combination: the locus names
    ('locus0', 'locus1', ...), one count column per pattern, and one
    '<pattern>%' column giving that count as a fraction of the row's total.
    """
    # Evaluate each (locus, allele) comparison once instead of once per pattern
    carries = {}
    for locus in set(name for combo in locus_combos for name in combo):
        calls = genotypes[locus+'_g-update_bin']
        for allele in (0, 1):
            carries[(locus, allele)] = (calls == allele).to_numpy()

    data = OrderedDict()
    for pos in range(len(patterns[0])):
        data['locus'+str(pos)] = [combo[pos] for combo in locus_combos]
    for pattern in patterns:
        counts = []
        for combo in locus_combos:
            keep = np.ones(len(genotypes), dtype=bool)
            for locus, bit in zip(combo, pattern):
                keep &= carries[(locus, int(bit))]
            counts.append(int(keep.sum()))
        data[pattern] = counts
    summary = pd.DataFrame(data)

    # figure out what % of represented genotypes are in each bucket
    fractions = summary[patterns].div(summary[patterns].sum(axis=1), axis=0)
    fractions.columns = [pattern+'%' for pattern in patterns]
    return pd.concat([summary, fractions], axis=1)


# get the unique list of genotypes rep in our dataset
uniqueg = bigt.drop_duplicates(subset='full_g_bin_10').reset_index(drop=True)

# `longlist` is not defined by any earlier cell, so start it here
longlist = []

# Orders 1 (single loci) up to len(floci)-4 (6 with ten focal loci)
max_order = max(1, len(floci) - 4)
for order in range(1, max_order + 1):
    # legacy counter: -1 for the first-order pass, then 0, 1, ... (this is what gets printed)
    i = order - 2

    # every combination of `order` focal loci, as sorted lists of positions in floci
    golist = [list(combo) for combo in combinations(range(len(floci)), order)]
    locus_combos = [[floci[idx] for idx in combo] for combo in golist]

    # every genotype over those loci, as zero-padded binary strings
    bincombolist = [bin(value)[2:].zfill(order) for value in range(2**order)]

    # for each genotype, figure out how many of that locus combo are present in the dataset
    binsummary = _summarize_locus_combos(uniqueg, locus_combos, bincombolist)
    binsummary.to_csv(r'20210127_binsummary_'+str(order)+'.csv',index=True,header=True)
    print(i)

    if order == 1:
        longlist = longlist + golist

In [None]:
# Check if the FRS1, HSL7, and SPT7 mutations appear to affect fitness in different environments
# Start with FRS1 since we should have the most data for this.
# We want to isolate guys with the same genotype except for FRS1.

# Keep barcodes with a known 13-locus genotype ...
bigtnona = bigt.loc[(~bigt['full_g_bin_13'].isnull())].reset_index(drop=True)

# ... whose genotype string has no "2" at any locus
# NOTE(review): "2" presumably marks an ambiguous/uncalled locus - confirm
bigtnona = bigtnona[~bigtnona['full_g_bin_13'].str.contains("2")].reset_index(drop=True)

# Count barcodes per well in a single pass, then keep only wells holding exactly one barcode
bcperwell = bigtnona.groupby('Well', sort=False).size().reset_index(name='bc/well')
bigtnona = pd.merge(bigtnona,bcperwell.loc[(bcperwell['bc/well'] == 1)],how='inner',on='Well')

#orphanloci = ['HSL7','SPT7','FRS1']
orphanloci = ['FRS1']
# position of each orphan locus within the 13-character genotype string
#orphanlocinum = [10,11,12]
orphanlocinum = [12]

for locus, pos in zip(orphanloci, orphanlocinum):
    # genotype string with this locus cut out = the genetic background
    background_col = 'full_g_bin_no'+locus
    bigtnona[background_col] = bigtnona['full_g_bin_13'].str[:pos]+bigtnona['full_g_bin_13'].str[pos+1:]

    bigtnonawt = bigtnona.loc[(bigtnona[locus+'_g-update'] == 'WT')].reset_index(drop=True)
    bigtnonamut = bigtnona.loc[(bigtnona[locus+'_g-update'] == 'Mut')].reset_index(drop=True)

    # pair WT and Mut strains sharing a background: '_x' columns are WT, '_y' columns are Mut
    bigtjoin = pd.merge(bigtnonawt,bigtnonamut,how='inner',on=background_col)
    print(locus+': length of join table is '+ str(len(bigtjoin)))
    #print(bigtjoin['Well_x'],bigtjoin['Well_y'])

    for ploidy in ploidies:
        for envt in envts:
            cond = ploidy+'-'+envt
            s_wt = bigtjoin['s_'+cond+'_x']
            s_mut = bigtjoin['s_'+cond+'_y']

            # WT minus Mut for every matched pair
            devtab = s_wt - s_mut
            print(locus+': avg diff = '+str(np.nanmean(devtab)))

            plt.errorbar(s_wt,s_mut,
                         xerr=bigtjoin['stderr(s)_'+cond+'_x'],yerr=bigtjoin['stderr(s)_'+cond+'_y'],
                         fmt='o')
            plt.xlim(np.min(s_wt)-.02,np.max(s_wt)+.02)
            plt.ylim(np.min(s_mut)-0.02, np.max(s_mut)+0.02)
            plt.xlabel(locus+'WT')
            plt.ylabel(locus+'Mut')
            plt.title(cond)
            # y = x reference line
            plt.plot(np.linspace(-1,1),np.linspace(-1,1),color='k')
            plt.show()

In [None]:
# Try the histogram approach now
orphanloci = ['HSL7','SPT7','FRS1','WHI2','AKL1','FAS1','SCH9']

bigtnona = bigt.loc[(~bigt['full_g_bin_13'].isnull())].reset_index(drop=True)

bigtnona = bigtnona.loc[(bigtnona['full_g_match?'] == True)].reset_index(drop=True)

#bigtnona = bigtnona[~bigtnona['full_g_bin_13'].str.contains("2")].reset_index(drop=True)

binwidth=0.01

for locus in orphanloci:
    # HSL7 and SPT7 are only compared within one fixed AKL1/RPI1 background each;
    # every other locus is compared across all backgrounds
    if locus == 'HSL7':
        background = (bigtnona['AKL1_g-update'] == 'WT')&(bigtnona['RPI1_g-update'] == 'WT')
    elif locus == 'SPT7':
        background = (bigtnona['AKL1_g-update'] == 'WT')&(bigtnona['RPI1_g-update'] == 'Mut')
    else:
        background = pd.Series(True, index=bigtnona.index)
    wtt = bigtnona.loc[(bigtnona[locus+'_g-update'] == 'WT')&background]
    mutt = bigtnona.loc[(bigtnona[locus+'_g-update'] == 'Mut')&background]

    for ploidy in ploidies:
        for envt in envts:
            s_col = 's_'+ploidy+'-'+envt

            # Shared bin edges spanning the whole dataset. Series.min()/.max() skip NaN;
            # the builtin min()/max() return NaN whenever the first value is NaN.
            overdat = bigtnona[s_col]
            bins = np.arange(overdat.min(), overdat.max() + binwidth, binwidth)

            # Each histogram is weighted so that bar heights are fractions of its own group
            mydat = wtt[s_col]
            lmydatwt = len(mydat)
            plt.hist(mydat,weights=np.ones(len(mydat)) / len(mydat),
                     bins=bins,
                     color='xkcd:cerulean',alpha=0.4)
            mydat = mutt[s_col]
            lmydatmut = len(mydat)
            plt.hist(mydat,weights=np.ones(len(mydat)) / len(mydat),
                     bins=bins,
                     color='xkcd:orange',alpha=0.4)
            plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
            plt.xlabel('s: WT = blue (n='+str(lmydatwt)+'), Mut = Orange (n='+str(lmydatmut)+')')
            plt.title(locus+' in '+envt+' for '+ploidy)
            plt.show()

In [None]:
# Actual code for making the techreps figure
# Points are coloured by local density: the mean distance to the five nearest
# neighbours, found with a KD-tree (the earlier all-pairs loop took about half an hour).
from scipy.spatial import cKDTree

# Which ploidy / environment to plot (indices into ploidies and envts)
fig_p = 1
fig_e = 3

# Number of nearest neighbours averaged for the density estimate
N_NEIGHBORS = 5

x = bigt['s_'+ploidies[fig_p]+'-'+envts[fig_e]+'-1']
y = bigt['s_'+ploidies[fig_p]+'-'+envts[fig_e]+'-2']

# Keep only barcodes measured in both tech reps
mask = ~np.isnan(x) & ~np.isnan(y)
x = x[mask].reset_index(drop=True)
myxerr = bigt['stderr(s)_'+ploidies[fig_p]+"-"+envts[fig_e]+'-1'][mask].reset_index(drop=True)
y = y[mask].reset_index(drop=True)
myyerr = bigt['stderr(s)_'+ploidies[fig_p]+"-"+envts[fig_e]+'-2'][mask].reset_index(drop=True)

nei = pd.DataFrame()
nei['x'] = x
nei['y'] = y
nei['xerr'] = myxerr
nei['yerr'] = myyerr

# Ask for one neighbour more than needed: the closest hit of every point is a
# zero-distance match (the point itself), which is dropped before averaging.
points = nei[['x','y']].to_numpy(dtype=float)
n_hits = min(N_NEIGHBORS + 1, len(points))
hit_dist, _ = cKDTree(points).query(points, k=n_hits)
# query() returns a 1-D array when k == 1; force one row per point
hit_dist = np.asarray(hit_dist, dtype=float).reshape(len(points), -1)
nei['dist'] = hit_dist[:, 1:].mean(axis=1)

# Sparsest points first, so the densest points are drawn last (on top)
nei = nei.sort_values(by='dist',ascending=False).reset_index(drop=True)

minima = nei['dist'].min()
maxima = nei['dist'].max()
mynorm = mcolors.Normalize(vmin=minima, vmax=maxima, clip=True)
mapper = cm.ScalarMappable(norm=mynorm, cmap='viridis_r')

x = nei['x']
y = nei['y']
myxerr = nei['xerr']
myyerr = nei['yerr']

# Distances rescaled to [0, 1]
normvals = MinMaxScaler().fit_transform(np.array(nei['dist']).reshape(-1,1)).tolist()
nv = [row[0] for row in normvals]

# NOTE(review): nv is already scaled to [0, 1], yet mapper applies mynorm (built on
# the raw distance range) to it again, so every value above `maxima` gets the same
# colour. Kept as is because it determines how the figure looks - confirm it is intended.
mycs = [mapper.to_rgba(v) for v in nv]

fig,ax = plt.subplots(figsize=(1.4*0.75,1.125*0.75))

# One errorbar call per point, since each point has its own colour
for i in np.arange(len(x)):
    mym,myc,mybars = ax.errorbar(x[i],y[i], c=mycs[i],
                        xerr=myxerr[i], yerr=myyerr[i],
                        ecolor=mycs[i],elinewidth=0.5,fmt='o',ms=1,alpha=0.7)
    for bar in mybars:
        bar.set_alpha(0.2)

# The colorbar uses the default colormap rather than `mapper`'s reversed one;
# its ticks are removed, so it is only a qualitative guide.
cbar = fig.colorbar(cm.ScalarMappable(norm = mynorm), ax=ax)
cbar.ax.set_ylabel('Density')
cbar.set_ticks([])
# y = x reference line
ax.plot(np.linspace(-1,1),np.linspace(-1,1),color='k',lw=0.5)
#ax.set_xlim(np.min(x)-.02,np.max(x)+.02)
#ax.set_ylim(np.min(y)-0.02, np.max(y)+0.02)
ax.set_xlim(-0.7,0.03)
ax.set_ylim(-0.7,0.03)
ax.set_xticks([-0.5,0])
ax.set_yticks([-0.5,0])
ax.set_xlabel('Fitness, tech rep 1')
ax.set_ylabel('Fitness, tech rep 2')
#fig.savefig('msfigs/Fig1/techrepexample_'+ploidies[fig_p]+'-'+envts[fig_e]+'_density_witherr_personalmethod_condensed.pdf',bbox_inches='tight',dpi=1000)
plt.show()
#export_csv = nei.to_csv(r'20210329_techrepdistancestable.csv',index=True,header=True)

In [None]:
# Get proper regressions of these, based on York et al 2004
#
# For every ploidy/environment pair this cell
#   1. fits an ordinary least-squares line of tech rep 2 on tech rep 1
#      (stored as dumb_slope / dumb_intercept / r_value / r^2),
#   2. refines the slope with the iterative errors-in-both-variables scheme,
#      starting from the least-squares slope and weighting every point by
#      1/stderr^2 in x and in y (stored as proper_slope / proper_intercept),
#   3. plots the data, the y = x line (black) and the refined fit (yellow).
# One row per pair is written to techreplr.

myiter = 100   # fixed number of slope updates; convergence is not tested

techreplr = pd.DataFrame()
row = 0

for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):

        # Column names of the two technical replicates and their standard errors
        s1col = 's_'+ploidies[p]+'-'+envts[e]+'-1'
        s2col = 's_'+ploidies[p]+'-'+envts[e]+'-2'
        err1col = 'stderr(s)_'+ploidies[p]+"-"+envts[e]+'-1'
        err2col = 'stderr(s)_'+ploidies[p]+"-"+envts[e]+'-2'

        techreplr.at[row,'ploidy'] = ploidies[p]
        techreplr.at[row,'envt'] = envts[e]

        # get the standard linear regression slope (rows present in both replicates)
        mask = ~np.isnan(bigt[s1col]) & ~np.isnan(bigt[s2col])

        slope, intercept, r_value, p_value, std_err = linregress(bigt[s1col][mask],
                                                                 bigt[s2col][mask])
        print('slope = '+str(slope))
        print('intercept = '+str(intercept))
        print('r_value = '+str(r_value))

        techreplr.at[row,'dumb_slope'] = slope
        techreplr.at[row,'dumb_intercept'] = intercept
        techreplr.at[row,'r_value'] = r_value
        techreplr.at[row,'r^2'] = r_value**2

        # As our first step, set b = slope
        b = slope

        # Now prepare a table to store estimates
        est_table = pd.DataFrame(columns=['Xi','Yi','Xerror','Yerror','wXi','wYi','alpha',
                                         'Wi','Ui','Vi','Bi','xi','yi','ui','vi','ri'])
        est_table['Xi'] = bigt[s1col]
        est_table['Yi'] = bigt[s2col]
        est_table['Xerror'] = bigt[err1col]
        est_table['Yerror'] = bigt[err2col]
        # assume correlation between x and y points is zero
        est_table['ri'] = 0

        # Use consistent data: a point missing in x is blanked in y and vice versa.
        # (.loc with a boolean mask; .at is a scalar accessor and must not be
        # given a whole Index of rows.)
        est_table.loc[est_table['Xi'].isnull(),'Yi'] = np.nan
        est_table.loc[est_table['Yi'].isnull(),'Xi'] = np.nan

        # Now determine the weights of all these Xi and Yi
        est_table['wXi'] = 1/(est_table['Xerror']**2)
        est_table['wYi'] = 1/(est_table['Yerror']**2)

        # Calculate alpha (sqrt of the product of wXi and wYi)
        est_table['alpha'] = np.sqrt(est_table['wXi']*est_table['wYi'])

        mybs = [b]   # history of slope estimates, handy for checking convergence by eye

        # iterate the b calculation a fixed number of times
        for q in np.arange(myiter):

            # Combined weight of each point for the current slope. The ri terms
            # vanish here because ri is set to zero above.
            est_table['Wi'] = (est_table['wXi']*est_table['wYi'])/(est_table['wXi']+(b**2)*est_table['wYi']-2*b*est_table['ri']*est_table['alpha'])

            # Want to make sure we aren't using Wi values for points that won't exist on the plot
            est_table.loc[est_table['Xi'].isnull(),'Wi'] = np.nan

            # Weighted means of the observations
            X_bar = np.nansum(est_table['Wi']*est_table['Xi'])/np.nansum(est_table['Wi'])
            Y_bar = np.nansum(est_table['Wi']*est_table['Yi'])/np.nansum(est_table['Wi'])

            # Deviations from the weighted means, and the Bi (beta) term
            est_table['Ui'] = est_table['Xi'] - X_bar
            est_table['Vi'] = est_table['Yi'] - Y_bar
            est_table['Bi'] = est_table['Wi']*((est_table['Ui']/est_table['wYi'])+(b*est_table['Vi']/est_table['wXi'])-((b*est_table['Ui']+est_table['Vi'])*(est_table['ri']/est_table['alpha'])))

            b_array_top = est_table['Wi']*est_table['Bi']*est_table['Vi']
            b_array_bottom = est_table['Wi']*est_table['Bi']*est_table['Ui']

            b_new = np.nansum(b_array_top)/np.nansum(b_array_bottom)
            mybs = np.append(mybs,b_new)
            b = b_new

        # get the y-intercept now
        a = Y_bar - b*X_bar

        # now we want to get the variances in a and b: adjusted points first
        est_table['xi'] = X_bar + est_table['Bi']
        est_table['yi'] = Y_bar + b*est_table['Bi']

        # calculate x_bar and y_bar
        x_bar = np.nansum(est_table['Wi']*est_table['xi'])/np.nansum(est_table['Wi'])
        y_bar = np.nansum(est_table['Wi']*est_table['yi'])/np.nansum(est_table['Wi'])

        # calculate ui and vi
        est_table['ui'] = est_table['xi'] - x_bar
        est_table['vi'] = est_table['yi'] - y_bar

        # calculate sigma_b and sigma_a
        wu_array = est_table['Wi']*est_table['ui']**2

        sigma_b_2 = 1/np.nansum(wu_array)
        sigma_a_2 = 1/np.nansum(est_table['Wi'])+x_bar**2*sigma_b_2

        # Two-sided t-test of slope == 1. S is the weighted residual sum of
        # squares and n the number of points that carry a weight.
        S = np.nansum(est_table['Wi']*(est_table['Yi']-b*est_table['Xi']-a)**2)
        n = len(est_table)-sum(np.isnan(est_table['Wi']))
        stderr_slope = np.sqrt(sigma_b_2)*np.sqrt(S/(n-2))
        o_slope = b
        e_slope = 1
        t_slope = (o_slope - e_slope)/stderr_slope

        print('proper slope (b) = '+str(b))
        print('proper y-int (a) = '+str(a))

        # Two fitted parameters leave n-2 degrees of freedom, matching the
        # S/(n-2) scaling of the standard errors (this used to be n-1).
        pval_slope = t.sf(np.abs(t_slope), n-2)*2

        # Two-sided t-test of intercept == 0, using the same S
        stderr_intercept = np.sqrt(sigma_a_2)*np.sqrt(S/(n-2))
        o_intercept = a
        e_intercept = 0
        t_intercept = (o_intercept - e_intercept)/stderr_intercept

        pval_intercept = t.sf(np.abs(t_intercept), n-2)*2

        # now draw pretty pictures
        plt.errorbar(bigt[s1col],bigt[s2col],
                     xerr=bigt[err1col], yerr=bigt[err2col],
                     fmt='o',elinewidth=0.5,color='xkcd:cerulean',alpha=0.3)
        plt.title(ploidies[p]+'-'+envts[e])
        plt.xlim(np.min(bigt[s1col])-0.02,np.max(bigt[s1col])+0.02)
        plt.ylim(np.min(bigt[s2col])-0.02, np.max(bigt[s2col])+0.02)
        plt.xlabel('tech rep 1')
        plt.ylabel('tech rep 2')
        plt.plot(np.linspace(-1,1),np.linspace(-1,1),color='k')   # y = x

        plt.grid(linestyle = '--')

        # refined fit, in yellow
        x_fit = np.linspace(-1,1,1000)
        y_fit_new = b*x_fit + a
        plt.plot (x_fit,y_fit_new,'y')
        plt.show()
        print('***************************')

        techreplr.at[row,'proper_slope'] = b
        techreplr.at[row,'proper_intercept'] = a
        row = row +1


# Based on bbq paper, (1-R)/(1+R) seems like a suitable way to get a lower bound on variance explained due to measurement error
# So add this column.
techreplr['(1-R)/(1+R)'] = (1-techreplr['r_value']) / (1+techreplr['r_value'])

In [9]:
# Set ourselves up to compare biological replicates, since this will be best place to see brokenness
#
# Produces bca and bcb: two row-aligned tables in which row k of each holds a
# different well of the same genotype ('full_g_bin_10_exp').

# Remove wells with multiple barcodes.
# Count the rows sharing each 'Well' value with one value_counts() lookup
# instead of re-filtering the whole table for every row; rows with a missing
# 'Well' get a count of 0, as before.
welltwoplusbcs = bigt['Well'].map(bigt['Well'].value_counts()).fillna(0).astype(int).tolist()
bigt['Num BCs in well'] = welltwoplusbcs

bigt1bc = bigt[(bigt['Num BCs in well'] < 2)].reset_index(drop=True)

# Remove blank wells
bigt1bc = bigt1bc.loc[(bigt1bc['blank?'] == 'no')].reset_index(drop=True)

# Remove genotypes for which there is just one well
gjustonewell = bigt1bc['full_g_bin_10_exp'].map(bigt1bc['full_g_bin_10_exp'].value_counts()).fillna(0).astype(int).tolist()
bigt1bc['Num wells in genotype'] = gjustonewell

bigt1bc2g = bigt1bc[(bigt1bc['Num wells in genotype'] > 1)].reset_index(drop=True)

# Sort the dataframe by lineage
bigt1bc2g = bigt1bc2g.sort_values(by=['full_g_bin_10_exp']).reset_index(drop=True)

# Split into two dataframes, one for each barcode set.
# Pair by position *within* each genotype rather than by alternating rows of
# the whole table: with exactly two wells per genotype the result is the same,
# but a genotype with three or more wells no longer shifts every later pair
# out of register (wells beyond the second are left out).
wellrank = bigt1bc2g.groupby('full_g_bin_10_exp').cumcount()
bca = bigt1bc2g.loc[wellrank == 0].reset_index(drop=True)
bcb = bigt1bc2g.loc[wellrank == 1].reset_index(drop=True)



In [None]:
# Biological replicates, plain version (no density colouring):
# for every ploidy/environment pair, plot barcode set A against barcode set B
# with error bars and the y = x line, and record the ordinary least-squares
# fit of B on A in bioreplr (one row per pair).

bioreplr = pd.DataFrame()
row = 0

for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):
        condition = ploidies[p]+'-'+envts[e]
        s_a = bca['s_'+condition]
        s_b = bcb['s_'+condition]

        plt.errorbar(s_a,s_b,
                     xerr=bca['stderr(s)_'+condition], yerr=bcb['stderr(s)_'+condition],
                     fmt='o',elinewidth=0.5,color='k',alpha=0.5)
        plt.title(condition)
        # pad the axes by 0.02 around the observed range
        plt.xlim(np.min(s_a)-.02,np.max(s_a)+.02)
        plt.ylim(np.min(s_b)-0.02, np.max(s_b)+0.02)
        plt.xlabel('bc A')
        plt.ylabel('bc B')
        diagonal = np.linspace(-1,1)
        plt.plot(diagonal,diagonal,color='k')

        # standard linear regression on the pairs measured in both barcode sets
        mask = ~np.isnan(s_a) & ~np.isnan(s_b)
        fit = linregress(s_a[mask],s_b[mask])
        slope, intercept, r_value = fit.slope, fit.intercept, fit.rvalue
        print('slope = '+str(slope))
        print('intercept = '+str(intercept))
        print('r_value = '+str(r_value))

        # write the summary row, column by column
        for column, value in [('ploidy',ploidies[p]),
                              ('envt',envts[e]),
                              ('dumb_slope',slope),
                              ('dumb_intercept',intercept),
                              ('r_value',r_value),
                              ('r^2',r_value**2)]:
            bioreplr.at[row,column] = value

        plt.show()

        row = row +1

In [None]:
# Make a condensed version of the same bioreps figure
# Actual code for making the bioreps figure
#
# Scatter of barcode set A (bca) vs barcode set B (bcb) for one
# ploidy/environment (chosen by fig_p and fig_e), with every point coloured by
# its mean distance to its five nearest neighbours (small distance = dense).
fig_p = 1
fig_e = 3

x = bca['s_'+ploidies[fig_p]+'-'+envts[fig_e]]
y = bcb['s_'+ploidies[fig_p]+'-'+envts[fig_e]]

# Keep only genotypes measured in both barcode sets. The same mask has to be
# applied to all four series *before* the index is reset; the y errors used to
# be left unmasked, which attached them to the wrong points whenever any row
# was dropped.
mask = ~np.isnan(x) & ~np.isnan(y)
x = x[mask].reset_index(drop=True)
myxerr = bca['stderr(s)_'+ploidies[fig_p]+"-"+envts[fig_e]][mask].reset_index(drop=True)
y = y[mask].reset_index(drop=True)
myyerr = bcb['stderr(s)_'+ploidies[fig_p]+"-"+envts[fig_e]][mask].reset_index(drop=True)

nei = pd.DataFrame()
nei['x'] = x
nei['y'] = y
nei['xerr'] = myxerr
nei['yerr'] = myyerr

# Mean Euclidean distance from each point to its (up to) five nearest
# neighbours, computed on numpy arrays (much faster than building and sorting
# a DataFrame for every point).
xs = np.asarray(nei['x'], dtype=float)
ys = np.asarray(nei['y'], dtype=float)
knn_dist = np.full(len(nei), np.nan)
for i in np.arange(len(nei)):
    d = np.sqrt((xs[i]-xs)**2+(ys[i]-ys)**2)
    d[i] = np.nan                  # a point is not its own neighbour
    nearest = np.sort(d)[:5]       # np.sort places NaN last
    if not np.all(np.isnan(nearest)):
        knn_dist[i] = np.nanmean(nearest)
nei['dist'] = knn_dist

# Largest distances first, so the sparsest points are drawn first and the
# densest points end up on top.
nei = nei.sort_values(by='dist',ascending=False).reset_index(drop=True)
minima=min(nei['dist'])
maxima=max(nei['dist'])
mynorm = mcolors.Normalize(vmin=minima, vmax=maxima, clip=True)
mapper = cm.ScalarMappable(norm=mynorm, cmap='viridis_r')

x = nei['x']
y = nei['y']
myxerr = nei['xerr']
myyerr = nei['yerr']

# Rescale the distances to [0, 1] and turn them into colours.
# NOTE(review): the values handed to mapper.to_rgba() are already min-max
# scaled, and the mapper then applies mynorm (built from the raw distances)
# on top of that. This looks like an accidental double normalisation, but it
# is kept because changing it would change the colours of the figure -- confirm.
normvals = MinMaxScaler().fit_transform(np.array(nei['dist']).reshape(-1,1)).tolist()
nv = [scaled[0] for scaled in normvals]
mycs = [mapper.to_rgba(v) for v in nv]

fig,ax = plt.subplots(figsize=(1.4*0.75,1.125*0.75))

# One errorbar call per point so that each point carries its own colour
for i in np.arange(len(x)):
    mym,myc,mybars = ax.errorbar(x[i],y[i], c=mycs[i],
                        xerr=myxerr[i], yerr=myyerr[i],
                        ecolor=mycs[i],elinewidth=0.5,fmt='o',ms=1,alpha=0.7)
    for bar in mybars:
        bar.set_alpha(0.2)

# The colorbar is drawn with the default colormap (not viridis_r) and without
# ticks; it is only labelled 'Density'.
cbar = fig.colorbar(cm.ScalarMappable(norm = mynorm), ax=ax)
cbar.ax.set_ylabel('Density')
cbar.set_ticks([])
ax.plot(np.linspace(-1,1),np.linspace(-1,1),color='k',lw=0.5)   # y = x reference line
ax.set_xlim(-0.66,0.02)
ax.set_ylim(-0.66,0.02)
ax.set_xticks([-0.6,-0.4,-0.2,0])
ax.set_yticks([-0.6,-0.4,-0.2,0])
ax.set_xlabel('Fitness, bio rep 1')
ax.set_ylabel('Fitness, bio rep 2')
#fig.savefig('msfigs/Fig1/biorepexample_'+ploidies[fig_p]+'-'+envts[fig_e]+'_density_witherr_personalmethod_condensed.pdf',bbox_inches='tight',dpi=1000)
plt.show()


In [None]:

# Create a scatter plot of R2 values for tech, bio reps
#
# One small figure per replicate type: r^2 on the x axis, environment on the
# y axis, one colour per ploidy. Each figure is saved as a PDF under
# msfigs/Fig1/ and then shown.

pcolors = ['xkcd:cerulean','xkcd:orange']


def _plot_r2_summary(lrtable, r2label, outpath):
    """Draw, save and show the r^2-by-environment scatter for one regression table.

    lrtable  : table with 'ploidy', 'envt' and 'r^2' columns (techreplr or bioreplr)
    r2label  : x-axis label
    outpath  : path of the PDF that is written
    Returns the (fig, ax) pair that was drawn.
    """
    fig,ax = plt.subplots(figsize=(1*0.75,1.125*0.75))
    for p, ploidy in enumerate(ploidies):
        temp = lrtable.loc[lrtable['ploidy'] == ploidy]
        ax.scatter(temp['r^2'],temp['envt'],color=pcolors[p],s=10,alpha=0.8)
        ax.set_xlim(0,1.1)
    # flip the y axis so the environments read top to bottom
    fig.gca().invert_yaxis()
    ax.set_xlabel(r2label)
    ax.set_xticks([0,0.5,1])
    ax.set_ylabel('Environment')
    fig.savefig(outpath,bbox_inches='tight',dpi=1000)
    plt.show()
    return fig,ax


# Tech reps first ...
fig,ax = _plot_r2_summary(techreplr,'Tech rep R^2','msfigs/Fig1/techrepsumm_v2.pdf')

# ... now do bio reps
fig,ax = _plot_r2_summary(bioreplr,'Bio rep R^2','msfigs/Fig1/biorepsumm_v2.pdf')



In [None]:
# Do the stacked bar varexp plot tipping each plot on its side
#
# For every ploidy/environment the coefficient section of the fit file is
# read, the squared coefficients are summed per order (number of mutated
# loci, 1..10) and each order's share of the total is drawn as one segment of
# a horizontal stacked bar. The per-order numbers are kept in varstore.
gowith = ['lasso_v2',11]

varstore = pd.DataFrame()
myind = 0

for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):

        fitfile = 'CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+ploidies[p]+'_'+envts[e]+'_'+str(gowith[1]-1)+'.txt'
        data1 = pd.read_csv(fitfile,
                            sep='\t',names=['todelete','genotype','coeff','na'],skiprows=2,skip_blank_lines=False)

        # keep only the rows that follow the first row with an empty genotype field
        first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
        data1 = data1.loc[first_blank+1:,:]

        # genotype as a zero-padded 10-character string of 0/1 digits
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        data1 = data1.drop(columns=['todelete','na']).reset_index(drop=True)

        # one integer column per locus, then the number of mutated loci in the term
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l].astype(int)

        data1['numMut'] = data1[floci].sum(axis=1)

        for o in np.arange(1,10+1):
            temp = data1.loc[(data1['numMut'] == o)].sort_values('genotype',ascending=False).reset_index(drop=True)
            vx = sum(temp['coeff']**2)
            for column, value in [('ploidy',ploidies[p]),('envt',envts[e]),('order',o),('var_exp',vx)]:
                varstore.at[myind,column] = value
            myind = myind + 1

# Share of each order within its own ploidy/environment
for i in np.arange(len(varstore)):
    same_condition = (varstore['ploidy'] == varstore.loc[i,'ploidy'])&(varstore['envt'] == varstore.loc[i,'envt'])
    varstore.at[i,'%var_exp'] = varstore.loc[i,'var_exp'] / varstore.loc[same_condition]['var_exp'].sum()

# Running total of the shares, restarted whenever the ploidy or environment changes
for i in np.arange(len(varstore)):
    continues_previous = (i > 0
                          and varstore.loc[i-1,'ploidy'] == varstore.loc[i,'ploidy']
                          and varstore.loc[i-1,'envt'] == varstore.loc[i,'envt'])
    if continues_previous:
        varstore.at[i,'%var_exp_cum'] = varstore.loc[i,'%var_exp'] + varstore.loc[i-1,'%var_exp_cum']
    else:
        varstore.at[i,'%var_exp_cum'] = varstore.loc[i,'%var_exp']

# one colour per order, 1..10
goodcolors = ['xkcd:rust','xkcd:orange','xkcd:gold','xkcd:kelly green','xkcd:dark sky blue','xkcd:indigo','xkcd:violet','xkcd:steel','xkcd:salmon','xkcd:dark grey']
fig,ax = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,figsize=(2.5,2),constrained_layout=True)
for p in np.arange(len(ploidies)):
    panel = ax[p]
    temp = varstore.loc[(varstore['ploidy'] == ploidies[p])]
    for e in np.arange(len(envts)):
        tempe = temp.loc[(temp['envt'] == envts[e])].reset_index(drop=True)
        my_b = 0   # left edge of the next segment
        for o in np.arange(1,11):
            share = tempe.loc[(tempe['order'] == o),'%var_exp']
            panel.barh(e,share,left=my_b,color=goodcolors[o-1])
            my_b = my_b + share.values[0]
    panel.set_xlim(0,1.01)
    panel.set_yticks([0,1,2,3,4,5])
    panel.set_yticklabels(envts)
    panel.set_xticks([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0])
ax[1].set_xlabel('Fraction broad sense heritability explained')

leg = fig.legend(np.arange(1,11),loc='lower right',handlelength=1,ncol=5,bbox_to_anchor=(0.825,0.655),title='Order',columnspacing=0.6,handletextpad=0.2,framealpha=0.9)
leg._legend_box.align = 'left'
# hand-placed axis and panel labels
for xpos, ypos, label in [(-0.1,0.47,'Environment'),(-0.04,0.77,'hap'),(-0.04,0.34,'hom')]:
    fig.text(xpos,ypos,label,rotation='vertical',fontsize=7)
fig.gca().invert_yaxis()
fig.savefig('msfigs/Fig2/varexp_stackedbarh.pdf',bbox_inches='tight',dpi=1000)
plt.show()



In [None]:
# try a version of 2E where we flip the axes of the cumulative plot instead
#
# Both figures in this cell rank the terms of one ploidy/environment by
# '%var_exp' (largest first) and plot the running total against the rank.
# The first figure puts the running total on the x axis; the second puts it
# on the y axis, truncates the axes and uses a fixed colour per environment.


def _ranked_cumulative(ploidy, envt):
    """Rank the terms of one condition by '%var_exp' and accumulate them.

    Rows of varstoreall for the given ploidy and environment are sorted by
    '%var_exp' in descending order; a running total is stored in 'cumu' for
    every row up to (not including) the first row whose '%var_exp' is not
    greater than zero. Only rows with a non-negative 'cumu' are returned.
    """
    ranked = varstoreall.loc[(varstoreall['ploidy'] == ploidy)&(varstoreall['envt'] == envt)]
    ranked = ranked.sort_values(by='%var_exp',ascending=False).reset_index(drop=True)
    running = 0
    for i, share in enumerate(ranked['%var_exp']):
        if share > 0:
            running = running + share
            ranked.at[i,'cumu'] = running
        else:
            break
    return ranked.loc[ranked['cumu'] >= 0]


percent_ticks = [0.5,0.6,0.7,0.8,0.9,1.0]
percent_labels = ['50%','60%','70%','80%','90%','100%']

# Flipped version: running total on x, rank on y
fig,ax = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,figsize=(1.5,3),constrained_layout=True)
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):
        temp = _ranked_cumulative(ploidies[p],envts[e])
        ax[p].plot(temp['cumu'],np.arange(len(temp)),alpha=0.7)
    ax[p].set_xlim(0.5,1.05)
    ax[p].set_xticks(percent_ticks)
    ax[p].set_xticklabels(percent_labels,rotation='vertical',fontsize=7)
    ax[p].set_ylim(0,75)
leg = fig.legend(envts,loc='lower right',handlelength=0.5,bbox_to_anchor=(0.67,0.32),title='Envt',labelspacing=0.1,borderpad=0.5,handletextpad=0.4)
leg._legend_box.align = 'left'
fig.text(0,0.47,'# terms (by rank)',rotation='vertical',fontsize=7)
ax[1].set_xlabel('% total epistatic variance explained')
fig.savefig('msfigs/Fig2/varexp_alleporder_cumu_flip.pdf',bbox_inches='tight',dpi=1000)
plt.show()

# try flipping but with these truncated axes: rank on x, running total on y
arturscolors=['#D62928','#F57E20','#2DA048','#2077B5','#9268AC','#8C574C']
fig,ax = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,figsize=(1.5,2),constrained_layout=True)
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):
        temp = _ranked_cumulative(ploidies[p],envts[e])
        ax[p].plot(np.arange(len(temp)),temp['cumu'],alpha=0.7,color=arturscolors[e])
    ax[p].set_ylim(0.5,1.05)
    ax[p].set_yticks(percent_ticks)
    ax[p].set_yticklabels(percent_labels,fontsize=7)
    ax[p].set_xlim(0,75)
    ax[p].set_xticks([0,10,20,30,40,50,60,70])
leg = fig.legend(envts,loc='lower right',ncol=2,handlelength=0.25,bbox_to_anchor=(1.14,0.65),title='Environment',labelspacing=0.1,borderpad=0.5,handletextpad=0.4,columnspacing=0.5)
leg._legend_box.align = 'left'
fig.text(-0.1,0.35,'\n'.join(wrap('% total epistatic variance explained',25)),rotation='vertical',fontsize=7)
ax[1].set_xlabel('# terms (by rank)')
fig.savefig('msfigs/Fig2/varexp_alleporder_cumu_trunc.pdf',bbox_inches='tight',dpi=1000)
plt.show()


In [None]:
# Do a hybrid version of 2A with the two above -- observed in top right, modeled in bottom left
#
# For every pair of (ploidy, environment) conditions, the genotype fitnesses
# of the two conditions are merged on genotype and their correlation
# coefficient (linregress rvalue) is stored. This is done once with the
# observed fitnesses (averaged per genotype) and once with the modeled
# (predicted) fitnesses; the heatmap shows observed values above the diagonal
# and modeled values below it.
#
# Each fit file is read once per condition and reused for every pair, instead
# of being re-read and re-averaged inside the pair loop.

o=10


def _read_fit_table(ploidy, envt, order):
    """Read the genotype section of one fit file.

    Returns the rows above the first row with an empty genotype field (the
    lines below it, which estimate the effects of specific combinations of
    mutations, are dropped), with 'genotype' converted to a zero-padded
    10-character string and columns
    '<ploidy>_<envt>_s-pred_<order>', '<ploidy>_<envt>_s-obs' and
    '<ploidy>_<envt>_s-obs-err'.
    """
    prefix = ploidy+'_'+envt
    raw = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+prefix+'_'+str(order)+'.txt',
                      sep='\t',names=['genotype',prefix+'_s-pred_'+str(order),prefix+'_s-obs',prefix+'_s-obs-err'],
                      skiprows=2,skip_blank_lines=False)
    first_blank = raw.loc[(raw['genotype'].isnull())].index.tolist()[0]
    # .copy() so that the column assignment below acts on an independent table
    raw = raw.loc[:first_blank-1,:].copy()
    raw['genotype'] = raw['genotype'].astype(int).astype(str).str.zfill(10)
    return raw


def _add_locus_columns(table):
    """Add one column per locus and one 'without_<locus>' genotype column.

    Both sets of columns are derived from the table's OWN 'genotype' column
    (the second table of each pair used to take its locus columns from the
    first table by mistake).
    """
    for k in np.arange(len(floci)):
        table[floci[k]] = table.loc[:,'genotype'].str[k]
    for k in np.arange(len(floci)):
        table['without_'+floci[k]] = table.loc[:,'genotype'].str[:k] + table.loc[:,'genotype'].str[k+1:]
    return table


def _observed_table(raw, prefix):
    """Average the observed fitness per genotype, in order of first appearance.

    's' is the mean observed fitness of the genotype's rows; 'stderr(s)' is
    the single row's error when there is one row, otherwise the square root of
    the mean squared error of the rows (the between-row variance is
    deliberately not included, as in the earlier version of this cell).
    """
    obscol = prefix+'_s-obs'
    errcol = prefix+'_s-obs-err'
    records = []
    for genotype, wells in raw.groupby('genotype',sort=False):
        if len(wells) == 1:
            s_mean = wells[obscol].iloc[0]
            s_err = wells[errcol].iloc[0]
        else:
            s_mean = wells[obscol].mean()
            s_err = np.sqrt(np.mean(wells[errcol]**2))
        records.append({'genotype':genotype,'s':s_mean,'stderr(s)':s_err})
    return _add_locus_columns(pd.DataFrame(records,columns=['genotype','s','stderr(s)']))


def _modeled_table(raw, prefix, order):
    """Keep one predicted fitness per genotype (first occurrence), as column 's'."""
    table = raw.drop(columns=[prefix+'_s-obs',prefix+'_s-obs-err'])
    table = table.drop_duplicates('genotype').reset_index(drop=True)
    table = table.rename(columns={prefix+'_s-pred_'+str(order):'s'})
    return _add_locus_columns(table)


def _correlation_table(tables):
    """Correlate column 's' between every pair of distinct conditions.

    tables maps (ploidy index, envt index) to a per-genotype table. The result
    has one row per condition ('ploidy', 'envt') and one column per condition
    ('<ploidy>_<envt>'); a condition's entry with itself is left empty.
    """
    tab = pd.DataFrame()
    tab['ploidy'] = ""
    tab['envt'] = ""
    for p in np.arange(len(ploidies)):
        for e in np.arange(len(envts)):
            tab[ploidies[p]+'_'+envts[e]] = ""

    myind = 0
    for p1 in np.arange(len(ploidies)):
        for e1 in np.arange(len(envts)):
            tab.at[myind,'ploidy'] = ploidies[p1]
            tab.at[myind,'envt'] = envts[e1]
            for p2 in np.arange(len(ploidies)):
                for e2 in np.arange(len(envts)):
                    if p1 != p2 or e1 != e2:
                        # genotypes present in both conditions
                        data = pd.merge(tables[(p1,e1)],tables[(p2,e2)],on='genotype',how='inner')
                        res = linregress(data['s_x'],data['s_y'])
                        tab.at[myind,ploidies[p2]+'_'+envts[e2]] = res.rvalue
            myind = myind + 1
    return tab


# Read every condition once and derive both the observed and the modeled table
observed_tables = {}
modeled_tables = {}
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):
        rawfit = _read_fit_table(ploidies[p],envts[e],o)
        observed_tables[(p,e)] = _observed_table(rawfit,ploidies[p]+'_'+envts[e])
        modeled_tables[(p,e)] = _modeled_table(rawfit,ploidies[p]+'_'+envts[e],o)

# start with observed genotype fitnesses
corrtab = _correlation_table(observed_tables)

# untouched copy of the observed-only correlations
corrtabhom = corrtab.copy(deep=True)

# Now spike in the modeled values (below the diagonal)
corrtabhap = _correlation_table(modeled_tables)

corrcols = corrtab.columns.tolist()
for r in np.arange(len(corrtab)):
    for c in np.arange(len(corrtab)):
        if r > c:
            corrtab.at[r,corrcols[c+2]] = corrtabhap.loc[r,corrcols[c+2]]

# Start by making the one plot version of the heatmap. I think this prob will be better than 3 separate plots,
# for both space and concision
# put 1 into 1:1 fields (any other missing correlation is also shown as 1)
corrtab = corrtab.fillna(1)
hmdarr = np.asarray(corrtab.iloc[:,2:],dtype='f')

mysc = 2
mypeg = 5/6
fig,ax = plt.subplots(figsize=(mysc,mysc*mypeg))

ax = sns.heatmap(hmdarr,xticklabels=corrtab['envt'],yticklabels=corrtab['envt'],center=0,cmap="vlag",vmin=-1,vmax=1) #,cbar_kws={"ticks":[-1,0,1]}

# NOTE(review): this half-cell widening of the y limits looks like the
# workaround for heatmap rows being cut off in some matplotlib releases;
# on releases without that problem it adds empty margins -- confirm.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

# separators drawn at position 6 on both axes (between the two ploidy groups
# when there are six environments)
ax.hlines([6], *ax.get_xlim())
ax.vlines([6], *ax.get_ylim())

fig.text(-0.13,0.67,'hap',rotation='vertical',fontsize=7)
fig.text(-0.13,0.3,'hom',rotation='vertical',fontsize=7)
fig.text(0.25,-0.17,'hap',rotation='horizontal',fontsize=7)
fig.text(0.56,-0.17,'hom',rotation='horizontal',fontsize=7)
fig.text(0.95,0.48,'ρ',fontsize=7,style='italic')

#fig.savefig('msfigs/Fig2/genotypecorr_acrossploidiesenvts_v2_observedandmodeled.pdf',bbox_inches='tight',dpi=1000)

plt.show()

In [None]:
# Fig 2B
# Additive (single-mutant) effect of every locus in each ploidy x environment, with
# confidence intervals, drawn as one offset errorbar series per locus.

gowith = ['lasso_v2',11]

# One frame per (environment, ploidy), concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0; growing a frame in a loop is also quadratic.)
frames = []

for e in np.arange(len(envts)):
    for p in np.arange(len(ploidies)):
        data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+ploidies[p]+'_'+envts[e]+'_'+str(gowith[1]-1)+'.txt',
                                     sep='\t',names=['todelete','genotype','s','na'],skiprows=2,skip_blank_lines=False)

        # Keep only the rows after the first row whose 'genotype' field is missing.
        # .copy() so the column assignments below act on an independent frame.
        first_blank = data1.loc[data1['genotype'].isnull()].index[0]
        data1 = data1.loc[first_blank+1:,:].copy()

        # Binary-string genotype, zero-padded to 10 characters
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        data1 = data1.drop(columns=['todelete','na']).reset_index(drop=True)

        # One 0/1 integer column per locus. The loop variable is deliberately not called `l`,
        # which would overwrite the `numpy.linalg as l` alias from the import cell.
        for li in range(len(floci)):
            data1[floci[li]] = data1['genotype'].str[li].astype(int)

        data1['numMut'] = data1[floci].sum(axis=1)

        # Single mutants only. Sorting the genotype strings in descending order puts the
        # mutant at string position 0 first, so row i lines up with floci[i] in the insert below.
        data1 = data1.loc[(data1['numMut'] == 1)].sort_values('genotype',ascending=False).reset_index(drop=True)
        data1['ploidy'] = ploidies[p]
        data1['envt'] = envts[e]
        data1.insert(0,'locus',floci)

        # do confidence intervals too
        cis = pd.read_csv('CRISPR_10xmer_BFA_data/9_CI/CI_'+ploidies[p]+'_'+envts[e]+'.txt',
                            sep='\t',names=['numMut','genotype','lower','upper'])

        cis['genotype'] = cis['genotype'].astype(int).astype(str).str.zfill(10)

        cis = cis.loc[(cis['numMut'] == 1)].sort_values('genotype',ascending=False).reset_index(drop=True)

        cis = cis.drop(columns=['numMut'])

        data1 = pd.merge(data1,cis,on='genotype',how='left')

        frames.append(data1)

megas = pd.concat(frames,ignore_index=True)

# get the errors ready for errobars (diffs, not absolutes)
megas['lower_mag'] = megas['s'] - megas['lower']
megas['upper_mag'] = megas['upper'] - megas['s']

# 'ploidy_envt' labels in the same (environment outer, ploidy inner) order as the rows above
pez = []
for e in np.arange(len(envts)):
    for p in np.arange(len(ploidies)):
        pez = pez + [ploidies[p]+'_'+envts[e]]

# One x position per ploidy x environment combination (12 for 2 ploidies x 6 environments)
n_conditions = len(pez)
xpos = np.arange(0,n_conditions)

fig,ax = plt.subplots(figsize=(2.4,1.2))

#offset loci: spread the loci horizontally inside each x position
offl = [-0.3+0.06*li for li in range(len(floci))]

for li in range(len(floci)):
    this_locus = (megas['locus'] == floci[li])
    ax.errorbar(xpos+offl[li],megas.loc[this_locus,'s'].tolist(),
                yerr = [megas.loc[this_locus,'lower_mag'].tolist(),
                        megas.loc[this_locus,'upper_mag'].tolist()],
                alpha=0.7,linestyle='None',marker='.',ms=4)

leg = ax.legend(floci,bbox_to_anchor=(-0.05, 1.42), loc='upper left',ncol=5,columnspacing=0.5,handlelength=0.5,handletextpad=0.3,frameon=False)
# NOTE: _legend_box is a private matplotlib attribute; used here to left-align the legend block.
leg._legend_box.align = 'left'
for line in leg.get_lines():
    line.set_linewidth(1.0)
ax.set_xticks(xpos)
# Tick labels are the ploidies, repeated once per environment
ax.set_xticklabels(ploidies*len(envts),rotation = 90)
ax.set_xlabel('Environment',labelpad=16)
ax.set_ylabel('Additive effect',labelpad=-2)
ax.tick_params(axis='x',pad=1)
ax.set_xlim(-0.5,n_conditions-0.5)

# alternate shading for envts: shade every other environment-wide band.
# Autoscale is switched off first so the tall patches do not stretch the y-axis.
thisalpha = 0.25
ax.autoscale(False)
band = len(ploidies)
for i in np.arange(0,n_conditions,2*band):
    ax.fill([i-0.5,i-0.5,i-0.5+band,i-0.5+band],[-1,1,1,-1],color='xkcd:light grey',alpha=thisalpha,edgecolor=None,zorder=0)

fig.savefig('msfigs/Fig2/additive-spaghetti_v04.pdf',bbox_inches='tight',dpi=1000)
plt.show()

## ABC vs aBC plots

In [None]:
# all plots, for si
# For every ploidy: a grid (rows = loci, columns = environments) of observed fitness with the
# focal locus set to '1' (y) against the same background with the focal locus set to '0' (x).

gowith = ['lasso_v2',11]
o = 10

yorkn = 100

for p in np.arange(len(ploidies)):
    f0,a0 = plt.subplots(nrows=len(floci), ncols=len(envts), sharex='col', sharey=False,figsize=(14*0.45,20*0.39),constrained_layout=True)
    # Scratch figure used only to read matplotlib's autoscaled limits. It must mirror the
    # layout of f0 (including sharex='col') because the limits read back depend on the sharing.
    fjunk,ajunk = plt.subplots(nrows=len(floci), ncols=len(envts), sharex='col', sharey=False,figsize=(20,20),constrained_layout=True)

    for e in np.arange(len(envts)):
        pred_col = ploidies[p]+'_'+envts[e]+'_s-pred_'+str(o)
        obs_col = ploidies[p]+'_'+envts[e]+'_s-obs'
        err_col = ploidies[p]+'_'+envts[e]+'_s-obs-err'

        data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+ploidies[p]+'_'+envts[e]+'_'+str(o)+'.txt',
                             sep='\t',names=['genotype',pred_col,obs_col,err_col],skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
        # (everything from the first row with a missing genotype onwards).
        first_blank = data1.loc[data1['genotype'].isnull()].index[0]
        data1 = data1.loc[:first_blank-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the obs s, remove the pred columns
        data1 = data1.drop(columns=[pred_col])

        # Average replicate rows of the same genotype, in order of first appearance.
        # A genotype seen once keeps its values; otherwise s is the mean and the error is
        # sqrt(mean(err**2)).
        # NOTE(review): the original computed the between-replicate variance and then
        # overwrote it with 0, so it never contributed; that dead computation is dropped here.
        records = []
        for geno,tempg in data1.groupby('genotype',sort=False):
            if len(tempg) == 1:
                s_val = tempg[obs_col].iloc[0]
                s_err = tempg[err_col].iloc[0]
            else:
                s_val = tempg[obs_col].mean()
                s_err = np.sqrt(np.mean(tempg[err_col]**2))
            records.append({'genotype':geno,'s':s_val,'stderr(s)':s_err})
        data2 = pd.DataFrame(records,columns=['genotype','s','stderr(s)'])

        # Create a column for each locus, and for the genotype with that locus removed.
        # (Loop variable is not `l`, which would overwrite the `numpy.linalg as l` alias.)
        for li in range(len(floci)):
            data2[floci[li]] = data2['genotype'].str[li]
        for li in range(len(floci)):
            data2['without_'+floci[li]] = data2['genotype'].str[:li] + data2['genotype'].str[li+1:]

        for li in range(len(floci)):
            background = 'without_'+floci[li]
            keep = ['genotype','s','stderr(s)',background]

            # Pair each genotype carrying '0' at this locus (_x) with the genotype that has
            # the same background but '1' at this locus (_y).
            tab0 = data2.loc[(data2[floci[li]] == '0'),keep].reset_index(drop=True)
            tab1 = data2.loc[(data2[floci[li]] == '1'),keep].reset_index(drop=True)
            temp = pd.merge(tab0,tab1,how='inner',on=background)

            # get the x and y limits from the scratch axes, then use one common square range
            ajunk[li][e].scatter(temp['s_x'],temp['s_y'])
            xs = ajunk[li][e].get_xlim()
            ys = ajunk[li][e].get_ylim()

            lower = min(xs[0],ys[0])
            upper = max(xs[1],ys[1])

            # plot the data
            markers,caps,bars = a0[li][e].errorbar(temp['s_x'],temp['s_y'],
                                                   xerr = temp['stderr(s)_x'],yerr = temp['stderr(s)_y'],alpha=0.7,
                                                   linestyle='None',elinewidth=0.5,marker='.',ms=0.7)

            for bar in bars:
                bar.set_alpha(0.5)

            # plot x = y
            xlist = np.linspace(lower,upper)
            a0[li][e].plot(xlist,xlist,c='k',lw=1,zorder=0)

            # get the regression line and plot it (res[0] is used as slope, res[1] as intercept)
            res = yorkreg_nocorr(temp['s_x'],temp['s_y'],temp['stderr(s)_x'],temp['stderr(s)_y'],yorkn)
            a0[li][e].plot(np.linspace(-1,1),np.linspace(-1,1)*res[0]+res[1],color='xkcd:blue',alpha=1,zorder=0,lw=0.7)

            a0[li][e].set_xlim(lower,upper)
            a0[li][e].set_ylim(lower,upper)

            # Set up some labels: locus names down the left edge, environments along the bottom
            if e == 0:
                a0[li][e].set_ylabel(floci[li], size=8, fontweight='bold')

            if li == len(floci)-1:
                a0[li][e].set_xlabel(envts[e], size=8, fontweight='bold')

    f0.align_ylabels()

    # Raw strings: '\p' is not a valid escape sequence in an ordinary string literal.
    f0.text(0.47,-0.01,r'$\phi$, S288C allele',size=8,fontweight='bold')

    f0.text(-0.03, 0.5, r'$\phi$, alternate allele', va='center', rotation='vertical',size=8, fontweight = 'bold')

    f0.savefig('msfigs/big-multipanel_humane_'+ploidies[p]+'.pdf',bbox_inches='tight',dpi=300)

    # The scratch figure has served its purpose; close it so it is neither shown nor kept in memory.
    plt.close(fjunk)

    plt.show()


In [None]:
# Make the 2 4-figure plots we'll actually be using in our supplement
# First, make the hap 4NQO PMA1 example, which illustrates an "intuitive" reversion

# common plot parameters
mysize = 1
myalpha = 0.5
elw = 0.5
ealpha = 0.15

yorkn=100

gowith = ['lasso_v2',11]
o=10

# Column-suffix pairs (x, y) for the two panel columns. After the merge below, '_x' is the
# genotype with '0' at the focal locus and '_y' the same background with '1' there.
# The right-hand column swaps the roles.
directions = [('_x','_y'),('_y','_x')]
delta_tags = ['original','reversion']

# Single-iteration loops select the example (ploidy 0, environment 1, locus 4). The axis
# labels and limits further down are specific to that choice, so widening these ranges
# also requires generalising them.
for p1 in np.arange(0,1):
    for e1 in np.arange(1,2):
        for l1a in np.arange(4,5):
            fig,ax = plt.subplots(nrows = 2, ncols = 2, sharex=True, sharey='row', figsize=(2.3,2))
            plt.subplots_adjust(wspace=0.5)

            pred_col = ploidies[p1]+'_'+envts[e1]+'_s-pred_'+str(o)
            obs_col = ploidies[p1]+'_'+envts[e1]+'_s-obs'
            err_col = ploidies[p1]+'_'+envts[e1]+'_s-obs-err'

            data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+ploidies[p1]+'_'+envts[e1]+'_'+str(o)+'.txt',
                                 sep='\t',names=['genotype',pred_col,obs_col,err_col],skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
            # (everything from the first row with a missing genotype onwards).
            first_blank = data1.loc[data1['genotype'].isnull()].index[0]
            data1 = data1.loc[:first_blank-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Since we're using the observed s, remove the predicted columns
            data1 = data1.drop(columns=[pred_col]).reset_index(drop=True)

            # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts
            # from having same genotype represented multiple times. A genotype seen once keeps its
            # values; otherwise s is the mean and the error is sqrt(mean(err**2)).
            # NOTE(review): the original computed the between-replicate variance and then
            # overwrote it with 0, so it never contributed; that dead computation is dropped here.
            records = []
            for geno,tempg in data1.groupby('genotype',sort=False):
                if len(tempg) == 1:
                    s_val = tempg[obs_col].iloc[0]
                    s_err = tempg[err_col].iloc[0]
                else:
                    s_val = tempg[obs_col].mean()
                    s_err = np.sqrt(np.mean(tempg[err_col]**2))
                records.append({'genotype':geno,'s':s_val,'stderr(s)':s_err})
            data2 = pd.DataFrame(records,columns=['genotype','s','stderr(s)'])

            data1 = data2

            # Create a column for each locus, and for the genotype with that locus removed.
            # (Loop variable is not `l`, which would overwrite the `numpy.linalg as l` alias.)
            for li in range(len(floci)):
                data1[floci[li]] = data1['genotype'].str[li]
            for li in range(len(floci)):
                data1['without_'+floci[li]] = data1['genotype'].str[:li] + data1['genotype'].str[li+1:]

            # Pair each '0'-at-focal-locus genotype (_x) with its '1' counterpart (_y)
            background = 'without_'+floci[l1a]
            keep = ['genotype','s','stderr(s)',background]
            tab0 = data1.loc[(data1[floci[l1a]] == '0'),keep].reset_index(drop=True)
            tab1 = data1.loc[(data1[floci[l1a]] == '1'),keep].reset_index(drop=True)

            temp = pd.merge(tab0,tab1,how='inner',on=background)

            # Integer 0/1 column per locus, taken from the '_x' genotype
            for i in np.arange(len(floci)):
                temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

            unit = np.linspace(-1,1)

            # TOP ROW (left, then right): fitness vs fitness with the x = y line and the
            # yorkreg_nocorr fit (res[0] used as slope, res[1] as intercept).
            for j,(sx,sy) in enumerate(directions):
                panel = ax[0][j]
                markers0,caps0,bars0 = panel.errorbar(temp['s'+sx],temp['s'+sy],
                            xerr = temp['stderr(s)'+sx],
                            yerr = temp['stderr(s)'+sy],
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                # Remember the data-driven limits so the long reference lines do not change them
                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.plot(unit,unit,color='k',zorder=0)

                res = yorkreg_nocorr(temp['s'+sx],temp['s'+sy],
                                     temp['stderr(s)'+sx],temp['stderr(s)'+sy],
                                     yorkn)
                panel.plot(unit,unit*res[0]+res[1],color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('full slope = '+str(res[0]))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # BOTTOM ROW (left, then right): fitness difference (y minus x role) against the
            # x-role fitness. The v02 figure fitted these with yorkreg_nocorr; v03 uses
            # ordinary least squares (linregress).
            delta_err = np.sqrt(temp['stderr(s)_x']**2+temp['stderr(s)_y']**2)
            for j,(sx,sy) in enumerate(directions):
                panel = ax[1][j]
                delta = temp['s'+sy] - temp['s'+sx]
                markers0,caps0,bars0 = panel.errorbar(temp['s'+sx],delta,
                            xerr = temp['stderr(s)'+sx],
                            yerr = delta_err,
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.axhline(y=0,color='xkcd:grey',lw=0.5)

                res = linregress(temp['s'+sx],delta)
                panel.plot(unit,unit*res.slope+res.intercept,color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('∆s '+delta_tags[j]+' full slope = '+str(res.slope))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # Set labels, ticks, and limits
            # (raw strings where needed: '\p' is not a valid escape in an ordinary literal)

            ax[1][0].set_xlabel('\n'.join(wrap('Fitness, PMA1 234S',9)))
            ax[1][1].set_xlabel('\n'.join(wrap('Fitness, PMA1 234C',9)))

            ax[1][0].set_ylabel('\n'.join(wrap(r'∆$\phi$, PMA1 S234C',11)))
            ax[1][1].set_ylabel('\n'.join(wrap(r'∆$\phi$, PMA1 C234S',11)),labelpad=5)

            ax[0][0].set_ylabel('\n'.join(wrap('Fitness, PMA1 234C',9)))
            ax[0][1].set_ylabel('\n'.join(wrap('Fitness, PMA1 234S',9)),labelpad=5)

            # Final, hand-picked limits for this example (these override the limits set above)
            for i in np.arange(2):
                for j in np.arange(2):
                    ax[i][j].set_xlim(-0.6,0.05)

            for i in np.arange(2):
                ax[0][i].set_ylim(-0.6,0.05)
                ax[1][i].set_ylim(-0.36,0.36)
                ax[0][i].set_yticks([-0.5,0])
                ax[1][i].set_yticks([-0.2,0,0.2])

            fig.align_ylabels()

            fig.savefig('msfigs/SIfigs/'+ploidies[p1]+envts[e1]+floci[l1a]+'_v03_linregressfordelta.pdf',bbox_inches='tight',dpi=3000)

            plt.show()

            print(ploidies[p1])
            print(envts[e1])
            print(floci[l1a])
            print('********************************')

In [None]:
# Make the same thing except for the "non-intuitive" example now

# common plot parameters
mysize = 1
myalpha = 0.5
elw = 0.5
ealpha = 0.15

yorkn=100

gowith = ['lasso_v2',11]
o=10

# Column-suffix pairs (x, y) for the two panel columns. After the merge below, '_x' is the
# genotype with '0' at the focal locus and '_y' the same background with '1' there.
# The right-hand column swaps the roles.
directions = [('_x','_y'),('_y','_x')]
delta_tags = ['original','reversion']

# Single-iteration loops select the example (ploidy 1, environment 3, locus 2). The axis
# labels and limits further down are specific to that choice, so widening these ranges
# also requires generalising them.
for p1 in np.arange(1,2):
    for e1 in np.arange(3,4):
        for l1a in np.arange(2,3):
            fig,ax = plt.subplots(nrows = 2, ncols = 2, sharex=True, sharey='row', figsize=(2.3,2))
            plt.subplots_adjust(wspace=0.5)

            pred_col = ploidies[p1]+'_'+envts[e1]+'_s-pred_'+str(o)
            obs_col = ploidies[p1]+'_'+envts[e1]+'_s-obs'
            err_col = ploidies[p1]+'_'+envts[e1]+'_s-obs-err'

            data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+ploidies[p1]+'_'+envts[e1]+'_'+str(o)+'.txt',
                                 sep='\t',names=['genotype',pred_col,obs_col,err_col],skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
            # (everything from the first row with a missing genotype onwards).
            first_blank = data1.loc[data1['genotype'].isnull()].index[0]
            data1 = data1.loc[:first_blank-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Since we're using the observed s, remove the predicted columns
            data1 = data1.drop(columns=[pred_col]).reset_index(drop=True)

            # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts
            # from having same genotype represented multiple times. A genotype seen once keeps its
            # values; otherwise s is the mean and the error is sqrt(mean(err**2)).
            # NOTE(review): the original computed the between-replicate variance and then
            # overwrote it with 0, so it never contributed; that dead computation is dropped here.
            records = []
            for geno,tempg in data1.groupby('genotype',sort=False):
                if len(tempg) == 1:
                    s_val = tempg[obs_col].iloc[0]
                    s_err = tempg[err_col].iloc[0]
                else:
                    s_val = tempg[obs_col].mean()
                    s_err = np.sqrt(np.mean(tempg[err_col]**2))
                records.append({'genotype':geno,'s':s_val,'stderr(s)':s_err})
            data2 = pd.DataFrame(records,columns=['genotype','s','stderr(s)'])

            data1 = data2

            # Create a column for each locus, and for the genotype with that locus removed.
            # (Loop variable is not `l`, which would overwrite the `numpy.linalg as l` alias.)
            for li in range(len(floci)):
                data1[floci[li]] = data1['genotype'].str[li]
            for li in range(len(floci)):
                data1['without_'+floci[li]] = data1['genotype'].str[:li] + data1['genotype'].str[li+1:]

            # Pair each '0'-at-focal-locus genotype (_x) with its '1' counterpart (_y)
            background = 'without_'+floci[l1a]
            keep = ['genotype','s','stderr(s)',background]
            tab0 = data1.loc[(data1[floci[l1a]] == '0'),keep].reset_index(drop=True)
            tab1 = data1.loc[(data1[floci[l1a]] == '1'),keep].reset_index(drop=True)

            temp = pd.merge(tab0,tab1,how='inner',on=background)

            # Integer 0/1 column per locus, taken from the '_x' genotype
            for i in np.arange(len(floci)):
                temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

            unit = np.linspace(-1,1)

            # TOP ROW (left, then right): fitness vs fitness with the x = y line and the
            # yorkreg_nocorr fit (res[0] used as slope, res[1] as intercept).
            for j,(sx,sy) in enumerate(directions):
                panel = ax[0][j]
                markers0,caps0,bars0 = panel.errorbar(temp['s'+sx],temp['s'+sy],
                            xerr = temp['stderr(s)'+sx],
                            yerr = temp['stderr(s)'+sy],
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                # Remember the data-driven limits so the long reference lines do not change them
                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.plot(unit,unit,color='k',zorder=0)

                res = yorkreg_nocorr(temp['s'+sx],temp['s'+sy],
                                     temp['stderr(s)'+sx],temp['stderr(s)'+sy],
                                     yorkn)
                panel.plot(unit,unit*res[0]+res[1],color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('full slope = '+str(res[0]))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # BOTTOM ROW (left, then right): fitness difference (y minus x role) against the
            # x-role fitness. The v02 figure fitted these with yorkreg_nocorr; v03 uses
            # ordinary least squares (linregress).
            delta_err = np.sqrt(temp['stderr(s)_x']**2+temp['stderr(s)_y']**2)
            for j,(sx,sy) in enumerate(directions):
                panel = ax[1][j]
                delta = temp['s'+sy] - temp['s'+sx]
                markers0,caps0,bars0 = panel.errorbar(temp['s'+sx],delta,
                            xerr = temp['stderr(s)'+sx],
                            yerr = delta_err,
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.axhline(y=0,color='xkcd:grey',lw=0.5)

                res = linregress(temp['s'+sx],delta)
                panel.plot(unit,unit*res.slope+res.intercept,color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('∆s '+delta_tags[j]+' full slope = '+str(res.slope))

                # (the original repeated this pair of calls twice for the last panel)
                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # Set labels, ticks, and limits
            # (raw strings where needed: '\p' is not a valid escape in an ordinary literal)

            ax[1][0].set_xlabel('\n'.join(wrap('Fitness, MKT1 30D',9)))
            ax[1][1].set_xlabel('\n'.join(wrap('Fitness, MKT1 30G',9)))

            ax[1][0].set_ylabel('\n'.join(wrap(r'∆$\phi$, MKT1 D30G',11)))
            ax[1][1].set_ylabel('\n'.join(wrap(r'∆$\phi$, MKT1 G30D',11)),labelpad=5)

            ax[0][0].set_ylabel('\n'.join(wrap('Fitness, MKT1 30G',9)))
            ax[0][1].set_ylabel('\n'.join(wrap('Fitness, MKT1 30D',9)),labelpad=5)

            # Final, hand-picked limits for this example (these override the limits set above)
            for i in np.arange(2):
                for j in np.arange(2):
                    ax[i][j].set_xlim(-0.7,0.02)

            for i in np.arange(2):
                ax[0][i].set_ylim(-0.7,0.02)
                ax[1][i].set_ylim(-0.4,0.4)
                ax[0][i].set_yticks([-0.5,0])
                ax[1][i].set_yticks([-0.3,0,0.3])

            fig.align_ylabels()

            fig.savefig('msfigs/SIfigs/'+ploidies[p1]+envts[e1]+floci[l1a]+'_v03_linregressfordelta.pdf',bbox_inches='tight',dpi=3000)

            plt.show()

            print(ploidies[p1])
            print(envts[e1])
            print(floci[l1a])
            print('********************************')


In [None]:
# 2x2 supplementary figure using least-squares regression for the delta panels
# (author's note: "hap 37 whi2"). Selectors are ploidies[0], envts[0], floci[7];
# the axis labels further down are hard-coded for WHI2 S262L, so presumably
# those indices pick that case -- confirm before changing the ranges.
#
# Top row   : fitness in one allele background vs. the other, identity line
#             (black) plus a yorkreg_nocorr fit (grey).
# Bottom row: fitness difference vs. starting fitness, zero line plus an
#             ordinary least-squares fit (scipy linregress). This is the "v03"
#             variant; the "v02" variant fitted the bottom row with
#             yorkreg_nocorr instead and saved to '..._v02.pdf'.

# common plot parameters
mysize = 1
myalpha = 0.5
elw = 0.5
ealpha = 0.15

yorkn=100

gowith = ['lasso_v2',11]
o=10

# Each loop visits a single index. To sweep everything, use
# np.arange(len(ploidies)) / np.arange(len(envts)) / np.arange(len(floci)),
# but note that the labels and axis limits below are specific to this case.
for p1 in np.arange(0,1):
    for e1 in np.arange(0,1):
        for l1a in np.arange(7,8):
            # Column names used in the input table for this ploidy/environment
            prefix = ploidies[p1]+'_'+envts[e1]
            pred_col = prefix+'_s-pred_'+str(o)
            obs_col = prefix+'_s-obs'
            err_col = prefix+'_s-obs-err'
            pair_key = 'without_'+floci[l1a]

            # 2x2 grid: shared x everywhere, shared y within each row
            fig,ax = plt.subplots(nrows = 2, ncols = 2, sharex=True, sharey='row', figsize=(2.3,2))
            plt.subplots_adjust(wspace=0.5)

            data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(o)+'.txt',
                                sep='\t',names=['genotype',pred_col,obs_col,err_col],skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
            # keep only the rows above the first row with a missing genotype.
            # (Raises IndexError if the file has no such row.)
            # .copy() so the assignments below act on an independent frame.
            data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Since we're using the observed s, remove the predicted columns
            data1 = data1.drop(columns=[pred_col]).reset_index(drop=True)

            # Average genotypes that appear more than once, to get rid of artifacts from
            # having the same genotype represented multiple times. For replicated
            # genotypes s is the mean and stderr(s) is the root-mean-square of the
            # replicate errors.
            glist = list(OrderedDict.fromkeys(data1['genotype']))

            records = []
            for g in np.arange(len(glist)):
                tempg = data1.loc[(data1['genotype'] == glist[g])].reset_index(drop=True)
                if len(tempg) == 1:
                    records.append({'genotype': glist[g],
                                    's': tempg.loc[0,obs_col],
                                    'stderr(s)': tempg.loc[0,err_col]})
                elif len(tempg) > 1:
                    # The between-replicate sample variance used to be computed here and was
                    # immediately overwritten with 0, so it never contributed; only the
                    # reported measurement errors enter stderr(s).
                    my_svar = 0
                    mymean_stderr = np.mean(tempg[err_col]**2)
                    records.append({'genotype': glist[g],
                                    's': tempg[obs_col].mean(),
                                    'stderr(s)': np.sqrt(my_svar+mymean_stderr)})

            # Build the averaged table in one go instead of growing it cell by cell
            data2 = pd.DataFrame(records)
            data1 = data2

            # Create a column for each locus, and for the genotype with that locus removed
            for l in np.arange(len(floci)):
                data1[floci[l]] = data1.loc[:,'genotype'].str[l]
            for l in np.arange(len(floci)):
                data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

            # Split by the allele at the focal locus and pair genotypes that are
            # identical everywhere else.
            pair_cols = ['genotype','s','stderr(s)',pair_key]
            tab0 = data1.loc[(data1[floci[l1a]] == '0')].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[pair_cols]
            tab1 = data1.loc[(data1[floci[l1a]] == '1')].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[pair_cols]

            # Left table is tab1, so the '_x' columns are the '1' allele and the '_y'
            # columns the '0' allele. Swap the arguments to flip the direction.
            temp = pd.merge(tab1,tab0,how='inner',on=pair_key)

            for i in np.arange(len(floci)):
                temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

            # TOP ROW: left panel is y-background vs x-background, right panel is the mirror image
            for panel,xa,ya in ((ax[0][0],'_x','_y'),(ax[0][1],'_y','_x')):
                markers0,caps0,bars0 = panel.errorbar(temp['s'+xa],temp['s'+ya],
                            xerr = temp['stderr(s)'+xa],
                            yerr = temp['stderr(s)'+ya],
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                # Remember the data-driven limits so the long reference lines do not rescale the panel
                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.plot(np.linspace(-1,1),np.linspace(-1,1),color='k',zorder=0)

                # Full regression slope
                res = yorkreg_nocorr(temp['s'+xa],temp['s'+ya],
                                     temp['stderr(s)'+xa],temp['stderr(s)'+ya],
                                     yorkn)
                panel.plot(np.linspace(-1,1),np.linspace(-1,1)*res[0]+res[1],color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('full slope = '+str(res[0]))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # BOTTOM ROW: fitness difference vs starting fitness, in both directions
            for panel,xa,ya,tag in ((ax[1][0],'_x','_y','original'),(ax[1][1],'_y','_x','reversion')):
                delta = temp['s'+ya] - temp['s'+xa]
                markers0,caps0,bars0 = panel.errorbar(temp['s'+xa],delta,
                            xerr = temp['stderr(s)'+xa],
                            yerr = np.sqrt(temp['stderr(s)_x']**2+temp['stderr(s)_y']**2),
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.axhline(y=0,color='xkcd:grey',lw=0.5)

                # Full regression slope (v03: ordinary least squares)
                res = linregress(temp['s'+xa],delta)
                panel.plot(np.linspace(-1,1),np.linspace(-1,1)*res.slope+res.intercept,color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('∆s '+tag+' full slope = '+str(res.slope))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # Set labels, ticks, and limits
            # ('\\phi' keeps the backslash explicit; a bare '\p' is an invalid escape sequence.)

            ax[1][0].set_xlabel('\n'.join(wrap('Fitness, WHI2 262S',9)))
            ax[1][1].set_xlabel('\n'.join(wrap('Fitness, WHI2 262L',9)))

            ax[1][0].set_ylabel('\n'.join(wrap('∆$\\phi$, WHI2 S262L',11)))
            ax[1][1].set_ylabel('\n'.join(wrap('∆$\\phi$, WHI2 L262S',11)),labelpad=5)

            ax[0][0].set_ylabel('\n'.join(wrap('Fitness, WHI2 262L',9)))
            ax[0][1].set_ylabel('\n'.join(wrap('Fitness, WHI2 262S',9)),labelpad=5)

            # Final, hand-picked limits for every panel (these override the per-panel limits above)
            for i in np.arange(2):
                for j in np.arange(2):
                    ax[i][j].set_xlim(-0.18,0.03)

            for i in np.arange(2):
                ax[0][i].set_ylim(-0.18,0.03)
                ax[1][i].set_ylim(-0.12,0.12)
                ax[0][i].set_yticks([-0.1,0])
                ax[1][i].set_yticks([-0.1,0,0.1])

            fig.align_ylabels()

            fig.savefig('msfigs/SIfigs/'+ploidies[p1]+envts[e1]+floci[l1a]+'_v03_linregressfordeltarev.pdf',bbox_inches='tight',dpi=3000)

            plt.show()

            print(ploidies[p1])
            print(envts[e1])
            print(floci[l1a])
            print('********************************')


In [None]:
# Same 2x2 supplementary figure, now for the flat-and-declining example.
# These example plots show how we can get diverse ∆s behaviors (up and down,
# down and down). Selectors are ploidies[0], envts[4], floci[0]; the axis
# labels further down are hard-coded for BUL2 L883F, so presumably those
# indices pick that case -- confirm before changing the ranges.
#
# Top row   : fitness in one allele background vs. the other, identity line
#             (black) plus a yorkreg_nocorr fit (grey).
# Bottom row: fitness difference vs. starting fitness, zero line plus an
#             ordinary least-squares fit (scipy linregress). This is the "v03"
#             variant; the "v02" variant fitted the bottom row with
#             yorkreg_nocorr instead and saved to '..._v02.pdf'.

# common plot parameters
mysize = 1
myalpha = 0.5
elw = 0.5
ealpha = 0.15

yorkn=100

gowith = ['lasso_v2',11]
o=10

# Each loop visits a single index. To sweep everything, use
# np.arange(len(ploidies)) / np.arange(len(envts)) / np.arange(len(floci)),
# but note that the labels and axis limits below are specific to this case.
for p1 in np.arange(0,1):
    for e1 in np.arange(4,5):
        for l1a in np.arange(0,1):
            # Column names used in the input table for this ploidy/environment
            prefix = ploidies[p1]+'_'+envts[e1]
            pred_col = prefix+'_s-pred_'+str(o)
            obs_col = prefix+'_s-obs'
            err_col = prefix+'_s-obs-err'
            pair_key = 'without_'+floci[l1a]

            # 2x2 grid: shared x everywhere, shared y within each row
            fig,ax = plt.subplots(nrows = 2, ncols = 2, sharex=True, sharey='row', figsize=(2.3,2))
            plt.subplots_adjust(wspace=0.5)

            data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(o)+'.txt',
                                sep='\t',names=['genotype',pred_col,obs_col,err_col],skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
            # keep only the rows above the first row with a missing genotype.
            # (Raises IndexError if the file has no such row.)
            # .copy() so the assignments below act on an independent frame.
            data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Since we're using the observed s, remove the predicted columns
            data1 = data1.drop(columns=[pred_col]).reset_index(drop=True)

            # Average genotypes that appear more than once, to get rid of artifacts from
            # having the same genotype represented multiple times. For replicated
            # genotypes s is the mean and stderr(s) is the root-mean-square of the
            # replicate errors.
            glist = list(OrderedDict.fromkeys(data1['genotype']))

            records = []
            for g in np.arange(len(glist)):
                tempg = data1.loc[(data1['genotype'] == glist[g])].reset_index(drop=True)
                if len(tempg) == 1:
                    records.append({'genotype': glist[g],
                                    's': tempg.loc[0,obs_col],
                                    'stderr(s)': tempg.loc[0,err_col]})
                elif len(tempg) > 1:
                    # The between-replicate sample variance used to be computed here and was
                    # immediately overwritten with 0, so it never contributed; only the
                    # reported measurement errors enter stderr(s).
                    my_svar = 0
                    mymean_stderr = np.mean(tempg[err_col]**2)
                    records.append({'genotype': glist[g],
                                    's': tempg[obs_col].mean(),
                                    'stderr(s)': np.sqrt(my_svar+mymean_stderr)})

            # Build the averaged table in one go instead of growing it cell by cell
            data2 = pd.DataFrame(records)
            data1 = data2

            # Create a column for each locus, and for the genotype with that locus removed
            for l in np.arange(len(floci)):
                data1[floci[l]] = data1.loc[:,'genotype'].str[l]
            for l in np.arange(len(floci)):
                data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

            # Split by the allele at the focal locus and pair genotypes that are
            # identical everywhere else.
            pair_cols = ['genotype','s','stderr(s)',pair_key]
            tab0 = data1.loc[(data1[floci[l1a]] == '0')].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[pair_cols]
            tab1 = data1.loc[(data1[floci[l1a]] == '1')].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[pair_cols]

            # Left table is tab0, so the '_x' columns are the '0' allele and the '_y'
            # columns the '1' allele. Swap the arguments to flip the direction.
            temp = pd.merge(tab0,tab1,how='inner',on=pair_key)

            for i in np.arange(len(floci)):
                temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

            # TOP ROW: left panel is y-background vs x-background, right panel is the mirror image
            for panel,xa,ya in ((ax[0][0],'_x','_y'),(ax[0][1],'_y','_x')):
                markers0,caps0,bars0 = panel.errorbar(temp['s'+xa],temp['s'+ya],
                            xerr = temp['stderr(s)'+xa],
                            yerr = temp['stderr(s)'+ya],
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                # Remember the data-driven limits so the long reference lines do not rescale the panel
                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.plot(np.linspace(-1,1),np.linspace(-1,1),color='k',zorder=0)

                # Full regression slope
                res = yorkreg_nocorr(temp['s'+xa],temp['s'+ya],
                                     temp['stderr(s)'+xa],temp['stderr(s)'+ya],
                                     yorkn)
                panel.plot(np.linspace(-1,1),np.linspace(-1,1)*res[0]+res[1],color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('full slope = '+str(res[0]))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # BOTTOM ROW: fitness difference vs starting fitness, in both directions
            for panel,xa,ya,tag in ((ax[1][0],'_x','_y','original'),(ax[1][1],'_y','_x','reversion')):
                delta = temp['s'+ya] - temp['s'+xa]
                markers0,caps0,bars0 = panel.errorbar(temp['s'+xa],delta,
                            xerr = temp['stderr(s)'+xa],
                            yerr = np.sqrt(temp['stderr(s)_x']**2+temp['stderr(s)_y']**2),
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.axhline(y=0,color='xkcd:grey',lw=0.5)

                # Full regression slope (v03: ordinary least squares)
                res = linregress(temp['s'+xa],delta)
                panel.plot(np.linspace(-1,1),np.linspace(-1,1)*res.slope+res.intercept,color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('∆s '+tag+' full slope = '+str(res.slope))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # Set labels, ticks, and limits
            # ('\\phi' keeps the backslash explicit; a bare '\p' is an invalid escape sequence.)

            ax[1][0].set_xlabel('\n'.join(wrap('Fitness, BUL2 883L',9)))
            ax[1][1].set_xlabel('\n'.join(wrap('Fitness, BUL2 883F',9)))

            ax[1][0].set_ylabel('\n'.join(wrap('∆$\\phi$, BUL2 L883F',11)))
            ax[1][1].set_ylabel('\n'.join(wrap('∆$\\phi$, BUL2 F883L',11)),labelpad=5)

            ax[0][0].set_ylabel('\n'.join(wrap('Fitness, BUL2 883F',9)))
            ax[0][1].set_ylabel('\n'.join(wrap('Fitness, BUL2 883L',9)),labelpad=5)

            # Final, hand-picked limits for every panel (these override the per-panel limits above).
            # y ticks are left at matplotlib's defaults for this figure.
            for i in np.arange(2):
                for j in np.arange(2):
                    ax[i][j].set_xlim(-0.13,0.02)

            for i in np.arange(2):
                ax[0][i].set_ylim(-0.13,0.02)
                ax[1][i].set_ylim(-0.07,0.07)

            fig.align_ylabels()

            fig.savefig('msfigs/SIfigs/'+ploidies[p1]+envts[e1]+floci[l1a]+'_v03_linregressfordelta.pdf',bbox_inches='tight',dpi=3000)

            plt.show()

            print(ploidies[p1])
            print(envts[e1])
            print(floci[l1a])
            print('********************************')


In [None]:
# A better version of the flat-down example, using least-squares regression
# for the delta panels (author's note: "hap ypda pma1"). Selectors are
# ploidies[0], envts[5], floci[4]; the axis labels further down are hard-coded
# for PMA1 C234S, so presumably those indices pick that case -- confirm before
# changing the ranges.
#
# Top row   : fitness in one allele background vs. the other, identity line
#             (black) plus a yorkreg_nocorr fit (grey).
# Bottom row: fitness difference vs. starting fitness, zero line plus an
#             ordinary least-squares fit (scipy linregress). This is the "v03"
#             variant; the "v02" variant fitted the bottom row with
#             yorkreg_nocorr instead and saved to '..._v02.pdf'.

# common plot parameters
mysize = 1
myalpha = 0.5
elw = 0.5
ealpha = 0.15

yorkn=100

gowith = ['lasso_v2',11]
o=10

# Each loop visits a single index. To sweep everything, use
# np.arange(len(ploidies)) / np.arange(len(envts)) / np.arange(len(floci)),
# but note that the labels and axis limits below are specific to this case.
for p1 in np.arange(0,1):
    for e1 in np.arange(5,6):
        for l1a in np.arange(4,5):
            # Column names used in the input table for this ploidy/environment
            prefix = ploidies[p1]+'_'+envts[e1]
            pred_col = prefix+'_s-pred_'+str(o)
            obs_col = prefix+'_s-obs'
            err_col = prefix+'_s-obs-err'
            pair_key = 'without_'+floci[l1a]

            # 2x2 grid: shared x everywhere, shared y within each row
            fig,ax = plt.subplots(nrows = 2, ncols = 2, sharex=True, sharey='row', figsize=(2.3,2))
            plt.subplots_adjust(wspace=0.5)

            data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(o)+'.txt',
                                sep='\t',names=['genotype',pred_col,obs_col,err_col],skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
            # keep only the rows above the first row with a missing genotype.
            # (Raises IndexError if the file has no such row.)
            # .copy() so the assignments below act on an independent frame.
            data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Since we're using the observed s, remove the predicted columns
            data1 = data1.drop(columns=[pred_col]).reset_index(drop=True)

            # Average genotypes that appear more than once, to get rid of artifacts from
            # having the same genotype represented multiple times. For replicated
            # genotypes s is the mean and stderr(s) is the root-mean-square of the
            # replicate errors.
            glist = list(OrderedDict.fromkeys(data1['genotype']))

            records = []
            for g in np.arange(len(glist)):
                tempg = data1.loc[(data1['genotype'] == glist[g])].reset_index(drop=True)
                if len(tempg) == 1:
                    records.append({'genotype': glist[g],
                                    's': tempg.loc[0,obs_col],
                                    'stderr(s)': tempg.loc[0,err_col]})
                elif len(tempg) > 1:
                    # The between-replicate sample variance used to be computed here and was
                    # immediately overwritten with 0, so it never contributed; only the
                    # reported measurement errors enter stderr(s).
                    my_svar = 0
                    mymean_stderr = np.mean(tempg[err_col]**2)
                    records.append({'genotype': glist[g],
                                    's': tempg[obs_col].mean(),
                                    'stderr(s)': np.sqrt(my_svar+mymean_stderr)})

            # Build the averaged table in one go instead of growing it cell by cell
            data2 = pd.DataFrame(records)
            data1 = data2

            # Create a column for each locus, and for the genotype with that locus removed
            for l in np.arange(len(floci)):
                data1[floci[l]] = data1.loc[:,'genotype'].str[l]
            for l in np.arange(len(floci)):
                data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

            # Split by the allele at the focal locus and pair genotypes that are
            # identical everywhere else.
            pair_cols = ['genotype','s','stderr(s)',pair_key]
            tab0 = data1.loc[(data1[floci[l1a]] == '0')].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[pair_cols]
            tab1 = data1.loc[(data1[floci[l1a]] == '1')].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[pair_cols]

            # Left table is tab1, so the '_x' columns are the '1' allele and the '_y'
            # columns the '0' allele. Swap the arguments to flip the direction.
            temp = pd.merge(tab1,tab0,how='inner',on=pair_key)

            for i in np.arange(len(floci)):
                temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

            # TOP ROW: left panel is y-background vs x-background, right panel is the mirror image
            for panel,xa,ya in ((ax[0][0],'_x','_y'),(ax[0][1],'_y','_x')):
                markers0,caps0,bars0 = panel.errorbar(temp['s'+xa],temp['s'+ya],
                            xerr = temp['stderr(s)'+xa],
                            yerr = temp['stderr(s)'+ya],
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                # Remember the data-driven limits so the long reference lines do not rescale the panel
                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.plot(np.linspace(-1,1),np.linspace(-1,1),color='k',zorder=0)

                # Full regression slope
                res = yorkreg_nocorr(temp['s'+xa],temp['s'+ya],
                                     temp['stderr(s)'+xa],temp['stderr(s)'+ya],
                                     yorkn)
                panel.plot(np.linspace(-1,1),np.linspace(-1,1)*res[0]+res[1],color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('full slope = '+str(res[0]))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # BOTTOM ROW: fitness difference vs starting fitness, in both directions
            for panel,xa,ya,tag in ((ax[1][0],'_x','_y','original'),(ax[1][1],'_y','_x','reversion')):
                delta = temp['s'+ya] - temp['s'+xa]
                markers0,caps0,bars0 = panel.errorbar(temp['s'+xa],delta,
                            xerr = temp['stderr(s)'+xa],
                            yerr = np.sqrt(temp['stderr(s)_x']**2+temp['stderr(s)_y']**2),
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color='xkcd:dark grey',elinewidth=elw)

                for bar in bars0:
                    bar.set_alpha(ealpha)

                xs = panel.get_xlim()
                ys = panel.get_ylim()

                panel.axhline(y=0,color='xkcd:grey',lw=0.5)

                # Full regression slope (v03: ordinary least squares)
                res = linregress(temp['s'+xa],delta)
                panel.plot(np.linspace(-1,1),np.linspace(-1,1)*res.slope+res.intercept,color='xkcd:dark grey',alpha=1,zorder=0,lw=0.6)
                print('∆s '+tag+' full slope = '+str(res.slope))

                panel.set_xlim(xs)
                panel.set_ylim(ys)

            # Set labels, ticks, and limits
            # ('\\phi' keeps the backslash explicit; a bare '\p' is an invalid escape sequence.)

            ax[1][0].set_xlabel('\n'.join(wrap('Fitness, PMA1 234C',9)))
            ax[1][1].set_xlabel('\n'.join(wrap('Fitness, PMA1 234S',9)))

            ax[1][0].set_ylabel('\n'.join(wrap('∆$\\phi$, PMA1 C234S',11)))
            ax[1][1].set_ylabel('\n'.join(wrap('∆$\\phi$, PMA1 S234C',11)),labelpad=5)

            ax[0][0].set_ylabel('\n'.join(wrap('Fitness, PMA1 234S',9)))
            ax[0][1].set_ylabel('\n'.join(wrap('Fitness, PMA1 234C',9)),labelpad=5)

            # Final, hand-picked limits for every panel (these override the per-panel limits above).
            # y ticks are left at matplotlib's defaults for this figure.
            for i in np.arange(2):
                for j in np.arange(2):
                    ax[i][j].set_xlim(-0.65,0.1)

            for i in np.arange(2):
                ax[0][i].set_ylim(-0.65,0.1)
                ax[1][i].set_ylim(-0.6,0.6)

            fig.align_ylabels()

            fig.savefig('msfigs/SIfigs/'+ploidies[p1]+envts[e1]+floci[l1a]+'_v03_linregressfordeltarev.pdf',bbox_inches='tight',dpi=3000)

            plt.show()

            print(ploidies[p1])
            print(envts[e1])
            print(floci[l1a])
            print('********************************')


In [None]:
# Fig 3C

# Define our ploidies and loci for the TOP ROW
p1 = 0
e1 = 0
l1a = 5
l2a = 7
l3a = 9


# common plot parameters
mysize = 1
myalpha = 0.5
elw = 0.5
ealpha = 0.3

yorkn=100

gowith = ['lasso_v2',11]
o=10

fig,ax = plt.subplots(nrows = 1, ncols = 3, sharex=True, sharey=True, figsize=(2.52,0.8))
fig.subplots_adjust(wspace=0.15)

# 3Ci
data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+ploidies[p1]+'_'+envts[e1]+'_'+str(o)+'.txt',
                     sep='\t',names=['genotype',ploidies[p1]+'_'+envts[e1]+'_s-pred_'+str(o),ploidies[p1]+'_'+envts[e1]+'_s-obs',ploidies[p1]+'_'+envts[e1]+'_s-obs-err'],skiprows=2,skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:]

# Binary style for genotype
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

# Since we're using the observed s, remove the predicted columns
data1 = data1.drop(columns=[ploidies[p1]+'_'+envts[e1]+'_s-pred_'+str(o)]).reset_index(drop=True)

# Average genotypes, propagating error. Reason for this is to get rid of weird artifacts from having same genotype
# represented multiple times.
glist = list(OrderedDict.fromkeys(data1['genotype']))

data2 = pd.DataFrame()

for g in np.arange(len(glist)):
    tempg = data1.loc[(data1['genotype'] == glist[g])].reset_index(drop=True)
    if len(tempg) == 1:
        data2.at[g,'genotype'] = glist[g]
        data2.at[g,'s'] = tempg.loc[0,ploidies[p1]+'_'+envts[e1]+'_s-obs']
        data2.at[g,'stderr(s)'] = tempg.loc[0,ploidies[p1]+'_'+envts[e1]+'_s-obs-err']
    elif len(tempg) > 1:
        data2.at[g,'genotype'] = glist[g]
        data2.at[g,'s'] = tempg[ploidies[p1]+'_'+envts[e1]+'_s-obs'].mean()
        my_svar = statistics.variance(tempg[ploidies[p1]+'_'+envts[e1]+'_s-obs'])
        my_svar = 0
        mymean_stderr = np.mean(tempg[ploidies[p1]+'_'+envts[e1]+'_s-obs-err']**2)
        data2.at[g,'stderr(s)'] = np.sqrt(my_svar+mymean_stderr)
        #data2.at[g,'stderr(s)'] = np.sqrt(np.sum(tempg[ploidies[p]+'_'+envts[e]+'_s-obs-err']**2))/len(tempg)

data1 = data2

# Create a column for each locus, and for the genotype with that locus removed
for l in np.arange(len(floci)):
    data1[floci[l]] = data1.loc[:,'genotype'].str[l]
for l in np.arange(len(floci)):
    data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]


# Pair each genotype lacking the focal mutation (columns suffixed _x) with the
# genotype that matches it at every other locus but carries the mutation (_y).
focal_locus = floci[l1a]
pair_cols = ['genotype','s','stderr(s)','without_'+focal_locus]

tab0 = data1[data1[focal_locus] == '0'].copy().reset_index(drop=True)
tab0 = tab0[pair_cols]
tab1 = data1[data1[focal_locus] == '1'].copy().reset_index(drop=True)
tab1 = tab1[pair_cols]

temp = tab0.merge(tab1, how='inner', on='without_'+focal_locus)

# Integer 0/1 column per locus, read off the _x (focal-locus == '0') genotype string
for pos, locus in enumerate(floci):
    temp[locus] = temp['genotype_x'].str[pos].astype(int)

# work in the second locus
second_locus = floci[l2a]
templ20 = temp[temp[second_locus] == 0].copy().reset_index(drop=True)
templ21 = temp[temp[second_locus] == 1].copy().reset_index(drop=True)

# Create a version where we have a third locus
third_locus = floci[l3a]
templ200 = templ20[templ20[third_locus] == 0].copy().reset_index(drop=True)
templ201 = templ20[templ20[third_locus] == 1].copy().reset_index(drop=True)
templ210 = templ21[templ21[third_locus] == 0].copy().reset_index(drop=True)
templ211 = templ21[templ21[third_locus] == 1].copy().reset_index(drop=True)

# Point clouds with x/y error bars.  Each entry is (axes index, table, colour):
# panel 0 shows every pair, panel 1 splits by the second locus, panel 2 by the
# second and third locus.
errorbar_specs = [
    (0, temp,     'xkcd:dark grey'),
    (1, templ20,  'xkcd:cerulean'),
    (1, templ21,  'xkcd:orange'),
    (2, templ200, 'xkcd:cerulean'),
    (2, templ201, 'xkcd:topaz'),
    (2, templ210, 'xkcd:orange'),
    (2, templ211, 'xkcd:wine red'),
]

errorbar_handles = []
for ax_idx, tab, colour in errorbar_specs:
    markers, caps, bars = ax[ax_idx].errorbar(tab['s_x'],tab['s_y'],
                                              xerr = tab['stderr(s)_x'],
                                              yerr = tab['stderr(s)_y'],
                                              alpha=myalpha,linestyle='None',marker='.',ms=mysize,
                                              color=colour,elinewidth=elw)
    # Error-bar lines get their own (fainter) transparency than the markers
    for bar in bars:
        bar.set_alpha(ealpha)
    errorbar_handles.append((markers, caps, bars))


#ax[1].set_xlabel('MKT1 WT')
#ax[0].set_ylabel(floci[l1a]+' Mut')
#fig.text(0.44,-0.1,floci[l]+' WT',fontsize=7)
#plt.title(ploidies[p]+'-'+envts[e]+', orange = '+floci[l2]+' = 1')

# Record the middle panel's current limits before the y = x guide lines are drawn,
# then re-apply those limits to all three panels afterwards.
xs = ax[1].get_xlim()
ys = ax[1].get_ylim()

# for hap/hom 37 MKT1, RHO5
#xs = (-0.14,0.025)
#ys = (-0.14,0.025)

#fig.legend(['WT','Mut'],title=floci[l2],loc='lower right',handlelength=1,bbox_to_anchor=(0.98,0.28))

identity = np.linspace(-1,1)
for panel_idx in range(3):
    ax[panel_idx].plot(identity,identity,color='k',zorder=0)
for panel_idx in range(3):
    ax[panel_idx].set_xlim(xs)
    ax[panel_idx].set_ylim(ys)

# Regression lines for the full set of pairs and for every subset.
# yorkreg_nocorr is defined elsewhere in the notebook; it is called with x, y, the
# two standard-error columns and yorkn, and its result is used here as
# (slope, intercept) = (res[0], res[1]).
# Each entry is (axes index, table, colour, label printed before the slope).
fit_second = floci[l2a]
fit_third = floci[l3a]
fit_specs = [
    (0, temp,     'xkcd:dark grey', 'full slope = '),
    (1, templ20,  'xkcd:cerulean',  'without'+fit_second+' slope = '),
    (1, templ21,  'xkcd:orange',    'with'+fit_second+' slope = '),
    (2, templ200, 'xkcd:cerulean',  'no'+fit_second+'no'+fit_third+' slope = '),
    (2, templ201, 'xkcd:topaz',     'no'+fit_second+'yes'+fit_third+' slope = '),
    (2, templ210, 'xkcd:orange',    'yes'+fit_second+'no'+fit_third+' slope = '),
    (2, templ211, 'xkcd:wine red',  'yes'+fit_second+'yes'+fit_third+' slope = '),
]

fit_x = np.linspace(-1,1)
for ax_idx, tab, colour, label in fit_specs:
    res = yorkreg_nocorr(tab['s_x'],tab['s_y'],
                         #[0]*len(tab),[0]*len(tab),
                         tab['stderr(s)_x'],tab['stderr(s)_y'],
                         yorkn)
    ax[ax_idx].plot(fit_x,fit_x*res[0]+res[1],color=colour,alpha=1,zorder=0,lw=0.6)
    print(label+str(res[0]))

# Final, fixed axis window shared by the three panels
for panel_idx in range(3):
    ax[panel_idx].set_xlim(-0.17,0.02)
    ax[panel_idx].set_ylim(-0.17,0.02)

ax[0].set_yticks([-0.1,0])

# Wrap the y label at 10 characters so it fits beside the narrow panel
ax[0].set_ylabel('\n'.join(wrap('Fitness, RHO5 10S',10)))

fig.align_ylabels()

# Shared x label, placed in figure coordinates
fig.text(0.35,-0.25,'Fitness, RHO5 10G',fontsize=7)


# Legend geometry
mylegmark = 3
mycs = 0.1
myhtp = -0.4
myls = 0.1

# Keyword sets shared by every legend marker / every legend
dot_kw = dict(marker='.', linestyle='None', markersize=mylegmark)
legend_kw = dict(ncol=2, handletextpad=myhtp, loc='lower right',
                 borderpad=0.2, columnspacing=mycs, labelspacing=myls, frameon=False)

# Two-colour legend (title 'WHI2')
dot1 = Line2D([], [], color='xkcd:cerulean', label='L', **dot_kw)
dot2 = Line2D([], [], color='xkcd:orange', label='S', **dot_kw)

leg2a = fig.legend(handles=[dot1,dot2], bbox_to_anchor=(0.69,1.22), title='WHI2', **legend_kw)
leg2a._legend_box.align = "left"
leg2a.get_title().set_position((5, 0))

# Four-colour legend (title 'WHI2, AKL1')
dot1 = Line2D([], [], color='xkcd:cerulean', label='L, S', **dot_kw)
dot2 = Line2D([], [], color='xkcd:topaz', label='L, P', **dot_kw)
dot3 = Line2D([], [], color='xkcd:orange', label='S, S', **dot_kw)
dot4 = Line2D([], [], color='xkcd:wine red', label='S, P', **dot_kw)

leg3a = fig.legend(handles=[dot1,dot2,dot3,dot4], bbox_to_anchor=(1.05,1.22), title='WHI2, AKL1', **legend_kw)
leg3a._legend_box.align = "left"
leg3a.get_title().set_position((5, 0))




fig.savefig('msfigs/Fig3/3Ci_v01.pdf',bbox_inches='tight',dpi=3000)
#fig.savefig('msfigs/Fig3/3Ci_v01.jpg',bbox_inches='tight',dpi=300)

plt.show()


In [None]:
# Fig 3D
# Redo the plot immediately above but flip to be a 4x2 instead of a 2x4
#
# locinow layout: [focal locus, background locus 1, background locus 2, background locus 3,
#                  ploidy start, ploidy stop, envt start, envt stop]
# (the last four entries are used as np.arange bounds into ploidies / envts)
locinow = ['AKL1','MKT1','RHO5','WHI2',1,2,4,5]
#locinow = ['MKT1','WHI2','AKL1','PMA1',1,2,4,5]
#locinow = ['WHI2','MKT1','AKL1','PMA1',0,1,4,5]
#locinow = ['RHO5','WHI2','MKT1','BUL2',1,2,1,2]

yorkn = 100

o=10

# Styling shared by every panel (values are the ones in effect when plotting)
mysize = 1.5
myalpha = 0.7
elw = 0.5
ealpha = 0.15
lws = 0.1
mymec = 'xkcd:dark grey'
bgcolor = '#C0C7BD'
colorsnow = ['xkcd:cerulean','xkcd:orange','xkcd:bright green','xkcd:indigo','xkcd:pink','xkcd:rust','xkcd:gold','xkcd:bright aqua']

# Fixed axis window applied to all panels
xs = (-0.12,0.01)
ys = (-0.12,0.01)

for p in np.arange(locinow[4],locinow[5]):
    for e in np.arange(locinow[6],locinow[7]):
        prefix = ploidies[p]+'_'+envts[e]
        pred_col = prefix+'_s-pred_10'
        obs_col = prefix+'_s-obs'
        err_col = prefix+'_s-obs-err'

        # Do this same thing with observed values
        data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+prefix+'_'+str(o)+'.txt',
                            sep='\t',names=['genotype',pred_col,obs_col,err_col],skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
        # (.copy() so the genotype assignment below does not write into a slice)
        data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the observed s, remove the pred columns
        data1 = data1.drop(columns=[pred_col]).reset_index(drop=True)

        # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts from having same genotype
        # represented multiple times.
        glist = list(OrderedDict.fromkeys(data1['genotype']))

        # One record per genotype; sort=False keeps the first-appearance order of glist.
        records = []
        for genotype, reps in data1.groupby('genotype', sort=False):
            if len(reps) == 1:
                s_val = reps[obs_col].iloc[0]
                s_err = reps[err_col].iloc[0]
            else:
                s_val = reps[obs_col].mean()
                # Root-mean-square of the replicate standard errors; the between-replicate
                # variance term was deliberately zeroed in the original analysis.
                s_err = np.sqrt((reps[err_col]**2).mean())
                #s_err = np.sqrt(np.sum(reps[err_col]**2))/len(reps)
            records.append({'genotype': genotype, 's': s_val, 'stderr(s)': s_err})

        data2 = pd.DataFrame(records, columns=['genotype','s','stderr(s)'])
        data1 = data2

        # Create a column for each locus, and for the genotype with that locus removed.
        # (enumerate avoids rebinding `l`, the numpy.linalg alias from the imports.)
        for pos, locus in enumerate(floci):
            data1[locus] = data1.loc[:,'genotype'].str[pos]
        for pos, locus in enumerate(floci):
            data1['without_'+locus] = data1.loc[:,'genotype'].str[:pos] + data1.loc[:,'genotype'].str[pos+1:]

        # Pair each genotype lacking the focal mutation (_x) with the one that matches
        # it at every other locus but carries the mutation (_y).
        pair_cols = ['genotype','s','stderr(s)','without_'+locinow[0]]
        tab0 = data1.loc[(data1[locinow[0]] == '0')].copy(deep=True).reset_index(drop=True)
        tab0 = tab0[pair_cols]
        tab1 = data1.loc[(data1[locinow[0]] == '1')].copy(deep=True).reset_index(drop=True)
        tab1 = tab1[pair_cols]

        temp = pd.merge(tab0,tab1,how='inner',on='without_'+locinow[0])

        # Integer 0/1 column per locus, read off the _x genotype string
        for pos, locus in enumerate(floci):
            temp[locus] = temp.loc[:,'genotype_x'].str[pos].astype(int)

        # FIGURE DEFINITION
        fig,ax = plt.subplots(nrows = 4, ncols = 2, sharex=True, sharey=True, figsize=(1.36,2.84))
        fig.subplots_adjust(wspace=0.2,hspace=0.2)

        guide = np.linspace(-1,1)

        # One panel per combination (a, b, c) of the three background loci.
        # Column = state of background locus 1; row = 2*b + c; colour index = 4*a + 2*b + c.
        mytoplot = {}   # '000' ... '111' -> subset of temp
        subres = {}     # '000' ... '111' -> yorkreg_nocorr result for that subset
        for a in (0,1):
            for b in (0,1):
                for c in (0,1):
                    key = str(a)+str(b)+str(c)
                    sub = temp.loc[(temp[locinow[1]] == a) & (temp[locinow[2]] == b) & (temp[locinow[3]] == c)]
                    panel = ax[2*b+c][a]
                    colour = colorsnow[4*a+2*b+c]

                    # Highlighted points for this background
                    markers, caps, bars = panel.errorbar(sub['s_x'],sub['s_y'],
                                                         xerr = sub['stderr(s)_x'],
                                                         yerr = sub['stderr(s)_y'],
                                                         alpha=myalpha,linestyle='None',marker='.',ms=mysize*3+lws,mew=lws,mec=mymec,
                                                         color=colour,elinewidth=elw,zorder=4)
                    for bar in bars:
                        bar.set_alpha(ealpha)

                    # subslopes: result is used as (slope, intercept) = (fit[0], fit[1])
                    fit = yorkreg_nocorr(sub['s_x'],sub['s_y'],sub['stderr(s)_x'],sub['stderr(s)_y'],yorkn)
                    panel.plot(guide,guide*fit[0]+fit[1],color=colour,alpha=0.7,zorder=3,lw=0.6)

                    mytoplot[key] = sub
                    subres[key] = fit

        #leave out for now until can fix
        #fig.legend(['000','001','010','011','100','101','110','111'],bbox_to_anchor=(0.9, 1), loc='upper left')

        # Full regression slope (all pairs), repeated on every panel for reference
        res = yorkreg_nocorr(temp['s_x'],temp['s_y'],temp['stderr(s)_x'],temp['stderr(s)_y'],yorkn)
        #print('full slope = '+str(res[0][0]))

        for panel in ax.flat:
            # All pairs in the background colour, underneath the highlighted subset
            markers, caps, bars = panel.errorbar(temp['s_x'],temp['s_y'],
                                                 xerr = temp['stderr(s)_x'],
                                                 yerr = temp['stderr(s)_y'],
                                                 alpha=myalpha,linestyle='None',marker='.',ms=mysize,color=bgcolor,elinewidth=elw,zorder=2)
            for bar in bars:
                bar.set_alpha(ealpha)

            # y = x guide and the full-data regression line
            panel.plot(guide,guide,color='xkcd:black',zorder=0,lw=0.8)
            panel.plot(guide,guide*res[0]+res[1],color='xkcd:dark grey',alpha=0.7,zorder=1,lw=0.6)

            panel.set_xlim(xs)
            panel.set_ylim(ys)

        # NOTE: the axis labels and the output file name below are fixed strings
        # (AKL1); they are not derived from locinow, and the same file is rewritten
        # on every (p, e) iteration.
        fig.text(-0.14,0.37,'Fitness, AKL1 176P',fontsize=7,rotation='vertical')
        fig.text(0.2,0.02,'Fitness, AKL1 176S',fontsize=7)

        for row in range(4):
            ax[row][0].set_yticks([-0.1,0])
            ax[row][0].tick_params(axis='y', which='major', pad=0.7)

        fig.savefig('msfigs/Fig3/homsulocAKL1_8panel_RAW_newerr_2cols.pdf',bbox_inches='tight',dpi=1000)



In [None]:
# Figure 3B
# Will do a bunch of hierarchical linear regressions now.
# First, do a linear regression on the whole batch of points.
# Then separate into two batches based on an independent mutation (10 pairs total) and perform regressions on each.
# Then divide further into 4 batches based on pairwise combinations of mutations in the background.
# The 4-batch (two background loci) level is the deepest split computed in this cell.
#
# Results: o1tab (one row per ploidy/envt/focal locus), o2tab (plus one background
# locus) and o3tab (plus two background loci), each holding slope, intercept and
# R^2 of the fit of s-pred(with focal mutation) against s-pred(without it).

gowith = ['lasso_v2',11]


def _linfit_columns(frame, xcol, ycol, tag):
    """Least-squares fit of frame[ycol] on frame[xcol] via scipy.stats.linregress.

    Returns a dict with keys 'slope-<tag>', 'intercept-<tag>' and 'r^2-<tag>'
    (in that order) so it can be merged straight into a result row.
    """
    fit = linregress(frame[xcol], frame[ycol])
    return {'slope-'+tag: fit.slope,
            'intercept-'+tag: fit.intercept,
            'r^2-'+tag: fit.rvalue**2}


# Rows are collected in plain lists and turned into DataFrames once at the end;
# growing a DataFrame one cell at a time is very slow for the o3 table.
o1rows = []
o2rows = []
o3rows = []

o = 10

# Not used by the regressions below; kept because they are assigned at notebook
# scope and other cells may read them.
mysize = 15
myalpha = 0.5

#for p in np.arange(0,1):
for p in np.arange(len(ploidies)):
    #for e in np.arange(0,1):
    for e in np.arange(len(envts)):
        prefix = ploidies[p]+'_'+envts[e]
        pred_col = prefix+'_s-pred_'+str(o)
        xcol = pred_col+'_x'   # prediction for the genotype without the focal mutation
        ycol = pred_col+'_y'   # prediction for the matching genotype with it

        # Get the data
        data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(o)+'.txt',
                            sep='\t',names=['genotype',pred_col,prefix+'_s-obs',prefix+'_s-obs-err'],skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
        # (.copy() so the genotype assignment below does not write into a slice)
        data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the predicted s, remove the obs columns
        data1 = data1.drop(columns=[prefix+'_s-obs',prefix+'_s-obs-err']).drop_duplicates('genotype').reset_index(drop=True)

        # Create a column for each locus, and for the genotype with that locus removed.
        # (enumerate avoids rebinding `l`, the numpy.linalg alias from the imports.)
        for pos, locus in enumerate(floci):
            data1[locus] = data1.loc[:,'genotype'].str[pos]
        for pos, locus in enumerate(floci):
            data1['without_'+locus] = data1.loc[:,'genotype'].str[:pos] + data1.loc[:,'genotype'].str[pos+1:]

        # So we can plot in different ways, import the background-averaged fitness effects of each mutation.
        # data0 is not used by the regressions in this cell; it is still built here
        # because it is left at notebook scope for whatever reads it afterwards.
        data0 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(gowith[1]-1)+'.txt',
                            sep='\t',names=['todelete','genotype',prefix+'_s-pred_'+str(gowith[1]-1),'na'],skiprows=2,skip_blank_lines=False)

        # Here the rows BELOW the first blank-genotype row are the ones kept
        data0 = data0.loc[data0.loc[(data0['genotype'].isnull())].index.tolist()[0]+1:,:].copy()

        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)

        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for pos, locus in enumerate(floci):
            data0[locus] = data0.loc[:,'genotype'].str[pos].astype(int)

        data0['numMut'] = data0[floci].sum(axis=1)

        # Keep only the single-mutation rows
        data0 = data0.loc[(data0['numMut'] == 1)].sort_values(by='genotype',ascending=False).reset_index(drop=True)

        # Divvy up the data by focal locus
        #for l1 in np.arange(2,3):
        for l1 in np.arange(len(floci)):

            tab0 = data1.loc[(data1[floci[l1]] == '0')].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[['genotype',pred_col,'without_'+floci[l1]]]
            tab1 = data1.loc[(data1[floci[l1]] == '1')].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[['genotype',pred_col,'without_'+floci[l1]]]

            temp = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l1])

            for pos, locus in enumerate(floci):
                temp[locus] = temp.loc[:,'genotype_x'].str[pos].astype(int)

            # First order: all pairs for this focal locus
            row1 = {'ploidy': ploidies[p], 'envt': envts[e], 'locus': floci[l1]}
            row1.update(_linfit_columns(temp, xcol, ycol, 'o1'))
            o1rows.append(row1)

            for l2 in np.arange(len(floci)):
                if l1 == l2:
                    continue

                # Second order: split on the state of one background locus
                templ20 = temp.loc[(temp[floci[l2]] == 0)]
                templ21 = temp.loc[(temp[floci[l2]] == 1)]

                row2 = {'ploidy': ploidies[p], 'envt': envts[e],
                        'locus-1': floci[l1], 'locus-2': floci[l2]}
                row2.update(_linfit_columns(templ20, xcol, ycol, 'o2, 0'))
                row2.update(_linfit_columns(templ21, xcol, ycol, 'o2, 1'))
                o2rows.append(row2)

                for l3 in np.arange(len(floci)):
                    if l3 == l2 or l3 == l1:
                        continue

                    # Third order: split on two background loci; the tag digits are
                    # (state of locus-2, state of locus-3)
                    row3 = {'ploidy': ploidies[p], 'envt': envts[e],
                            'locus-1': floci[l1], 'locus-2': floci[l2], 'locus-3': floci[l3]}
                    row3.update(_linfit_columns(templ20.loc[(templ20[floci[l3]] == 0)], xcol, ycol, 'o3, 00'))
                    row3.update(_linfit_columns(templ20.loc[(templ20[floci[l3]] == 1)], xcol, ycol, 'o3, 01'))
                    row3.update(_linfit_columns(templ21.loc[(templ21[floci[l3]] == 0)], xcol, ycol, 'o3, 10'))
                    row3.update(_linfit_columns(templ21.loc[(templ21[floci[l3]] == 1)], xcol, ycol, 'o3, 11'))
                    o3rows.append(row3)

o1tab = pd.DataFrame(o1rows)
o2tab = pd.DataFrame(o2rows)
o3tab = pd.DataFrame(o3rows)

In [None]:
# Number of first-order regressions collected.
# NOTE(review): the original line had an empty subscript (`o1tab[]`), which is a
# SyntaxError; the filter that was meant to go there is unknown, so this counts
# all rows. Add a boolean mask inside o1tab.loc[...] if a subset was intended.
len(o1tab)

In [None]:
# Quick look at the first-order fits: slope vs R^2 and slope vs intercept scatter
# plots, then histograms of slope and R^2 with their mean and median, and finally
# the fraction of slopes further than 0.1 from 1.
first_order_slopes = o1tab['slope-o1']

# (column plotted on y, y-axis label, whether to add the y = 0 reference line)
scatter_specs = (
    ('r^2-o1', 'R^2', False),
    ('intercept-o1', 'intercept', True),
)
for ycol, ylab, add_zero_line in scatter_specs:
    plt.scatter(first_order_slopes,o1tab[ycol],linestyle='None',alpha=0.5)
    if add_zero_line:
        plt.axhline(y=0,c='xkcd:grey',lw=0.5,zorder=0)
    plt.axvline(x=1,c='xkcd:grey',lw=0.5,zorder=0)
    plt.xlabel('slope')
    plt.ylabel(ylab)
    plt.title('first order')
    plt.show()

# (column, label used on the x axis and in the printed summary)
for col, lab in (('slope-o1', 'slope'), ('r^2-o1', 'R^2')):
    plt.hist(o1tab[col],bins=30)
    plt.xlabel(lab)
    plt.show()
    print('mean '+lab+' = '+str(o1tab[col].mean()))
    print('median '+lab+' = '+str(o1tab[col].median()))

far_from_one = (first_order_slopes - 1).abs() > .1
myperc = len(o1tab.loc[far_from_one])/len(o1tab)
print(str(myperc)+' of slopes deviate from 1 by > 0.1')

In [None]:
# Do the same as above but with york regression, plot histogram of slopes
#
# For every ploidy x environment x focal locus: pair each genotype lacking the focal
# mutation with the genotype that differs from it only at that locus, regress the
# predicted fitness of the mutant (y) on the predicted fitness of its partner (x)
# with yorkreg_nocorr, and collect one summary row per regression in `o1tab`.
#
# Reads the notebook globals `ploidies`, `envts`, `floci` and `yorkreg_nocorr`
# (all defined in earlier cells).
gowith = ['lasso_v2',11]
#THIS TAKES A WHILE TO RUN TOO

o = 10
yorkn = 100   # last argument handed to yorkreg_nocorr (see its definition for the meaning)
mysize = 15
myalpha = 0.5

# One dict per regression; turned into a DataFrame once at the end rather than
# growing the table cell-by-cell with .at.
o1_rows = []

# To debug on a subset, restrict the ranges below (e.g. np.arange(0,1)).
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):

        prefix = ploidies[p]+'_'+envts[e]
        predcol = prefix+'_s-pred_'+str(o)
        # Name given to the coefficient column of data0. The original cell created this
        # column with str(gowith[1]-1) but read it back with str(o); that only worked
        # because both are 10. Using one variable removes the hidden coupling.
        coefcol = prefix+'_s-pred_'+str(gowith[1]-1)

        # Get the data
        data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(o)+'.txt',
                            sep='\t',names=['genotype',predcol,prefix+'_s-obs',prefix+'_s-obs-err'],
                            skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
        # keep only the rows above the first row whose genotype is missing.
        # .copy() so the column assignments below act on an independent frame.
        first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
        data1 = data1.loc[:first_blank-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the predicted s, remove the obs columns
        data1 = data1.drop(columns=[prefix+'_s-obs',prefix+'_s-obs-err']).drop_duplicates('genotype').reset_index(drop=True)

        # Create a column for each locus, and for the genotype with that locus removed
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l]
        for l in np.arange(len(floci)):
            data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

        # So we can plot in different ways, import the background-averaged fitness effects of each mutation.
        # Here the rows BELOW the first missing-genotype row are kept.
        data0 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(gowith[1]-1)+'.txt',
                            sep='\t',names=['todelete','genotype',coefcol,'na'],
                            skiprows=2,skip_blank_lines=False)
        first_blank0 = data0.loc[(data0['genotype'].isnull())].index.tolist()[0]
        data0 = data0.loc[first_blank0+1:,:].copy()
        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)
        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for l in np.arange(len(floci)):
            data0[floci[l]] = data0.loc[:,'genotype'].str[l].astype(int)

        # Number of loci set to 1 in each coefficient's genotype string.
        data0['numMut'] = data0[floci].sum(axis=1)

        # Divvy up the data by focal locus
        for l in np.arange(len(floci)):

            paircols = ['genotype',predcol,'without_'+floci[l]]
            tab0 = data1.loc[(data1[floci[l]] == '0'),paircols].copy(deep=True).reset_index(drop=True)
            tab1 = data1.loc[(data1[floci[l]] == '1'),paircols].copy(deep=True).reset_index(drop=True)

            # _x columns: focal locus == '0'; _y columns: focal locus == '1'.
            temp = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

            for i in np.arange(len(floci)):
                temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

            temp['∆s(y-x)'] = temp[predcol+'_y'] - temp[predcol+'_x']

            # York regressions with all x/y errors set to 0.
            zeros = [0]*len(temp)
            res = yorkreg_nocorr(temp[predcol+'_x'],temp[predcol+'_y'],zeros,zeros,yorkn)
            resdo = yorkreg_nocorr(temp[predcol+'_x'],temp['∆s(y-x)'],zeros,zeros,yorkn)
            resdr = yorkreg_nocorr(temp[predcol+'_y'],-1*temp['∆s(y-x)'],zeros,zeros,yorkn)

            has_locus = (data0[floci[l]] == 1)
            o1_rows.append({
                'ploidy': ploidies[p],
                'envt': envts[e],
                'locus': floci[l],
                'slope': res[0],
                'intercept': res[1],
                # res[2] is square-rooted here; presumably it is the slope variance
                # (confirm against yorkreg_nocorr).
                'stderr(slope)': np.sqrt(res[2]),
                # Coefficient of the term that contains only the focal locus.
                'additive_s': data0.loc[has_locus&(data0['numMut'] == 1),coefcol].values[0],
                # Summed magnitude of all multi-locus terms that contain the focal locus.
                'epmagsum': abs(data0.loc[has_locus&(data0['numMut'] > 1),coefcol]).sum(),
                'delta_s-slope_original': resdo[0],
                'delta_s-slope_reversion': resdr[0],
            })

o1tab = pd.DataFrame(o1_rows)
myindo1 = len(o1tab)

# "abs" version of the slope: slopes with magnitude > 1 are replaced by their reciprocal
# (so every value has magnitude <= 1), and the standard error is rescaled by 1/slope^2
# to match.
if len(o1tab) > 0:
    steep = o1tab['slope'].abs() > 1
    o1tab['abs_slope'] = o1tab['slope'].where(~steep, 1/o1tab['slope'])
    o1tab['abs_stderr(slope)'] = o1tab['stderr(slope)'].where(
        ~steep, (o1tab['stderr(slope)']/(o1tab['slope']**2)).abs())

In [None]:
# Real Figure 3B I think 2021.03.18
#
# Exploratory and manuscript plots of the York-regression slopes held in o1tab.
# 'abs_slope' is the version of the slope whose magnitude is <= 1 (slopes steeper than 1
# are stored as their reciprocal), 'slope' is the raw regression slope.

# Shared style for the thin grey reference lines.
guide = dict(color='xkcd:grey', zorder=0, lw=0.5)

plt.scatter(abs(o1tab['additive_s']),o1tab['abs_slope'],alpha=0.5)
plt.ylim(-0.25,1.02)
plt.axhline(y=1,**guide)
plt.axhline(y=0,**guide)
plt.axvline(x=0,**guide)
plt.xlabel('|s|')
plt.ylabel('< 1 version of slope')
plt.show()

# updated version of the above: scatter on the left, marginal histogram of slopes on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(2,2), gridspec_kw={'width_ratios': [2,1]},constrained_layout=True,sharey=True)
axes[0].scatter(abs(o1tab['additive_s']),o1tab['abs_slope'],alpha=0.5,s=5)
axes[0].set_ylim(-0.25,1.04)
axes[0].axhline(y=1,**guide)
axes[0].axhline(y=0,**guide)
axes[0].axvline(x=0,**guide)
axes[0].set_xlabel('|additive effect|')
axes[0].set_ylabel('regression slope')

axes[1].hist(o1tab['abs_slope'],orientation='horizontal',bins=50)
axes[1].set_xlabel('count')
# Only the left spine of the histogram panel stays visible.
for side in ("right", "top", "bottom"):
    axes[1].spines[side].set_visible(False)

fig.savefig('msfigs/Fig3/slopesAdds.pdf',bbox_inches='tight',dpi=1000)

plt.show()


# Distribution of raw slopes: full range first, then zoomed to (-2.5, 2.5).
# (The original cell also called o1tab.sort_values(by='slope') here and discarded the
# result; that no-op has been removed.)
plt.hist(o1tab['slope'],bins=100)
plt.xlabel('slope')
plt.show()
print('mean slope = '+str(o1tab['slope'].mean()))
print('median slope = '+str(o1tab['slope'].median()))
plt.hist(o1tab.loc[(o1tab['slope'] > -2.5)&(o1tab['slope'] < 2.5)]['slope'],bins=30)
plt.xlabel('slope')
plt.show()

# Raw slope against signed, then absolute, additive effect.
plt.scatter(o1tab['additive_s'],o1tab['slope'])
plt.ylim(0,2.5)
plt.axhline(y=1,**guide)
plt.axvline(x=0,**guide)
plt.xlabel('s')
plt.ylabel('slope')
plt.show()

plt.scatter(abs(o1tab['additive_s']),o1tab['slope'])
plt.ylim(0,2.5)
plt.axhline(y=1,**guide)
plt.axvline(x=0,**guide)
plt.xlabel('|s|')
plt.ylabel('slope')
plt.show()


# How many slopes fall outside the band [1/myu, myu] around 1?
myu = 1.05
myl = 1/myu
myumoreprop = len(o1tab.loc[(o1tab['slope']>myu)])/len(o1tab)
myllessprop = len(o1tab.loc[(o1tab['slope']<myl)])/len(o1tab)
print('proportion of slopes above '+str(myu)+' = '+str(myumoreprop))
print('proportion of slopes below '+str(myl)+' = '+str(myllessprop))
print('sum of these is '+str(myumoreprop+myllessprop))

# Proportion of fits inside the band whose additive effect exceeds 0.005 in magnitude.
# The original cell computed this mid-cell without storing or displaying it; print it
# so the number is actually visible.
myinbandprop = len(o1tab.loc[(abs(o1tab['additive_s'])>0.005)&(o1tab['slope']<=myu)&(o1tab['slope']>=myl)])/len(o1tab)
print('proportion of slopes within ['+str(myl)+', '+str(myu)+'] with |additive effect| > 0.005 = '+str(myinbandprop))


# Slopes against the summed magnitude of epistatic terms involving the focal locus.
plt.scatter(o1tab['epmagsum'],o1tab['abs_slope'],alpha=0.6)
plt.xlabel('sum of magnitude of epistatic terms')
# Label fixed: abs_slope is the <= 1 version of the slope (the original said '> 1').
plt.ylabel('< 1 version of slope')
plt.show()

plt.scatter(o1tab['epmagsum'],o1tab['delta_s-slope_original'],alpha=0.6)
plt.xlabel('sum of magnitude of epistatic terms')
plt.ylabel('∆s_orig slope')
plt.show()

plt.scatter(o1tab['epmagsum'],o1tab['delta_s-slope_reversion'],alpha=0.6)
plt.xlabel('sum of magnitude of epistatic terms')
plt.ylabel('∆s_rev slope')
plt.show()

plt.scatter(o1tab['delta_s-slope_original'],o1tab['delta_s-slope_reversion'],alpha=0.6)
plt.xlabel('∆s_orig slope')
plt.ylabel('∆s_rev slope')
plt.show()

# Raw slope against the two ∆s regression slopes, zoomed to slopes near 1.
# (The original set plt.xlim(-0.5,2.5) and immediately overrode it with the limits below.)
plt.scatter(o1tab['slope'],o1tab['delta_s-slope_original'],alpha=0.6)
plt.xlabel('abc slope')
plt.ylabel('∆s_orig slope')
plt.axhline(y=0,**guide)
plt.axvline(x=1,**guide)
plt.xlim(0.7,1.3)
plt.show()

plt.scatter(o1tab['slope'],o1tab['delta_s-slope_reversion'],alpha=0.6)
plt.xlabel('abc slope')
plt.ylabel('∆s_rev slope')
plt.axhline(y=0,**guide)
plt.axvline(x=1,**guide)
plt.xlim(0.7,1.3)
plt.show()

In [None]:
# Instead, want 3B to be with observed data, slopes (but still additive effects)
# So go through and create a version of o1tab with these values in them
#
# For every ploidy x environment x focal locus: pair each genotype lacking the focal
# mutation with the genotype that differs only at that locus, York-regress the observed
# fitness of the mutant (s_y) on that of its partner (s_x) using the observed standard
# errors, and collect one row per regression in `obstab`.
#
# Reads the notebook globals `ploidies`, `envts`, `floci` and `yorkreg_nocorr`
# (all defined in earlier cells).

gowith = ['lasso_v2',11]
#THIS TAKES A WHILE TO RUN TOO

o = 10
yorkn = 100   # last argument handed to yorkreg_nocorr (see its definition for the meaning)
mysize = 15
myalpha = 0.5

# Extra variance term added when replicate rows of one genotype are merged.
# It is fixed at 0, i.e. the spread between replicates is deliberately ignored
# (the original had statistics.variance(...) commented out here).
my_svar = 0

# One dict per regression; turned into a DataFrame once at the end rather than
# growing the table cell-by-cell with .at.
obs_rows = []

# To debug on a subset, restrict the ranges below (e.g. np.arange(0,1)).
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):

        prefix = ploidies[p]+'_'+envts[e]
        predcol = prefix+'_s-pred_'+str(o)

        # Get the data
        data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(o)+'.txt',
                            sep='\t',names=['genotype',predcol,'s','stderr(s)'],
                            skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
        # keep only the rows above the first row whose genotype is missing.
        # .copy() so the column assignments below act on an independent frame.
        first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
        data1 = data1.loc[:first_blank-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the observed s, remove the pred columns
        data1 = data1.drop(columns=[predcol]).reset_index(drop=True)

        # Average genotypes that appear more than once, to get rid of weird artifacts from
        # having the same genotype represented multiple times.
        #   s         -> mean of the replicate s values
        #   stderr(s) -> sqrt(my_svar + mean(stderr(s)^2))
        # For a genotype that appears once this leaves s unchanged and stderr(s) equal to
        # its magnitude. sort=False keeps genotypes in order of first appearance, as the
        # original per-genotype loop did.
        glist = list(OrderedDict.fromkeys(data1['genotype']))
        mean_s = data1.groupby('genotype',sort=False)['s'].mean()
        mean_sq_err = (data1['stderr(s)']**2).groupby(data1['genotype'],sort=False).mean()
        data2 = pd.DataFrame({'s': mean_s, 'stderr(s)': np.sqrt(my_svar+mean_sq_err)}).reset_index()

        data1 = data2

        # Create a column for each locus, and for the genotype with that locus removed
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l]
        for l in np.arange(len(floci)):
            data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

        # So we can plot in different ways, import the background-averaged fitness effects of each mutation.
        # Here the rows BELOW the first missing-genotype row are kept.
        data0 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+prefix+'_'+str(gowith[1]-1)+'.txt',
                            sep='\t',names=['todelete','genotype','coeff','na'],
                            skiprows=2,skip_blank_lines=False)
        first_blank0 = data0.loc[(data0['genotype'].isnull())].index.tolist()[0]
        data0 = data0.loc[first_blank0+1:,:].copy()
        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)
        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for l in np.arange(len(floci)):
            data0[floci[l]] = data0.loc[:,'genotype'].str[l].astype(int)

        # Number of loci set to 1 in each coefficient's genotype string.
        data0['numMut'] = data0[floci].sum(axis=1)

        # Just additive effects!
        data0 = data0.loc[(data0['numMut'] == 1)].sort_values(by='genotype',ascending=False).reset_index(drop=True)

        # Divvy up the data by focal locus
        for l in np.arange(len(floci)):

            paircols = ['genotype','s','stderr(s)','without_'+floci[l]]
            tab0 = data1.loc[(data1[floci[l]] == '0'),paircols].copy(deep=True).reset_index(drop=True)
            tab1 = data1.loc[(data1[floci[l]] == '1'),paircols].copy(deep=True).reset_index(drop=True)

            # _x columns: focal locus == '0'; _y columns: focal locus == '1'.
            temp = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

            for i in np.arange(len(floci)):
                temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

            # York regression of s_y on s_x, weighted by the observed standard errors.
            res = yorkreg_nocorr(temp['s_x'],temp['s_y'],temp['stderr(s)_x'],temp['stderr(s)_y'],yorkn)

            obs_rows.append({
                'ploidy': ploidies[p],
                'envt': envts[e],
                'locus': floci[l],
                'slope': res[0],
                'intercept': res[1],
                # res[2] is square-rooted here; presumably it is the slope variance
                # (confirm against yorkreg_nocorr).
                'stderr(slope)': np.sqrt(res[2]),
                # Coefficient of the term that contains only the focal locus.
                'additive_s': data0.loc[(data0[floci[l]] == 1)&(data0['numMut'] == 1),'coeff'].values[0],
            })

obstab = pd.DataFrame(obs_rows)
myindo1 = len(obstab)

# "abs" version of the slope: slopes with magnitude > 1 are replaced by their reciprocal
# (so every value has magnitude <= 1), and the standard error is rescaled by 1/slope^2
# to match.
if len(obstab) > 0:
    steep = obstab['slope'].abs() > 1
    obstab['abs_slope'] = obstab['slope'].where(~steep, 1/obstab['slope'])
    obstab['abs_stderr(slope)'] = obstab['stderr(slope)'].where(
        ~steep, (obstab['stderr(slope)']/(obstab['slope']**2)).abs())

In [None]:
# flip the x axis
# Same as immediately above except rotated 90°
# Do updated version of fig 3B with OBSERVED data
#
# Two stacked panels sharing one (reversed) x axis: a histogram of the observed
# regression slopes on top, and |additive effect| against slope underneath, with the
# standard error of each slope drawn as a horizontal error bar.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(1.38,.9),
                         gridspec_kw={'height_ratios': [1.2,2]}, sharex=True)
fig.subplots_adjust(hspace=0.25)
hist_ax = axes[0]
scatter_ax = axes[1]

# Line styles reused on both panels.
grey_guide = dict(color='xkcd:grey', zorder=0, lw=0.5)
orange_guide = dict(color='xkcd:orange', zorder=0, lw=0.5, linestyle = 'dashed')

obs_slope = obstab['abs_slope']
obs_slope_err = obstab['abs_stderr(slope)']
obs_abs_additive = abs(obstab['additive_s'])

# Lower panel first (same call order as before, since the panels share their x axis).
scatter_ax.errorbar(obs_slope, obs_abs_additive,
                    xerr = obs_slope_err, yerr = None,
                    alpha=0.5, elinewidth=0.5, linestyle='None', marker='.', ms=3)
scatter_ax.set_xlim(1.04,-0.25)   # left > right, so the x axis runs backwards
scatter_ax.set_ylim(-0.005,0.11)
for slope_ref in (1, 0):
    scatter_ax.axvline(x=slope_ref, **grey_guide)
scatter_ax.axvline(x=0.9, **orange_guide)
scatter_ax.axhline(y=0, **grey_guide)
wrapped_ylabel = '\n'.join(wrap('|additive effect|',9))
scatter_ax.set_ylabel(wrapped_ylabel)
scatter_ax.set_xlabel('regression slope, $b$', labelpad=3)
scatter_ax.set_yticks([0,0.1])
scatter_ax.tick_params(axis='x', pad=1.5)

# Upper panel: slope histogram with the same dashed marker at 0.9; only the bottom
# spine is left visible.
hist_ax.hist(obs_slope, bins=50)
hist_ax.set_ylabel('count')
hist_ax.axvline(x=0.9, **orange_guide)
for side in ("right", "top", "left"):
    hist_ax.spines[side].set_visible(False)

fig.align_ylabels()

fig.savefig('msfigs/Fig3/slopesAdds_errorbar_v2_flip_flip.pdf', bbox_inches='tight', dpi=1000)

plt.show()

In [None]:
# Want to do ABC vs aBC plots now where, instead of subtracting terms by rank,
# we add them by rank.
#
# For every ploidy x environment x focal locus:
#   1. start from predictions in which every epistatic term involving the focal locus is 0;
#   2. add those terms back one at a time, largest |coefficient| first (at most `ncyc`);
#   3. after each addition, pair genotypes that differ only at the focal locus and log the
#      York-regression slopes/intercepts of y vs x, (y-x) vs x and (x-y) vs y;
#   4. stop early once all three slopes have changed by <= `diffthresh` relative to each of
#      the three preceding steps.
# Results for all loci are stacked in `megaadder`.
#
# NOTE: the original row-by-row version of this cell was "gonna take ages (about 3.5h)".
# That remark was bare text in the cell (a SyntaxError); it is now a comment. The per-term
# Python loops have been replaced by array arithmetic, so the run time should be far lower.
#
# Reads the notebook globals `ploidies`, `envts`, `floci` and `yorkreg_nocorr`
# (all defined in earlier cells).


def _predict_from_terms(terms, genotable, loci, termcol, intercept):
    """Predict fitness for every row of `genotable` from the coefficient table `terms`.

    Parameters
    ----------
    terms : DataFrame with one row per model term: a 0/1 column per locus saying whether
        the locus takes part in the term, a 'baseline' column, and the coefficient in
        `termcol`. Rows whose coefficient is exactly 0 are ignored.
    genotable : DataFrame with one row per genotype and one numeric column per locus
        (the caller codes these as -1 / 1).
    loci : list of locus column names shared by both tables.
    termcol : name of the coefficient column in `terms`.
    intercept : value added to every prediction.

    Returns
    -------
    1-D numpy array, one prediction per row of `genotable`:
        intercept + sum over terms of (prod - baseline) * coefficient
    where `prod` is the product of the genotype's locus values over the loci in the term
    (1 for a term with no loci). NaN contributions are skipped, matching the pandas
    `.sum()` the original code used.
    """
    loci = list(loci)
    active = terms.loc[terms[termcol] != 0]
    in_term = active[loci].to_numpy(dtype=float)    # (n_terms, n_loci)
    geno = genotable[loci].to_numpy(dtype=float)    # (n_genotypes, n_loci)

    # Build the per-(genotype, term) product one locus at a time. A zero factor means
    # the locus is not part of the term and is skipped (treated as 1), exactly like the
    # "remove zeros, then np.prod" step of the original loop.
    signprod = np.ones((geno.shape[0], in_term.shape[0]))
    for k in range(len(loci)):
        factor = np.outer(geno[:, k], in_term[:, k])
        signprod *= np.where(factor != 0, factor, 1)

    # subtract out baseline, multiply by the term's value, sum over terms
    contrib = (signprod - active['baseline'].to_numpy(dtype=float)) * active[termcol].to_numpy(dtype=float)
    return np.nansum(contrib, axis=1) + intercept


def _pair_by_focal_locus(genotable, loci, locus, valuecol):
    """Pair each genotype with focal `locus` == -1 (_x) with its partner with `locus` == 1 (_y).

    Partners share the same 'without_<locus>' string. The merged table also gets one
    integer column per locus taken from the characters of 'genotype_x'.
    """
    cols = ['genotype', valuecol, 'without_'+locus]
    tab0 = genotable.loc[(genotable[locus] == -1), cols].copy(deep=True).reset_index(drop=True)
    tab1 = genotable.loc[(genotable[locus] == 1), cols].copy(deep=True).reset_index(drop=True)
    paired = pd.merge(tab0, tab1, how='inner', on='without_'+locus)
    for i in np.arange(len(loci)):
        paired[loci[i]] = paired.loc[:,'genotype_x'].str[i].astype(int)
    return paired


def _log_regressions(table, row, my_x, my_y, yorkn):
    """Run the three York regressions (all errors set to 0) and store them in `table` at `row`."""
    zx = [0]*len(my_x)
    zy = [0]*len(my_y)
    myreg = yorkreg_nocorr(my_x, my_y, zx, zy, yorkn)
    table.at[row,'abc-original_slope'] = myreg[0]
    table.at[row,'abc-original_intercept'] = myreg[1]
    myreg = yorkreg_nocorr(my_x, my_y-my_x, zx, zy, yorkn)
    table.at[row,'deltas-original_slope'] = myreg[0]
    table.at[row,'deltas-original_intercept'] = myreg[1]
    myreg = yorkreg_nocorr(my_y, my_x-my_y, zx, zy, yorkn)
    table.at[row,'deltas-reversion_slope'] = myreg[0]
    table.at[row,'deltas-reversion_intercept'] = myreg[1]


#time the code
mystart = time.perf_counter()

o=10
ncyc = 30           # maximum number of epistatic terms added back per focal locus
ncycthresh = 2      # first cycle index at which the convergence test is applied
yorkn = 100         # last argument handed to yorkreg_nocorr
diffthresh = 0.01   # maximum slope change that still counts as "settled"
slopecols = ['abc-original_slope','deltas-original_slope','deltas-reversion_slope']

# Per-locus result tables, concatenated into `megaadder` at the end
# (DataFrame.append was removed in pandas 2.0).
locusadders = []

# To debug on a subset, restrict the ranges below (e.g. np.arange(0,1)).
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):

        prefix = ploidies[p]+'_'+envts[e]
        termcol = prefix+'_term'
        alexcol = prefix+'_Alex prediction'
        fname = 'CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+prefix+'_'+str(o)+'.txt'

        print(prefix)

        # First, import a list of genotypes to predict
        data1 = pd.read_csv(fname,sep='\t',names=['genotype',alexcol,prefix+'_s-obs',prefix+'_s-obs-err'],
                            skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
        # keep only the rows above the first row whose genotype is missing.
        first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
        data1 = data1.loc[:first_blank-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the predicted s, remove the obs columns
        data1 = data1.drop(columns=[prefix+'_s-obs',prefix+'_s-obs-err']).drop_duplicates('genotype').reset_index(drop=True)

        # Create a column for each locus (recoded 0 -> -1), and for the genotype with that locus removed
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l].astype(int)
            data1.loc[(data1[floci[l]] == 0),floci[l]] = -1
            data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

        # The prediction for the all-zero genotype is used as the intercept of every
        # rebuilt prediction; it does not change inside the loops, so look it up once.
        myintercept = data1.loc[(data1['genotype'] == '0000000000'),alexcol].values[0]

        # Now import a set of coefficients: same file, rows BELOW the first missing-genotype row.
        data0 = pd.read_csv(fname,sep='\t',names=['todelete','genotype',termcol,'na'],
                            skiprows=2,skip_blank_lines=False)
        first_blank0 = data0.loc[(data0['genotype'].isnull())].index.tolist()[0]
        data0 = data0.loc[first_blank0+1:,:].copy()
        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)
        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for l in np.arange(len(floci)):
            data0[floci[l]] = data0.loc[:,'genotype'].str[l].astype(int)

        # Number of loci taking part in each term.
        data0['numMut'] = data0[floci].sum(axis=1)

        # Add a "baseline" column for whether these terms are added or subtracted in the WT:
        # -1 for terms with an odd number of loci, +1 for an even number.
        data0['baseline'] = np.where(data0['numMut'] % 2 == 1, -1.0, 1.0)

        for l in np.arange(len(floci)):

            # Want to start with "no-epistasis" predictions in data1
            # builder is where we build up coefficients for fitness predictions
            # Start by setting all epistatic terms INVOLVING THE FOCAL LOCUS to 0
            focal_epistatic = (data0['numMut'] > 1)&(data0[floci[l]] == 1)
            builder = data0.copy(deep=True)
            builder.loc[focal_epistatic,termcol] = 0

            data1['noEp_pred'] = _predict_from_terms(builder,data1,floci,termcol,myintercept)

            locusadder = pd.DataFrame()
            locusadder.at[0,'num term add'] = 0
            locusadder.at[0,'term added'] = np.nan
            locusadder.at[0,'term order'] = np.nan
            locusadder.at[0,'coefficient'] = np.nan

            templ = _pair_by_focal_locus(data1,floci,floci[l],'noEp_pred')
            my_x = templ['noEp_pred_x']
            my_y = templ['noEp_pred_y']
            _log_regressions(locusadder,0,my_x,my_y,yorkn)

            # Begin cycles: queue of the focal locus's epistatic terms, strongest first.
            data0adder = data0.loc[focal_epistatic].copy(deep=True)
            data0adder['abs coefficient'] = abs(data0adder[termcol])
            data0adder = data0adder.sort_values(by='abs coefficient',ascending=False)

            for n in np.arange(ncyc):
                # Nothing left to add for this locus (fewer than ncyc terms available).
                if len(data0adder) == 0:
                    break

                # add the strongest epistatic coefficient that involves locus l
                topind = data0adder.index[0]
                builder.at[topind,termcol] = data0adder.loc[topind,termcol]

                # log coefficient info in locusadder
                locusadder.at[n+1,'num term add'] = n+1
                locusadder.at[n+1,'term added'] = data0adder.loc[topind,'genotype']
                locusadder.at[n+1,'term order'] = data0adder.loc[topind,'numMut']
                locusadder.at[n+1,'coefficient'] = data0adder.loc[topind,termcol]

                # Estimate the new predicted value for each genotype
                data1['new_pred'] = _predict_from_terms(builder,data1,floci,termcol,myintercept)

                # For the new predictions, get the values I want
                templ = _pair_by_focal_locus(data1,floci,floci[l],'new_pred')
                my_x = templ['new_pred_x']
                my_y = templ['new_pred_y']
                _log_regressions(locusadder,n+1,my_x,my_y,yorkn)

                # Remove strongest epistatic coefficient from data0adder
                data0adder = data0adder.iloc[1:]

                # Converged when each of the three slopes differs from its value 1, 2 and 3
                # steps ago by no more than diffthresh.
                if n >= ncycthresh:
                    settled = all(
                        abs(locusadder.loc[n+1,col] - locusadder.loc[n+1-lag,col]) <= diffthresh
                        for col in slopecols for lag in (1,2,3)
                    )
                    if settled:
                        break

            # Normalise the logged term labels to 10-character strings (row 0 has no term).
            locusadder.loc[1:,'term added'] = locusadder.loc[1:,'term added'].astype(int).astype(str).str.zfill(10)
            locusadder.insert(0,'main locus', floci[l])
            locusadder.insert(0,'envt',envts[e])
            locusadder.insert(0,'ploidy',ploidies[p])

            locusadders.append(locusadder)
            print(floci[l])

# Stack the per-locus tables; each keeps its own 0..k row index, as with the old append.
megaadder = pd.concat(locusadders) if locusadders else pd.DataFrame()

mystop = time.perf_counter()
elapsed = mystop-mystart
print(str(elapsed)+' seconds to run')

#export_csv = megaadder.to_csv(r'20210324_megaadder_haphom_withdeltas.csv',index=True,header=True)

In [None]:
# Find the full-data prediction slopes.
#
# For every ploidy x environment x focal locus: pair each genotype that lacks the
# focal mutation with the genotype that carries it (identical at every other locus),
# regress predicted fitness with the mutation (y) on predicted fitness without it (x),
# and record the slope in `o1tab` (one row per ploidy / envt / main locus).
#
# Relies on `ploidies`, `envts`, `floci` and `yorkreg_nocorr` from earlier cells.
# Loop variables deliberately avoid the name `l`: the import cell binds `l` to
# `numpy.linalg`, and a `for l in ...` loop would overwrite that alias.
gowith = ['lasso_v2',11]  # [model-file prefix, N]; the effects table is read from the file with suffix N-1
o = 10                    # numeric suffix of the file holding the per-genotype predictions
yorkn = 100               # forwarded to yorkreg_nocorr (presumably its iteration count -- TODO confirm)

partition_dir = 'CRISPR_10xmer_BFA_data/7_var_partition/'
o1tab_columns = ['ploidy','envt','main locus','fullmodel_slope','fullmodel_slope_-1to1']
slope_rows = []  # one dict per (ploidy, envt, locus); turned into o1tab in a single step at the end

for ploidy in ploidies:
    for envt in envts:
        tag = ploidy+'_'+envt
        pred_col = tag+'_s-pred_'+str(o)
        obs_cols = [tag+'_s-obs',tag+'_s-obs-err']

        # Get the data
        data1 = pd.read_csv(partition_dir+gowith[0]+'_fa_'+tag+'_'+str(o)+'.txt',
                            sep='\t',names=['genotype',pred_col]+obs_cols,skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
        # keep only the rows above the first blank line.
        first_blank = data1.loc[data1['genotype'].isnull()].index[0]
        data1 = data1.loc[:first_blank-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the predicted s, remove the obs columns (and repeated genotypes)
        data1 = data1.drop(columns=obs_cols).drop_duplicates('genotype').reset_index(drop=True)

        # Create a column for each locus (allele as '0'/'1'), and for the genotype with that locus removed
        for pos, locus in enumerate(floci):
            data1[locus] = data1['genotype'].str[pos]
        for pos, locus in enumerate(floci):
            data1['without_'+locus] = data1['genotype'].str[:pos] + data1['genotype'].str[pos+1:]

        # So we can plot in different ways, import the background-averaged fitness effects of each mutation
        # (the section of the file below the first blank line).
        # NOTE(review): nothing further down this cell reads data0; it is still loaded so the
        # cell leaves the same `data0` global behind. Drop it if no later cell depends on that.
        effect_col = tag+'_s-pred_'+str(gowith[1]-1)
        data0 = pd.read_csv(partition_dir+gowith[0]+'_fa_'+tag+'_'+str(gowith[1]-1)+'.txt',
                            sep='\t',names=['todelete','genotype',effect_col,'na'],skiprows=2,skip_blank_lines=False)
        first_blank0 = data0.loc[data0['genotype'].isnull()].index[0]
        data0 = data0.loc[first_blank0+1:,:].copy()
        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)
        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)
        for pos, locus in enumerate(floci):
            data0[locus] = data0['genotype'].str[pos].astype(int)
        data0['numMut'] = data0[floci].sum(axis=1)

        # Divvy up the data by focal locus
        for locus in floci:
            pair_key = 'without_'+locus
            keep_cols = ['genotype',pred_col,pair_key]
            tab0 = data1.loc[data1[locus] == '0',keep_cols].reset_index(drop=True)
            tab1 = data1.loc[data1[locus] == '1',keep_cols].reset_index(drop=True)

            # _x columns: focal locus absent; _y columns: focal locus present
            temp = pd.merge(tab0,tab1,how='inner',on=pair_key)

            # Perform a linear regression (zero measurement error on both axes); res[0] is stored as the slope
            res = yorkreg_nocorr(temp[pred_col+'_x'],temp[pred_col+'_y'],[0]*len(temp),[0]*len(temp),yorkn)
            slope = res[0]

            slope_rows.append({
                'ploidy': ploidy,
                'envt': envt,
                'main locus': locus,
                'fullmodel_slope': slope,
                # reciprocal when |slope| > 1, so the stored value always lies in [-1, 1]
                'fullmodel_slope_-1to1': 1/slope if abs(slope) > 1 else slope,
            })

o1tab = pd.DataFrame(slope_rows,columns=o1tab_columns)

In [None]:
# fullmodel_slope_-1to1
# Do the same as above but use the fullmodel_slope_-1to1 instead of the convergence slope.
# Needs `o1tab` (full-model slopes) plus `ploidies`, `envts`, `floci` from earlier cells.
# Loop variables avoid the name `l`, which the import cell binds to `numpy.linalg`.

# Analyze the megaadder
ma = pd.read_csv('20210324_megaadder_haphom_withdeltas.csv')
ma = ma.drop(columns=['Unnamed: 0'])

#ma = megaadder.copy(deep=True).reset_index(drop=True)
diffthresh = 0.01  # largest |slope change| that still counts as "converged"
yorkn = 100

# Create a modified slope column that takes the reciprocal if abs(abc slope) > 1
raw_slope = ma['abc-original_slope']
ma['abc-original_slope_-1to1'] = np.where(raw_slope.abs() > 1,1/raw_slope,raw_slope)

# Now create a column that says whether a given locus is a turning point.
# A row is the turning point when each of the next three rows has a slope within
# diffthresh of the slope three rows further on (first such row wins).
# Create a second version of dataframe, ma2b, which doesn't contain any rows beyond the turning point.
ma2_parts = []
ma2b_parts = []
for ploidy in ploidies:
    for envt in envts:
        for locus in floci:
            temp = ma.loc[(ma['ploidy'] == ploidy)&(ma['envt'] == envt)&(ma['main locus'] == locus)].copy(deep=True).reset_index(drop=True)
            slopes = temp['abc-original_slope_-1to1'].values

            turn = None
            for i in range(3,len(temp)):
                if all(abs(slopes[i] - slopes[i-back]) <= diffthresh for back in (1,2,3)):
                    turn = i-3
                    break

            if turn is not None:
                temp.at[turn,'d(ds/ds) < diffthresh'] = 'y'
                temp.at[turn,'numterms to d(ds/ds) < diffthresh'] = turn
                temp.at[turn,'sumtermscoeff to d(ds/ds) < diffthresh'] = temp.loc[0:turn,'coefficient'].sum()

            ma2_parts.append(temp)
            # Previously this sliced with the leftover loop index (temp.loc[0:i-3]), which
            # dropped the last three rows of a group that never converged and used a stale
            # index for an empty group. With no turning point there is nothing "beyond" it,
            # so the whole group is kept.
            ma2b_parts.append(temp if turn is None else temp.loc[0:turn])

# Single concat instead of DataFrame.append inside the loop
ma2 = pd.concat(ma2_parts,sort=False) if ma2_parts else ma.iloc[0:0]
ma2b = pd.concat(ma2b_parts,sort=False) if ma2b_parts else ma.iloc[0:0]

# Condense ma2 to the most relevant rows
ma3 = ma2.loc[ma2['d(ds/ds) < diffthresh'] == 'y'].reset_index(drop=True)
ma3 = pd.merge(ma3,o1tab,how='left',on=['ploidy','envt','main locus'])

# create a column comparing the d(ds/ds) < diffthresh slope to the full model slope
ma3['abs(diff): abc-original_full vs final slope'] = abs(ma3['fullmodel_slope_-1to1'] - ma3['abc-original_slope_-1to1'])

#look at the differences
#ma3.sort_values(by='abs(diff): full vs final slope',ascending=False)

# remove extraneous columns
ma3 = ma3.drop(columns=['num term add','term added','term order','coefficient','abc-original_intercept'])

sthresh = 0.9
mybins = list(np.arange(0,13))

full_b = 'fullmodel_slope_-1to1'
nterms = 'numterms to d(ds/ds) < diffthresh'

# Split rows by full-model slope: <= sthresh (orange) vs > sthresh (cerulean)
ma3t = ma3.loc[ma3[full_b] <= sthresh].copy(deep=True).reset_index(drop=True)
ma3supert = ma3.loc[ma3[full_b] > sthresh].copy(deep=True).reset_index(drop=True)

# DEFINE FIGURE
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(1.27,1.15*1.2), gridspec_kw={'width_ratios': [2.9,1]},sharey=True)
fig.subplots_adjust(wspace=0.3)

# Left panel: one scatter call with a per-point colour list (same row order as the data)
point_colors = np.where(ma3[full_b] <= sthresh,'xkcd:orange','xkcd:cerulean').tolist()
axes[0].scatter(ma3[full_b],ma3[nterms],c=point_colors,alpha=0.5,s=3)
axes[0].axvline(x=0,color='xkcd:grey',zorder=0,lw=0.5)
axes[0].axvline(x=1,color='xkcd:grey',zorder=0,lw=0.5)
axes[0].set_xlabel('\n'.join(wrap('full model regression slope, $b$',11)))
axes[0].set_ylabel('\n'.join(wrap('# terms sufficient to reach global $b$',19)),labelpad=0.2)
axes[0].set_yticks([0,2,4,6,8,10,12])

# Right panel: horizontal histograms per group, with a line at each group mean
axes[1].hist(ma3t[nterms],bins=mybins,orientation='horizontal',color='xkcd:orange',alpha=0.7)
axes[1].hist(ma3supert[nterms],bins=mybins,orientation='horizontal',color='xkcd:cerulean',zorder=0,alpha=0.7)
axes[1].set_xlabel('count')
axes[1].axhline(y=ma3t[nterms].mean(),color='xkcd:dark orange',lw=1)
axes[1].axhline(y=ma3supert[nterms].mean(),color='xkcd:blue',lw=1)
for side in ('right','top','bottom'):
    axes[1].spines[side].set_visible(False)

dot1 = Line2D([], [], color='xkcd:orange', marker='.', linestyle='None',
                          markersize=5, label='$b$ ≤ 0.9')
dot2 = Line2D([], [], color='xkcd:cerulean', marker='.', linestyle='None',
                          markersize=5, label='0.9 < $b$ ≤ 1')

fig.legend(handles=[dot1,dot2],handletextpad=-0.4,borderpad=0.2,labelspacing=0.1,loc='upper left',bbox_to_anchor=(0.33,1.44),frameon=False)

fig.savefig('msfigs/Fig3/slopesNumterms_moreinhistbutnottoomuch_v4.pdf',bbox_inches='tight',dpi=1000)

plt.show()


In [None]:
# Now I want to look at the % of total non-zero terms interacting with the focal locus that are added
# Also want to look at what % of interaction-term variance that represents
# Needs `o1tab`, `gowith`, `ploidies`, `envts`, `floci` from earlier cells.
# Loop variables avoid the name `l`, which the import cell binds to `numpy.linalg`.

# Analyze the megaadder
ma = pd.read_csv('20210324_megaadder_haphom_withdeltas.csv')
ma = ma.drop(columns=['Unnamed: 0'])

# add variance explained
ma['varexp'] = ma['coefficient']**2

# add cumulative variance: within each ploidy/envt/locus group, row 0 is fixed at 0 and
# row i holds the sum of varexp over rows 1..i (NaN propagates, as in a running sum)
cum_parts = []
for ploidy in ploidies:
    for envt in envts:
        for locus in floci:
            temp = ma.loc[(ma['ploidy'] == ploidy)&(ma['envt'] == envt)&(ma['main locus'] == locus)].reset_index(drop=True)
            running = temp['varexp'].values.astype(float)  # astype returns a copy
            if len(running):
                running[0] = 0.0
            # an empty group simply stays empty (no placeholder row is created)
            temp['varexp_cum'] = np.cumsum(running)
            cum_parts.append(temp)
ma = pd.concat(cum_parts).reset_index(drop=True) if cum_parts else ma.iloc[0:0]

#ma = megaadder.copy(deep=True).reset_index(drop=True)
diffthresh = 0.01  # largest |slope change| that still counts as "converged"
yorkn = 100

# Create a modified slope column that takes the reciprocal if abs(abc slope) > 1
raw_slope = ma['abc-original_slope']
ma['abc-original_slope_-1to1'] = np.where(raw_slope.abs() > 1,1/raw_slope,raw_slope)

# Now create a column that says whether a given locus is a turning point.
# A row is the turning point when each of the next three rows has a slope within
# diffthresh of the slope three rows further on (first such row wins).
# Create a second version of dataframe, ma2b, which doesn't contain any rows beyond the turning point.
ma2_parts = []
ma2b_parts = []
for ploidy in ploidies:
    for envt in envts:
        for locus in floci:
            temp = ma.loc[(ma['ploidy'] == ploidy)&(ma['envt'] == envt)&(ma['main locus'] == locus)].copy(deep=True).reset_index(drop=True)
            slopes = temp['abc-original_slope_-1to1'].values

            turn = None
            for i in range(3,len(temp)):
                if all(abs(slopes[i] - slopes[i-back]) <= diffthresh for back in (1,2,3)):
                    turn = i-3
                    break

            if turn is not None:
                temp.at[turn,'d(ds/ds) < diffthresh'] = 'y'
                temp.at[turn,'numterms to d(ds/ds) < diffthresh'] = turn
                temp.at[turn,'sumtermscoeff to d(ds/ds) < diffthresh'] = temp.loc[0:turn,'coefficient'].sum()

            ma2_parts.append(temp)
            # Previously sliced with the leftover loop index (temp.loc[0:i-3]), which dropped
            # the last three rows of a group that never converged. With no turning point
            # there is nothing "beyond" it, so the whole group is kept.
            ma2b_parts.append(temp if turn is None else temp.loc[0:turn])

ma2 = pd.concat(ma2_parts,sort=False) if ma2_parts else ma.iloc[0:0]
ma2b = pd.concat(ma2b_parts,sort=False) if ma2b_parts else ma.iloc[0:0]

# Condense ma2 to the most relevant rows
ma3 = ma2.loc[ma2['d(ds/ds) < diffthresh'] == 'y'].reset_index(drop=True)
ma3 = pd.merge(ma3,o1tab,how='left',on=['ploidy','envt','main locus'])

# create a column comparing the d(ds/ds) < diffthresh slope to the full model slope
ma3['abs(diff): abc-original_full vs final slope'] = abs(ma3['fullmodel_slope_-1to1'] - ma3['abc-original_slope_-1to1'])

# For each row in ma3, add in the # of non-zero terms that COULD have been added, as well as the total variance that COULD have been added
for ploidy in ploidies:
    for envt in envts:
        data0 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/'+gowith[0]+'_fa_'+ploidy+'_'+envt+'_'+str(gowith[1]-1)+'.txt',
                                     sep='\t',names=['todelete','genotype','coeff','na'],skiprows=2,skip_blank_lines=False)

        # keep only the section below the first blank line
        first_blank = data0.loc[data0['genotype'].isnull()].index[0]
        data0 = data0.loc[first_blank+1:,:].copy()

        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)

        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for pos, locus in enumerate(floci):
            data0[locus] = data0['genotype'].str[pos].astype(int)

        data0['numMut'] = data0[floci].sum(axis=1)

        for locus in floci:
            # terms of order > 1 that involve this locus
            reltab = data0.loc[(data0['numMut'] > 1)&(data0[locus] == 1)]
            totalvarexp = (reltab['coeff']**2).sum()
            totalnonzeroterms = len(reltab.loc[reltab['coeff'] != 0])
            # boolean-mask assignment belongs to .loc (.at is the scalar accessor)
            rows = (ma3['ploidy'] == ploidy)&(ma3['envt'] == envt)&(ma3['main locus'] == locus)
            ma3.loc[rows,'total potentially added nonzero terms'] = totalnonzeroterms
            ma3.loc[rows,'total potentially added variance'] = totalvarexp

# add % potential variance explained of the added terms (stored as fractions, not percentages)
ma3['% potential terms'] = ma3['numterms to d(ds/ds) < diffthresh']/ma3['total potentially added nonzero terms']
ma3['% potential varexp'] = ma3['varexp_cum']/ma3['total potentially added variance']
ma3['convergence - full model -1 to 1 slopes'] = ma3['abc-original_slope_-1to1'] - ma3['fullmodel_slope_-1to1']

sthresh = 0.9
conv_b = 'abc-original_slope_-1to1'

# Groups (and point colours) are split on the convergence slope in this cell
ma3t = ma3.loc[ma3[conv_b] <= sthresh].copy(deep=True).reset_index(drop=True)
ma3supert = ma3.loc[ma3[conv_b] > sthresh].copy(deep=True).reset_index(drop=True)
point_colors = np.where(ma3[conv_b] <= sthresh,'xkcd:orange','xkcd:cerulean').tolist()

# One figure per y-quantity: (column, histogram bins, y ticks, y label, output file).
# `mybins` is the loop target so it ends the cell as np.arange(0,1.01,0.02), as before.
figure_specs = [
    ('% potential terms',np.arange(0,0.51,0.02),np.arange(0,0.51,0.1),'Fraction terms necessary',
     'msfigs/Fig3/slopesNumterms_moreinhistbutnottoomuch_v3_percentterms.pdf'),
    ('% potential varexp',np.arange(0,1.01,0.02),np.arange(0,1.01,0.2),'Fraction variance necessary',
     'msfigs/Fig3/slopesNumterms_moreinhistbutnottoomuch_v3_percentvarexp.pdf'),
]
for ycol, mybins, yticks, ylabel, outfile in figure_specs:
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(1.2,1.15*1.2), gridspec_kw={'width_ratios': [2.9,1]},sharey=True)
    fig.subplots_adjust(wspace=0.3)

    # Left panel: scatter against the convergence slope.
    # NOTE(review): the x label says "full model regression slope" but the x data is
    # 'abc-original_slope_-1to1' -- confirm which one is intended.
    axes[0].scatter(ma3[conv_b],ma3[ycol],c=point_colors,alpha=0.5,s=3)
    axes[0].axvline(x=0,color='xkcd:grey',zorder=0,lw=0.5)
    axes[0].axvline(x=1,color='xkcd:grey',zorder=0,lw=0.5)
    axes[0].set_xlabel('\n'.join(wrap('full model regression slope, b',11)))
    axes[0].set_ylabel(ylabel)
    axes[0].set_yticks(yticks)

    # Right panel: horizontal histograms per group, with a line at each group mean
    axes[1].hist(ma3t[ycol],orientation='horizontal',color='xkcd:orange',alpha=0.7,bins=mybins)
    axes[1].hist(ma3supert[ycol],orientation='horizontal',color='xkcd:cerulean',zorder=0,alpha=0.7,bins=mybins)
    axes[1].set_xlabel('count')
    axes[1].axhline(y=ma3t[ycol].mean(),color='xkcd:dark orange',lw=1)
    axes[1].axhline(y=ma3supert[ycol].mean(),color='xkcd:blue',lw=1)
    for side in ('right','top','bottom'):
        axes[1].spines[side].set_visible(False)

    dot1 = Line2D([], [], color='xkcd:orange', marker='.', linestyle='None',
                              markersize=5, label='b ≤ 0.9')
    dot2 = Line2D([], [], color='xkcd:cerulean', marker='.', linestyle='None',
                              markersize=5, label='0.9 < b ≤ 1')

    fig.legend(handles=[dot1,dot2],handletextpad=-0.4,borderpad=0.2,labelspacing=0.1,loc='upper left',bbox_to_anchor=(0.25,1.41),frameon=False)

    fig.savefig(outfile,bbox_inches='tight',dpi=1000)

    plt.show()


In [None]:
# Make the above figures but in style identical to what appears in 3G
# Needs `ma3` (including the '% potential ...' columns) and `sthresh` from the previous cell.

full_b = 'fullmodel_slope_-1to1'

# The scatter and the legend in these figures are keyed on the full-model slope, so the
# histogram groups are rebuilt on that same column here. The previous cell's
# ma3t / ma3supert are split on the convergence slope instead and are left untouched.
low_b_rows = ma3.loc[ma3[full_b] <= sthresh]
high_b_rows = ma3.loc[ma3[full_b] > sthresh]
point_colors = np.where(ma3[full_b] <= sthresh,'xkcd:orange','xkcd:cerulean').tolist()

# Set explicitly (same value the previous cell leaves behind) instead of relying on leftover state
mybins = np.arange(0,1.01,0.02)

# One figure per y-quantity: (column, y label, y ticks, tick labels in percent, y limits, output file)
figure_specs = [
    ('% potential terms','% potential terms sufficient to reach global $b$',
     np.arange(0,0.51,0.1),[0,10,20,30,40,50],(-0.05,0.55),
     'msfigs/Fig3/slopesNumterms_moreinhistbutnottoomuch_v4_percentterms.pdf'),
    ('% potential varexp','% potential epistatic variance sufficient to reach global $b$',
     np.arange(0,1.01,0.2),[0,20,40,60,80,100],(-0.05,1.05),
     'msfigs/Fig3/slopesNumterms_moreinhistbutnottoomuch_v4_percentvarexp.pdf'),
]
for ycol, ylabel, yticks, yticklabels, ylim, outfile in figure_specs:
    # DEFINE FIGURE
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(1.27,1.15*1.2), gridspec_kw={'width_ratios': [2.9,1]},sharey=True)
    fig.subplots_adjust(wspace=0.3)

    # Left panel: scatter against the full-model slope; the y data are fractions, relabelled as percent
    axes[0].scatter(ma3[full_b],ma3[ycol],c=point_colors,alpha=0.5,s=3)
    axes[0].axvline(x=0,color='xkcd:grey',zorder=0,lw=0.5)
    axes[0].axvline(x=1,color='xkcd:grey',zorder=0,lw=0.5)
    axes[0].set_xlabel('\n'.join(wrap('full model regression slope, $b$',11)))
    axes[0].set_ylabel('\n'.join(wrap(ylabel,30)),labelpad=0.2)
    axes[0].set_yticks(yticks)
    axes[0].set_yticklabels(yticklabels)
    axes[0].set_ylim(*ylim)

    # Right panel: horizontal histograms per group, with a line at each group mean
    axes[1].hist(low_b_rows[ycol],bins=mybins,orientation='horizontal',color='xkcd:orange',alpha=0.7)
    axes[1].hist(high_b_rows[ycol],bins=mybins,orientation='horizontal',color='xkcd:cerulean',zorder=0,alpha=0.7)
    axes[1].set_xlabel('count')
    axes[1].axhline(y=low_b_rows[ycol].mean(),color='xkcd:dark orange',lw=1)
    axes[1].axhline(y=high_b_rows[ycol].mean(),color='xkcd:blue',lw=1)
    for side in ('right','top','bottom'):
        axes[1].spines[side].set_visible(False)

    dot1 = Line2D([], [], color='xkcd:orange', marker='.', linestyle='None',
                              markersize=5, label='$b$ ≤ 0.9')
    dot2 = Line2D([], [], color='xkcd:cerulean', marker='.', linestyle='None',
                              markersize=5, label='0.9 < $b$ ≤ 1')

    fig.legend(handles=[dot1,dot2],handletextpad=-0.4,borderpad=0.2,labelspacing=0.1,loc='upper left',bbox_to_anchor=(0.33,1.44),frameon=False)

    fig.savefig(outfile,bbox_inches='tight',dpi=1000)

    plt.show()

## Do the proper removal analysis (2021.07.07)

In [None]:
# Test this analysis with JUST predicted values
# Want to do ABC vs aBC plots now where we subtract terms by rank from predicted values
#
# Outline of this cell:
#   1. Read per-genotype predicted fitness (data1) and the model's terms (data0); both
#      come from the same lasso_v2 file, split at its first blank 'genotype' row.
#   2. For the focal locus, pair genotypes without/with the focal mutation and regress
#      fitness-with (y) on fitness-without (x).
#   3. Zero out the interaction terms that involve the focal locus one at a time, largest
#      |coefficient| first, recompute every genotype's prediction from the remaining terms,
#      and redo the regressions after each removal. Results accumulate in `locusremover`,
#      which is appended to `megaremoverv2`.
# Relies on `ploidies`, `envts`, `floci`, `yorkreg_nocorr`, `york_slope1` and
# `york_slopeanyb` from earlier cells.
# NOTE(review): the loop variables `l` and `t` below rebind names that the import cell
# uses for `numpy.linalg` (as l) and `scipy.stats.t`.

#time the code
mystart = time.perf_counter()

megaremoverv2 = pd.DataFrame()
o=10              # numeric suffix of the input file
ncyc = 30         # maximum number of terms removed per focal locus
ncycthresh = 10   # the early-stop test below is only evaluated once n >= ncycthresh
#pthresh = 0.01
yorkn = 100       # forwarded to the york* regression helpers
#diffthresh = 0.01
ethresh = 0.50    # early stop once the removed share of summed |coefficient| exceeds this

# Restricted to a single ploidy (index 0), environment (index 3) and, further down,
# locus (index 4); the full-range loops are the commented-out alternatives.
for p in np.arange(0,1):
#for p in np.arange(len(ploidies)):
    for e in np.arange(3,4):
    #for e in np.arange(len(envts)):
        
        print(ploidies[p]+'_'+envts[e])
        
        # First, import a list of genotypes to predict
        data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+ploidies[p]+'_'+envts[e]+'_'+str(o)+'.txt',
                             sep='\t',names=['genotype','s',ploidies[p]+'_'+envts[e]+'_s-obs',ploidies[p]+'_'+envts[e]+'_s-obs-err'],skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
        # (keeps the rows above the first row whose 'genotype' is null)
        data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:]
        
        # Remove observed data and duplicates
        data1 = data1[['genotype','s']].copy(deep=True).drop_duplicates().reset_index(drop=True)
        
        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)
        
        # Create a column for each locus, recoded so that allele 0 becomes -1 (alleles are -1/+1)
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l].astype(int)
            data1.loc[(data1[floci[l]] == 0),floci[l]] = -1
        
        # Now import a set of coefficients (same file as data1, but the rows below the blank line)
        data0 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+ploidies[p]+'_'+envts[e]+'_'+str(o)+'.txt',
                                             sep='\t',names=['todelete','genotype',ploidies[p]+'_'+envts[e]+'_term','na'],skiprows=2,skip_blank_lines=False)

        data0 = data0.loc[data0.loc[(data0['genotype'].isnull())].index.tolist()[0]+1:,:]

        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)

        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        # Per-locus 0/1 membership columns for each term; also add the "genotype minus locus"
        # pairing key to data1
        for l in np.arange(len(floci)):
            data0[floci[l]] = data0.loc[:,'genotype'].str[l].astype(int)
            data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

        # numMut = number of loci in the term (its order)
        data0['numMut'] = data0[floci].sum(axis=1)
        
        # Add a "baseline" column for whether these terms are added or subtracted in the WT
        # (-1 for odd-order terms, +1 for even-order terms, i.e. the product of the all -1 alleles)
        for i in np.arange(len(data0)):
            if data0.loc[i,'numMut'] % 2 == 1:
                data0.at[i,'baseline'] = -1
            else:
                data0.at[i,'baseline'] = 1
        
        for l in np.arange(4,5):
        #for l in np.arange(len(floci)):
        
            # Want to start with "all-epistasis" observed fitnesses in data1
            # Get b_obs, TLS_b_obs, TLS_1 (and intercept values)
            # tab0: focal locus absent (-1); tab1: focal locus present (+1)
            tab0 = data1.loc[(data1[floci[l]] == -1)].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[['genotype','s','without_'+floci[l]]]
            tab1 = data1.loc[(data1[floci[l]] == 1)].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[['genotype','s','without_'+floci[l]]]

            # Pair genotypes that differ only at the focal locus (_x: absent, _y: present)
            templ = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

            for i in np.arange(len(floci)):
                templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)
            
            # York regression to get sum of total least squares deviations, S
            # myreg[0], myreg[1] and myreg[3] are used below as slope, intercept and S;
            # myreg1[0] and myreg1[1] as the intercept and S of the slope-1 line
            myreg = yorkreg_nocorr(templ['s_x'],templ['s_y'],len(templ)*[0],len(templ)*[0],yorkn)
            myreg1 = york_slope1(templ['s_x'],templ['s_y'],len(templ)*[0],len(templ)*[0],yorkn)
            
            # Diagnostic plot: fitted line (cerulean), slope-1 line (orange), y = x (black).
            # The axis limits captured here (xs, ys) are reused for every later plot.
            plt.scatter(templ['s_x'],templ['s_y'],alpha=0.7,linestyle='None')
            xs = plt.gca().get_xlim()
            ys = plt.gca().get_ylim()
            mainx = np.linspace(-1,1)
            plt.plot(mainx,mainx*myreg[0]+myreg[1],color='xkcd:cerulean',zorder=0)
            plt.plot(mainx,mainx*1+myreg1[0],color='xkcd:orange',zorder=0)
            plt.plot(mainx,mainx,color='k',zorder=0)
            plt.xlim(xs)
            plt.ylim(ys)
            plt.show()
            
            # Create a subtable - we'll append these together into megaremoverv2
            locusremover = pd.DataFrame()
            
            # wrap up the below, haven't edited yet...
            # Row 0 holds the starting state, before any term has been removed
            locusremover.at[0,'num term add'] = 0
            locusremover.at[0,'term added'] = 'na'
            locusremover.at[0,'term order'] = 'na'
            locusremover.at[0,'coefficient'] = 'na'
            locusremover.at[0,'inferred_b'] = myreg[0]
            locusremover.at[0,'inferred_a'] = myreg[1]
            locusremover.at[0,'inferred_S'] = myreg[3]
            locusremover.at[0,'main_b'] = myreg[0]
            locusremover.at[0,'main_a'] = myreg[1]
            locusremover.at[0,'main_S'] = myreg[3]
            locusremover.at[0,'N'] = len(templ)
            locusremover.at[0,'1_a'] = myreg1[0]
            locusremover.at[0,'1_S'] = myreg1[1]
            
            # OPTION TO CONSIDER: ADD IN ∆S FORMULATIONS, DIVERGENCE FROM SLOPE OF 0
            
            # Begin cycles
            # data0adder is the removal queue: terms of order > 1 that include the focal locus,
            # sorted by |coefficient|, largest first (zero coefficients are not filtered out here)
            data0adder = data0.copy(deep=True).loc[(data0['numMut'] > 1)&(data0[floci[l]] == 1)]
            data0adder['abs coefficient'] = abs(data0adder[ploidies[p]+'_'+envts[e]+'_term'])
            data0adder = data0adder.sort_values(by='abs coefficient',ascending=False)
            
            # create a builder dataframe on which changes will be processed
            # (it is never reset inside the cycle loop, so removals accumulate)
            builder = data0.copy(deep=True)
            
            for n in np.arange(ncyc):
            #for n in np.arange(0,1):
                # find the top index (i.e., the strongest coefficient)
                topind = data0adder.index[n]
                
                # remove the strongest epistatic coefficient that involves locus l
                builder.at[topind,ploidies[p]+'_'+envts[e]+'_term'] = 0
                
                # record which term was removed in locusremover (the column names keep the
                # 'add' wording)
                locusremover.at[n+1,'num term add'] = n+1
                locusremover.at[n+1,'term added'] = data0adder.loc[topind,'genotype'] 
                locusremover.at[n+1,'term order'] = data0adder.loc[topind,'numMut']
                locusremover.at[n+1,'coefficient'] = data0adder.loc[topind,ploidies[p]+'_'+envts[e]+'_term']
                
                # Estimate the new predicted value for each genotype
                # NOTE(review): `temp` (the non-zero rows of builder) and `myintercept` do not
                # depend on g, yet both are rebuilt for every genotype.
                for g in np.arange(len(data1)):
                    temp = builder.copy(deep=True)
                    # to make life faster, remove all zero values
                    temp = temp.loc[(temp[ploidies[p]+'_'+envts[e]+'_term'] != 0)].reset_index(drop=True)
                    for t in np.arange(len(temp)):
                        # term membership (0/1) times genotype allele (-1/+1), locus by locus
                        temprowlist = []
                        for locus in np.arange(len(floci)):
                            temprowlist = temprowlist + [temp.loc[t,floci[locus]]*data1.loc[g,floci[locus]]]
                        #remove zeros (loci that are not part of this term)
                        temprowlist = [value for value in temprowlist if value != 0]
                        # find product of the genotype's alleles over the loci in the term
                        firstprod = np.prod(temprowlist)
                        # subtract out baseline, so the term contributes 0 for the all -1 genotype
                        firstprod_lessbaseline = firstprod - temp.loc[t,'baseline']
                        # multiply by term's value
                        mytermval = firstprod_lessbaseline * temp.loc[t,ploidies[p]+'_'+envts[e]+'_term']
                        temp.at[t,'tosum'] = mytermval
                    totalsum = temp['tosum'].sum()
                    # intercept = predicted s of the genotype '0000000000'
                    myintercept = data1.loc[(data1['genotype'] == '0000000000'),'s'].values[0]
                    data1.at[g,'new_pred'] = totalsum + myintercept
                    
                # For the new predictions, get the values I want
                # (same without/with pairing as above, now on 'new_pred')
                tab0 = data1.loc[(data1[floci[l]] == -1)].copy(deep=True).reset_index(drop=True)
                tab0 = tab0[['genotype','new_pred','without_'+floci[l]]]
                tab1 = data1.loc[(data1[floci[l]] == 1)].copy(deep=True).reset_index(drop=True)
                tab1 = tab1[['genotype','new_pred','without_'+floci[l]]]

                templ = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

                for i in np.arange(len(floci)):
                    templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

                my_x = templ['new_pred_x']
                my_y = templ['new_pred_y']
                my_x_err = len(templ)*[0]
                my_y_err = len(templ)*[0]

                # Free fit to the new predictions
                myregnew = yorkreg_nocorr(my_x,my_y,my_x_err,my_y_err,yorkn)
                locusremover.at[n+1,'inferred_b'] = myregnew[0]
                locusremover.at[n+1,'inferred_a'] = myregnew[1]
                locusremover.at[n+1,'inferred_S'] = myregnew[3]
                
                # Fit with myreg[0] (the starting slope) passed to york_slopeanyb
                myreg_main = york_slopeanyb(my_x,my_y,my_x_err,my_y_err,yorkn,myreg[0])
                locusremover.at[n+1,'main_b'] = myreg[0]
                locusremover.at[n+1,'main_a'] = myreg_main[0]
                locusremover.at[n+1,'main_S'] = myreg_main[1]
                
                # Slope-1 fit
                myreg_1 = york_slope1(my_x,my_y,my_x_err,my_y_err,yorkn)
                locusremover.at[n+1,'1_a'] = myreg_1[0]
                locusremover.at[n+1,'1_S'] = myreg_1[1]
                
                locusremover.at[n+1,'N'] = len(templ)
                
                # Diagnostic plot: starting-slope line (cerulean), slope-1 line (orange),
                # free fit (gold), y = x (black), on the axis limits of the first plot
                plt.scatter(my_x,my_y,alpha=0.7,linestyle='None')
                mainx = np.linspace(-1,1)
                plt.plot(mainx,mainx*myreg[0]+myreg_main[0],color='xkcd:cerulean',zorder=0)
                plt.plot(mainx,mainx*1+myreg_1[0],color='xkcd:orange',zorder=0)
                plt.plot(mainx,mainx*myregnew[0]+myregnew[1],color='xkcd:gold',zorder=0)
                plt.plot(mainx,mainx,color='k',zorder=0)
                plt.xlim(xs)
                plt.ylim(ys)
                plt.show()
                
                # Share of the queue's summed |coefficient| that has been removed so far
                print(abs(locusremover.loc[1:,'coefficient']).sum()/data0adder['abs coefficient'].sum())

                # Early stop: only tested once n >= ncycthresh, and triggered when the
                # removed share exceeds ethresh
                if n >= ncycthresh:
                    if (abs(locusremover.loc[1:,'coefficient']).sum()/data0adder['abs coefficient'].sum()) > ethresh:
                        break
 
            # Re-pad the removed-term labels to 10 characters (row 0 keeps its 'na' placeholder).
            # NOTE(review): `.at` is documented for scalar labels; `.loc` is the documented
            # accessor for a slice like `1:` -- confirm this works on the installed pandas.
            locusremover.at[1:,'term added'] = locusremover.loc[1:,'term added'].astype(int).astype(str).str.zfill(10)
            locusremover.insert(0,'main locus', floci[l])
            locusremover.insert(0,'envt',envts[e])
            locusremover.insert(0,'ploidy',ploidies[p])
            
            megaremoverv2 = megaremoverv2.append(locusremover)
            print(floci[l])

mystop = time.perf_counter()
elapsed = mystop-mystart
print(str(elapsed)+' seconds to run')

# Optional exports of the result table (disabled)
#export_csv = megaremoverv2.to_csv(r'20210801_megaremoverv2_hap_salt_PMA1_pred.csv',index=True,header=True) 
#export_csv = megaremoverv2.to_csv(r'20210712_megaremoverv2_haphom.csv',index=True,header=True) 

In [None]:
# Want to do ABC vs aBC plots now where, instead of adding terms by rank to form predicted values,
# we subtract them by rank to adjust observed values.
# NOTE: TAKES SOMETHING LIKE 9H
# (this note was previously a bare, uncommented line, which is a SyntaxError and
#  prevented the cell from running at all)

# time the code
mystart = time.perf_counter()

# Accumulator for the per-locus removal tables produced by the loop below
megaremoverv2 = pd.DataFrame()

o = 10            # suffix of the lasso_v2_fa_<ploidy>_<envt>_<o>.txt input files
ncyc = 30         # maximum number of term-removal cycles per locus
ncycthresh = 10   # the early-stop test is only evaluated once the cycle index n >= this
#pthresh = 0.01
yorkn = 100       # last positional argument of the York regression helpers
                  # (presumably an iteration count -- TODO confirm against their definitions)
#diffthresh = 0.01
ethresh = 0.50    # early stop once removed sum|coef| / total sum|coef| exceeds this

# Main removal loop. For every ploidy / environment / focal locus:
#   1. pair each genotype lacking the focal mutation with the same background carrying it,
#   2. zero out epistatic terms involving the focal locus one at a time (largest |coefficient| first),
#   3. rebuild the model prediction for every genotype and refit the paired regression.
# Relies on names defined in earlier cells: ploidies, envts, floci, yorkreg_nocorr,
# york_slope1, york_slopeanyb, plus the configuration constants (o, ncyc, ncycthresh,
# yorkn, ethresh) and the megaremoverv2 accumulator.
# (For a quick test run, restrict the ranges below, e.g. np.arange(0,1).)
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):

        pe_tag = ploidies[p]+'_'+envts[e]
        print(pe_tag)

        lasso_path = 'CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+pe_tag+'_'+str(o)+'.txt'
        pred_col = pe_tag+'_Alex prediction'
        obs_col = pe_tag+'_s-obs'
        err_col = pe_tag+'_s-obs-err'
        term_col = pe_tag+'_term'

        # First, import a list of genotypes to predict
        data1 = pd.read_csv(lasso_path,sep='\t',names=['genotype',pred_col,obs_col,err_col],
                            skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
        # (everything from the first blank 'genotype' row onwards)
        data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts
        # from having same genotype represented multiple times.
        # sort=False keeps genotypes in order of first appearance.
        records = []
        for geno, tempg in data1.groupby('genotype',sort=False):
            if len(tempg) == 1:
                s_val = tempg[obs_col].iloc[0]
                s_err = tempg[err_col].iloc[0]
            else:
                s_val = tempg[obs_col].mean()
                # root-mean-square of the replicate errors (between-replicate variance is not added)
                s_err = np.sqrt(np.mean(tempg[err_col]**2))
            records.append({'genotype': geno, 's': s_val, 'stderr(s)': s_err,
                            's_pred': tempg[pred_col].iloc[0]})
        data1 = pd.DataFrame(records)

        # Calculate the difference between observed and predicted, such that s_obs = s_pred + diff
        data1['opdiff'] = data1['s'] - data1['s_pred']

        # Create a column for each locus, coded -1 (digit 0) / +1 (digit 1)
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l].astype(int)
            data1.loc[(data1[floci[l]] == 0),floci[l]] = -1

        # Now import a set of coefficients (the rows below the blank line of the same file)
        data0 = pd.read_csv(lasso_path,sep='\t',names=['todelete','genotype',term_col,'na'],
                            skiprows=2,skip_blank_lines=False)

        data0 = data0.loc[data0.loc[(data0['genotype'].isnull())].index.tolist()[0]+1:,:].copy()

        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)

        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for l in np.arange(len(floci)):
            # 0/1 membership of each locus in the term
            data0[floci[l]] = data0.loc[:,'genotype'].str[l].astype(int)
            # genotype string with the digit of locus l removed: key for pairing backgrounds
            data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

        data0['numMut'] = data0[floci].sum(axis=1)

        # Add a "baseline" column for whether these terms are added or subtracted in the WT:
        # -1 for terms with an odd number of loci, +1 otherwise
        data0['baseline'] = np.where(data0['numMut'] % 2 == 1, -1.0, 1.0)

        # Invariants for the prediction step below (none of these change during removal)
        geno_signs = data1[floci].to_numpy()   # rows: genotypes, columns: loci, values -1/+1
        myintercept = data1.loc[(data1['genotype'] == '0000000000'),'s_pred'].values[0]

        for l in np.arange(len(floci)):

            # Want to start with "all-epistasis" observed fitnesses in data1
            # Get b_obs, TLS_b_obs, TLS_1 (and intercept values)
            tab0 = data1.loc[(data1[floci[l]] == -1)].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[['genotype','s','stderr(s)','without_'+floci[l]]]
            tab1 = data1.loc[(data1[floci[l]] == 1)].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[['genotype','s','stderr(s)','without_'+floci[l]]]

            # _x columns: background without the focal mutation; _y columns: with it
            templ = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

            for i in np.arange(len(floci)):
                templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

            # York regression to get sum of total least squares deviations, S.
            # Elements [0], [1], [3] of the free fit are stored below as slope b, intercept a and S;
            # the fixed-slope fits are stored as ([0] intercept, [1] S).
            myreg = yorkreg_nocorr(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)
            myreg1 = york_slope1(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)

            # Create a subtable - we'll append these together into megaremoverv2
            locusremover = pd.DataFrame()

            # Row 0: the starting point, before any term has been removed
            locusremover.at[0,'num term add'] = 0
            locusremover.at[0,'term added'] = 'na'
            locusremover.at[0,'term order'] = 'na'
            locusremover.at[0,'coefficient'] = 'na'
            locusremover.at[0,'inferred_b'] = myreg[0]
            locusremover.at[0,'inferred_a'] = myreg[1]
            locusremover.at[0,'inferred_S'] = myreg[3]
            locusremover.at[0,'main_b'] = myreg[0]
            locusremover.at[0,'main_a'] = myreg[1]
            locusremover.at[0,'main_S'] = myreg[3]
            locusremover.at[0,'N'] = len(templ)
            locusremover.at[0,'1_a'] = myreg1[0]
            locusremover.at[0,'1_S'] = myreg1[1]

            # OPTION TO CONSIDER: ADD IN ∆S FORMULATIONS, DIVERGENCE FROM SLOPE OF 0

            # Begin cycles
            # data0adder serves as the sorted databank of the epistatic (numMut > 1)
            # coefficients that involve locus l, strongest first
            data0adder = data0.loc[(data0['numMut'] > 1)&(data0[floci[l]] == 1)].copy()
            data0adder['abs coefficient'] = abs(data0adder[term_col])
            data0adder = data0adder.sort_values(by='abs coefficient',ascending=False)

            # create a builder dataframe on which changes will be processed
            builder = data0.copy(deep=True)

            # Never run more cycles than there are candidate terms
            # (indexing past the end of data0adder used to raise IndexError)
            for n in np.arange(min(ncyc,len(data0adder))):
                # find the top index (i.e., the strongest remaining coefficient)
                topind = data0adder.index[n]

                # remove the strongest epistatic coefficient that involves locus l
                builder.at[topind,term_col] = 0

                # add coefficient info in locusremover
                locusremover.at[n+1,'num term add'] = n+1
                locusremover.at[n+1,'term added'] = data0adder.loc[topind,'genotype']
                locusremover.at[n+1,'term order'] = data0adder.loc[topind,'numMut']
                locusremover.at[n+1,'coefficient'] = data0adder.loc[topind,term_col]

                # Estimate the new predicted value for each genotype.
                # For genotype g and term t the contribution is
                #     (product of g's -1/+1 codes over the loci in t  -  baseline_t) * coefficient_t
                # and the prediction is the sum over all non-zero terms plus the WT prediction.
                # This is the same arithmetic as the former per-genotype/per-term Python loops,
                # evaluated for all genotypes and terms at once (results agree up to
                # floating-point rounding of the sum).
                active = builder.loc[(builder[term_col] != 0)]
                term_members = active[floci].to_numpy()
                term_base = active['baseline'].to_numpy(dtype=float)
                term_coef = active[term_col].to_numpy(dtype=float)
                # signed[g, t, j] is 0 where locus j is not part of term t; those factors are
                # replaced by 1 so they drop out of the product (an empty product is 1)
                signed = term_members[None,:,:] * geno_signs[:,None,:]
                prods = np.where(signed == 0, 1, signed).prod(axis=2)
                contrib = (prods - term_base[None,:]) * term_coef[None,:]
                # nansum mirrors the NaN-skipping pandas sum used previously; it also yields 0
                # when no non-zero terms remain (that case used to raise KeyError)
                data1['new_pred'] = np.nansum(contrib,axis=1) + myintercept
                data1['new_obs'] = data1['new_pred'] + data1['opdiff']

                # For the new predictions, get the values I want
                tab0 = data1.loc[(data1[floci[l]] == -1)].copy(deep=True).reset_index(drop=True)
                tab0 = tab0[['genotype','new_obs','stderr(s)','without_'+floci[l]]]
                tab1 = data1.loc[(data1[floci[l]] == 1)].copy(deep=True).reset_index(drop=True)
                tab1 = tab1[['genotype','new_obs','stderr(s)','without_'+floci[l]]]

                templ = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

                for i in np.arange(len(floci)):
                    templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

                my_x = templ['new_obs_x']
                my_y = templ['new_obs_y']
                my_x_err = templ['stderr(s)_x']
                my_y_err = templ['stderr(s)_y']

                # free-slope fit on the adjusted values
                myregnew = yorkreg_nocorr(my_x,my_y,my_x_err,my_y_err,yorkn)
                locusremover.at[n+1,'inferred_b'] = myregnew[0]
                locusremover.at[n+1,'inferred_a'] = myregnew[1]
                locusremover.at[n+1,'inferred_S'] = myregnew[3]

                # fit with the slope held at the original (pre-removal) value myreg[0]
                myreg_main = york_slopeanyb(my_x,my_y,my_x_err,my_y_err,yorkn,myreg[0])
                locusremover.at[n+1,'main_b'] = myreg[0]
                locusremover.at[n+1,'main_a'] = myreg_main[0]
                locusremover.at[n+1,'main_S'] = myreg_main[1]

                # fit with the slope held at 1
                myreg_1 = york_slope1(my_x,my_y,my_x_err,my_y_err,yorkn)
                locusremover.at[n+1,'1_a'] = myreg_1[0]
                locusremover.at[n+1,'1_S'] = myreg_1[1]

                locusremover.at[n+1,'N'] = len(templ)

                # fraction of the summed |coefficient| (terms involving locus l) removed so far
                removed_frac = abs(locusremover.loc[1:,'coefficient']).sum()/data0adder['abs coefficient'].sum()
                print(removed_frac)

                if n >= ncycthresh and removed_frac > ethresh:
                    break

            # .loc, not .at: .at is a scalar accessor and rejects the 1: slice in current pandas
            locusremover.loc[1:,'term added'] = locusremover.loc[1:,'term added'].astype(int).astype(str).str.zfill(10)
            locusremover.insert(0,'main locus', floci[l])
            locusremover.insert(0,'envt',envts[e])
            locusremover.insert(0,'ploidy',ploidies[p])

            # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
            megaremoverv2 = pd.concat([megaremoverv2,locusremover])
            print(floci[l])

# Report the wall-clock duration of the removal run (mystart is set at the top of this cell)
mystop = time.perf_counter()
elapsed = mystop - mystart
print(f'{elapsed} seconds to run')

# Persist the combined removal table; later analysis cells read this file back in.
# The index is written too, which is why readers drop the 'Unnamed: 0' column.
export_csv = megaremoverv2.to_csv(
    r'20210712_megaremoverv2_haphom.csv',
    index=True,
    header=True,
)

In [None]:
## Analyze the megaremoverv2
# Read the saved removal table back in; the first (unnamed) column is the written-out index.
mr2 = pd.read_csv('20210712_megaremoverv2_haphom.csv').drop(columns=['Unnamed: 0'])

# Create a main_b_-1to1 column and inferred_b_-1to1 column:
# slopes <= 1 are kept as they are, slopes > 1 are replaced by their reciprocal.
# This will help in partitioning the data to look just at those that have a FCT by our criteria
mr2['main_b_-1to1'] = np.where(mr2['main_b'] <= 1, mr2['main_b'], 1/mr2['main_b'])
mr2['inferred_b_-1to1'] = np.where(mr2['inferred_b'] <= 1, mr2['inferred_b'], 1/mr2['inferred_b'])

# Take ratios of the S values
mr2['main/inferred'] = mr2['main_S'] / mr2['inferred_S']
mr2['1/inferred'] = mr2['1_S'] / mr2['inferred_S']
mr2['1/main'] = mr2['1_S'] / mr2['main_S']

# For each ploidy-envt-locus, plot ratios
# Start by doing just those below the FCT threshold b, can toggle
fct_thresh = 0.9

# One figure per ploidy-envt (first two ploidies only), one curve per locus passing the threshold
for p in np.arange(2):
    for e in np.arange(len(envts)):
        tempfloci = []
        for l in np.arange(len(floci)):
            in_group = (
                (mr2['ploidy'] == ploidies[p])
                & (mr2['envt'] == envts[e])
                & (mr2['main locus'] == floci[l])
            )
            mr2sub = mr2.loc[in_group].copy(deep=True).reset_index(drop=True)
            # Row 0 is the first row of this locus' sub-table; skip the locus unless its
            # folded slope there is at or below the threshold
            if not (mr2sub.loc[0,'main_b_-1to1'] <= fct_thresh):
                continue
            tempfloci.append(floci[l])
            plt.plot(mr2sub['num term add'], mr2sub['1/main'])
            # alternative ratios that can be plotted instead:
            #plt.plot(mr2sub['num term add'],mr2sub['main/inferred'])
            #plt.plot(mr2sub['num term add'],mr2sub['1/inferred'])
        plt.title(ploidies[p]+'-'+envts[e])
        plt.axhline(y=1, color='xkcd:grey', zorder=0)
        plt.legend(tempfloci)
        plt.show()




In [None]:
# Make the above plots as a subplots figure for the supplement:
# 6 rows (environments) x 2 columns (ploidies), first 10 removed terms only

fig,axes = plt.subplots(nrows=6, ncols=2,sharex=True,sharey=False,figsize=(6,9),constrained_layout=False)

rows = envts
cols = ['Haploid','Homozygous']

# Column titles on the top row, environment names as y-labels on the left column
for ax, col in zip(axes[0], cols):
    ax.set_title(col, size=8,fontweight='bold')

for ax, row in zip(axes[:,0], rows):
    ax.set_ylabel(row, rotation=90, size=8,fontweight='bold')
fig.align_ylabels()

# Shared axis captions for the whole figure
fig.text(0.4,0.08,'# terms removed (by rank)',size=8,fontweight='bold')

fig.text(0, 0.5, 'Relative fit ratio, SSE$_{b=1}$ / SSE$_{b=global}$', va='center', rotation='vertical',size=8, fontweight = 'bold')

mr2 = pd.read_csv('20210712_megaremoverv2_haphom.csv')
mr2 = mr2.drop(columns=['Unnamed: 0'])

# Create a main_b_-1to1 column and inferred_b_-1to1 column
# (slopes <= 1 kept, slopes > 1 replaced by their reciprocal).
# This will help in partitioning the data to look just at those that have a FCT by our criteria
mr2['main_b_-1to1'] = np.where(mr2['main_b'] <= 1, mr2['main_b'], 1/mr2['main_b'])
mr2['inferred_b_-1to1'] = np.where(mr2['inferred_b'] <= 1, mr2['inferred_b'], 1/mr2['inferred_b'])

# Take ratios of the S values
mr2['main/inferred'] = mr2['main_S']/mr2['inferred_S']
mr2['1/inferred'] = mr2['1_S']/mr2['inferred_S']
mr2['1/main'] = mr2['1_S']/mr2['main_S']

# For each ploidy-envt-locus, plot ratios
# Start by doing just those below the FCT threshold b, can toggle

fct_thresh = 1

for p in np.arange(2):
    for e in np.arange(len(envts)):
        tempfloci = []
        for l in np.arange(len(floci)):
            mr2sub = mr2.loc[(mr2['ploidy'] == ploidies[p])&(mr2['envt'] == envts[e])&(mr2['main locus'] == floci[l])&(mr2['num term add'] <= 10)].copy(deep=True).reset_index(drop=True)
            if mr2sub.loc[0,'main_b_-1to1'] <= fct_thresh:
                tempfloci = tempfloci + [floci[l]]
                # Label each curve with its own locus so the legend never depends on plotting order
                axes[e][p].plot(mr2sub['num term add'],mr2sub['1/main'],alpha=0.5,label=floci[l])
        axes[e][p].set_xlim(-0.5,10.5)
        axes[e][p].axhline(y=1,color='xkcd:grey',zorder=0,lw=0.5)

# The legend is built from the labelled curves of the top-right panel. Previously the labels
# were the loci of the LAST panel drawn, matched by position to the top-right panel's lines,
# which mislabels curves whenever fct_thresh excludes different loci in different panels.
leg = axes[0][1].legend(title='Locus',loc='upper left',ncol=5,bbox_to_anchor=(-0.95,1.8))
#leg._legend_box.align = "left"
fig.savefig('msfigs/Fig3/removalbype_v01.pdf',bbox_inches='tight')
plt.show()


In [None]:
# NOTE(review): the original heading of this cell said it extends the figure to "removing all
# terms (like in fig 3E)", but the code does not do that: it plots only num term add <= 10 and
# writes msfigs/Fig3/removalbype_v01.pdf. It looks like a repeat of the 10-term supplement
# figure; consider deleting this cell.

fig,axes = plt.subplots(nrows=6, ncols=2,sharex=True,sharey=False,figsize=(6,9),constrained_layout=False)

rows = envts
cols = ['Haploid','Homozygous']

# Column titles on the top row, environment names as y-labels on the left column
for ax, col in zip(axes[0], cols):
    ax.set_title(col, size=8,fontweight='bold')

for ax, row in zip(axes[:,0], rows):
    ax.set_ylabel(row, rotation=90, size=8,fontweight='bold')
fig.align_ylabels()

fig.text(0.4,0.08,'# terms removed (by rank)',size=8,fontweight='bold')

fig.text(0, 0.5, 'Relative fit ratio, SSE$_{b=1}$ / SSE$_{b=global}$', va='center', rotation='vertical',size=8, fontweight = 'bold')

mr2 = pd.read_csv('20210712_megaremoverv2_haphom.csv')
mr2 = mr2.drop(columns=['Unnamed: 0'])

# Create a main_b_-1to1 column and inferred_b_-1to1 column
# (slopes <= 1 kept, slopes > 1 replaced by their reciprocal).
# This will help in partitioning the data to look just at those that have a FCT by our criteria
for slope_col, folded_col in (('main_b','main_b_-1to1'),('inferred_b','inferred_b_-1to1')):
    mr2[folded_col] = np.where(mr2[slope_col] <= 1, mr2[slope_col], 1/mr2[slope_col])

# Take ratios of the S values
mr2['main/inferred'] = mr2['main_S']/mr2['inferred_S']
mr2['1/inferred'] = mr2['1_S']/mr2['inferred_S']
mr2['1/main'] = mr2['1_S']/mr2['main_S']

# For each ploidy-envt-locus, plot ratios
# Start by doing just those below the FCT threshold b, can toggle

fct_thresh = 1

for p in np.arange(2):
    for e in np.arange(len(envts)):
        tempfloci = []
        for l in np.arange(len(floci)):
            mr2sub = mr2.loc[(mr2['ploidy'] == ploidies[p])&(mr2['envt'] == envts[e])&(mr2['main locus'] == floci[l])&(mr2['num term add'] <= 10)].copy(deep=True).reset_index(drop=True)
            if mr2sub.loc[0,'main_b_-1to1'] <= fct_thresh:
                tempfloci = tempfloci + [floci[l]]
                # Attach the locus name to the curve itself; the legend picks it up from there
                axes[e][p].plot(mr2sub['num term add'],mr2sub['1/main'],alpha=0.5,label=floci[l])
        axes[e][p].set_xlim(-0.5,10.5)
        axes[e][p].axhline(y=1,color='xkcd:grey',zorder=0,lw=0.5)

# Legend from the labelled curves of the top-right panel (positional labels taken from the
# last panel drawn could be attached to the wrong curves once fct_thresh filters loci)
leg = axes[0][1].legend(title='Locus',loc='upper left',ncol=5,bbox_to_anchor=(-0.95,1.8))
#leg._legend_box.align = "left"
fig.savefig('msfigs/Fig3/removalbype_v01.pdf',bbox_inches='tight')
plt.show()


In [None]:
# Create the same supplementary figure as above, except extend to removing all terms (like in fig 3E):
# each curve gets one extra point at x = 11, tick-labelled 'all'

# Import the "all removed" and normal megaremoverv2 tables
mr2 = pd.read_csv('20210712_megaremoverv2_haphom.csv')
mr2 = mr2.drop(columns=['Unnamed: 0'])

mr2all = pd.read_csv('20210712_megaremoverv2_haphom_ALLTERMSREMOVED.csv')
mr2all = mr2all.drop(columns=['Unnamed: 0'])
# keep only the rows whose 'coefficient' entry is the string 'all'
mr2all = mr2all.loc[(mr2all['coefficient'] == 'all')].reset_index(drop=True)

# The same derived columns are needed on both tables
for tab in (mr2, mr2all):
    # Create a main_b_-1to1 column and inferred_b_-1to1 column
    # (slopes <= 1 kept, slopes > 1 replaced by their reciprocal).
    # This will help in partitioning the data to look just at those that have a FCT by our criteria
    tab['main_b_-1to1'] = np.where(tab['main_b'] <= 1, tab['main_b'], 1/tab['main_b'])
    tab['inferred_b_-1to1'] = np.where(tab['inferred_b'] <= 1, tab['inferred_b'], 1/tab['inferred_b'])

    # Take ratios of the S values
    tab['main/inferred'] = tab['main_S']/tab['inferred_S']
    tab['1/inferred'] = tab['1_S']/tab['inferred_S']
    tab['1/main'] = tab['1_S']/tab['main_S']

fig,axes = plt.subplots(nrows=6, ncols=2,sharex=True,sharey=False,figsize=(6,9),constrained_layout=False)

rows = envts
cols = ['Haploid','Homozygous']

for ax, col in zip(axes[0], cols):
    ax.set_title(col, size=8,fontweight='bold')

for ax, row in zip(axes[:,0], rows):
    ax.set_ylabel(row, rotation=90, size=8,fontweight='bold')
fig.align_ylabels()

fig.text(0.4,0.08,'# terms removed (by rank)',size=8,fontweight='bold')

fig.text(0, 0.5, 'Relative fit ratio, SSE$_{b=1}$ / SSE$_{b=global}$', va='center', rotation='vertical',size=8, fontweight = 'bold')

# For each ploidy-envt-locus, plot ratios
# Start by doing just those below the FCT threshold b, can toggle

fct_thresh = 1

for p in np.arange(2):
    for e in np.arange(len(envts)):
        tempfloci = []
        for l in np.arange(len(floci)):
            mr2sub = mr2.loc[(mr2['ploidy'] == ploidies[p])&(mr2['envt'] == envts[e])&(mr2['main locus'] == floci[l])&(mr2['num term add'] <= 10)].copy(deep=True).reset_index(drop=True)
            if mr2sub.loc[0,'main_b_-1to1'] <= fct_thresh:
                tempfloci = tempfloci + [floci[l]]
                tempall = mr2all.loc[(mr2all['ploidy'] == ploidies[p])&(mr2all['envt'] == envts[e])&(mr2all['main locus'] == floci[l])].copy(deep=True).reset_index(drop=True)
                # x = 0..10 from the stepwise table, then x = 11 for the all-terms-removed ratio
                termlist = list(mr2sub['num term add']) + [float(11)]
                ratlist = list(mr2sub['1/main']) + [tempall.loc[0,'1/main']]

                # Label each curve with its own locus so the legend never depends on plotting order
                axes[e][p].plot(termlist,ratlist,alpha=0.5,label=floci[l])
        axes[e][p].set_xlim(-0.5,11.5)
        axes[e][p].axhline(y=1,color='xkcd:grey',zorder=0,lw=0.5)
        axes[e][p].set_xticks([0,1,2,3,4,5,6,7,8,9,10,11])
        axes[e][p].set_xticklabels([0,1,2,3,4,5,6,7,8,9,10,'all'])

# Legend from the labelled curves of the top-right panel. Previously the label list came from
# the last panel drawn and was matched by position, which mislabels curves whenever
# fct_thresh excludes different loci in different panels.
leg = axes[0][1].legend(title='Locus',loc='upper left',ncol=5,bbox_to_anchor=(-0.95,1.8))

#leg._legend_box.align = "left"
fig.savefig('msfigs/Fig3/removalbype_v02.pdf',bbox_inches='tight')
plt.show()





In [None]:
# Compare the final 1/main to the all-sense heritability
# Put it all in a table
# Requires: mr2 with its derived columns ('inferred_b_-1to1', '1/main') from an earlier cell,
# and bioreplr (biological-replicate table with 'ploidy', 'envt' and 'r^2' columns).

# first get the relevant 1/main SSE ratios: one row per ploidy / envt / locus
detab_rows = []
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):
        for l in np.arange(len(floci)):
            temp = mr2.loc[(mr2['ploidy'] == ploidies[p])&(mr2['envt'] == envts[e])&(mr2['main locus'] == floci[l])].reset_index(drop=True)

            # 'num term add' of the last row = number of terms removed when the run stopped;
            # it is also used as the row label of that last row
            mymaxind = temp.iloc[-1]['num term add']

            detab_rows.append({
                'ploidy': ploidies[p],
                'envt': envts[e],
                'main locus': floci[l],
                'inferred_b_-1to1_original': temp.loc[0,'inferred_b_-1to1'],
                '1/main_final': temp.loc[int(mymaxind),'1/main'],
                'num term add': mymaxind,
            })

# build the table once instead of growing it cell by cell
detab = pd.DataFrame(detab_rows)

detab['ploidy-envt'] = detab['ploidy']+'-'+detab['envt']

# now figure out how to merge it with the biological replicate data
workingbr = bioreplr.copy(deep=True)
workingbr['ploidy-envt'] = workingbr['ploidy']+'-'+workingbr['envt']

detab = pd.merge(detab,workingbr,on='ploidy-envt',how='left')


def _plot_final_ratio_vs_r2(table, min_points=0):
    """Scatter the final SSE ratio against bio-rep r^2, one colour and regression line per locus.

    table      -- rows of detab to plot (needs 'main locus', 'r^2', '1/main_final')
    min_points -- loci with fewer rows than this are skipped

    Prints the squared correlation of each plotted locus and shows the figure.
    """
    for locus in floci:
        sub = table.loc[table['main locus'] == locus]
        if len(sub) < min_points:
            continue
        plt.scatter(sub['r^2'],sub['1/main_final'])
        locus_reg = linregress(sub['r^2'],sub['1/main_final'])
        # The locus name is attached to the regression line. Passing a bare list of labels to
        # plt.legend matches labels to artists by position, and with alternating scatter/line
        # artists that attaches names to the wrong elements on Matplotlib versions that order
        # legend handles by draw order.
        plt.plot(sub['r^2'],sub['r^2']*locus_reg.slope+locus_reg.intercept,alpha=0.7,lw=0.5,label=locus)
        print(locus+': r^2 is '+str(round(locus_reg.rvalue**2,3)))
    plt.legend()
    plt.xlabel('r^2, biological replicates (for ploidy-envt)')
    plt.ylabel('SSE b=1 / SSE b=global ("final")')
    plt.show()


# plot the results, locus by locus
_plot_final_ratio_vs_r2(detab)

# now do it for only those slopes which we consider to be FCTs (threshold of 0.9)
# also limit it to ones where we have 3 points or more
tempfct = detab.loc[detab['inferred_b_-1to1_original'] <= 0.9]
_plot_final_ratio_vs_r2(tempfct, min_points=3)

# same as above but without hom-ypda
tempfctr = tempfct.loc[tempfct['ploidy-envt'] != 'hom-YPDA']
_plot_final_ratio_vs_r2(tempfctr, min_points=3)

In [None]:
# Just do the FCTs (> 2 points) loci

# Want to update the above so that it has truly the "final" SSE ratio
# Right now, it's just final after removing at most 30 terms, which won't be enough in many cases to get to the residual
# Starting point is mr2all
# Requires bioreplr (biological-replicate table with 'ploidy', 'envt' and 'r^2' columns).

mr2all = pd.read_csv('20210712_megaremoverv2_haphom_ALLTERMSREMOVED.csv')
mr2all = mr2all.drop(columns=['Unnamed: 0'])
#mr2all = mr2all.loc[(mr2all['coefficient'] == 'all')].reset_index(drop=True)

# Create a main_b_-1to1 column and inferred_b_-1to1 column
# (slopes <= 1 kept, slopes > 1 replaced by their reciprocal).
# This will help in partitioning the data to look just at those that have a FCT by our criteria
mr2all['main_b_-1to1'] = np.where(mr2all['main_b'] <= 1, mr2all['main_b'], 1/mr2all['main_b'])
mr2all['inferred_b_-1to1'] = np.where(mr2all['inferred_b'] <= 1, mr2all['inferred_b'], 1/mr2all['inferred_b'])

# Take ratios of the S values
mr2all['main/inferred'] = mr2all['main_S']/mr2all['inferred_S']
mr2all['1/inferred'] = mr2all['1_S']/mr2all['inferred_S']
mr2all['1/main'] = mr2all['1_S']/mr2all['main_S']

# One row per ploidy / envt / locus
detab_rows = []
for p in np.arange(len(ploidies)):
    for e in np.arange(len(envts)):
        for l in np.arange(len(floci)):
            temp = mr2all.loc[(mr2all['ploidy'] == ploidies[p])&(mr2all['envt'] == envts[e])&(mr2all['main locus'] == floci[l])].reset_index(drop=True)

            detab_rows.append({
                'ploidy': ploidies[p],
                'envt': envts[e],
                'main locus': floci[l],
                # row 0: folded slope of the first row of this locus' sub-table
                'inferred_b_-1to1_original': temp.loc[0,'inferred_b_-1to1'],
                # row 1: presumably the all-terms-removed row of this file -- TODO confirm
                '1/main_final': temp.loc[1,'1/main'],
            })

# build the table once instead of growing it cell by cell
detab = pd.DataFrame(detab_rows)

detab['ploidy-envt'] = detab['ploidy']+'-'+detab['envt']

# now figure out how to merge it with the biological replicate data
workingbr = bioreplr.copy(deep=True)
workingbr['ploidy-envt'] = workingbr['ploidy']+'-'+workingbr['envt']

detab = pd.merge(detab,workingbr,on='ploidy-envt',how='left')

# plot the results, locus by locus
# now do it for only those slopes which we consider to be FCTs (threshold of 0.9)
# also limit it to ones where we have 3 points or more

fig,ax = plt.subplots(figsize=(5.5,3))

plottedfloci = []
fct_thresh = 0.9
tempfct = detab.loc[detab['inferred_b_-1to1_original'] <= fct_thresh]
for l in np.arange(len(floci)):
    temp = tempfct.loc[tempfct['main locus'] == floci[l]]
    if len(temp) > 2:
        # label= ties the legend entry to this scatter directly instead of relying on the
        # order of a separately maintained label list
        ax.scatter(temp['r^2'],temp['1/main_final'],alpha=0.7,s=15,label=floci[l])
        plottedfloci = plottedfloci + [floci[l]]

leg = ax.legend(ncol=3,
                handlelength = 1,labelspacing=0.4,columnspacing=1,handletextpad=0.4,frameon=True,
                loc='upper right',bbox_to_anchor=(0.32,0.22),borderpad=0.5,fontsize=6)
# left-align the legend title with the entries (uses a private Matplotlib attribute)
leg._legend_box.align = "left"
leg.set_title('Locus')
#leg.get_frame().set_linewidth(0.5)
ax.set_xlabel('Bio rep R$^2$')
ax.set_ylabel('\n'.join(wrap('Final SSE$_{b=1}$ / SSE$_{b=global}$, all epistasis removed',40)),labelpad=2)
ax.axhline(y=1,zorder=0,lw=0.5,color='xkcd:grey')

fig.savefig("msfigs/SIfigs/residual-ratio_r2_justfcts_v2.pdf",bbox_inches='tight',dpi=300)

plt.show()


In [None]:
# Do the megaremoverv2 on JUST the residuals by setting all values in builder to 0
#
# For every ploidy/environment and every focal locus, genotypes that differ only at the focal
# locus are paired (focal allele -1 on x, +1 on y) and fitted twice:
#   row 0 of locusremover: observed fitnesses, nothing removed
#   row 1 of locusremover: every epistatic term involving the focal locus set to 0 in the model,
#                          with each genotype's observed-minus-predicted residual added back
# All per-locus tables are stacked into megaremoverv2allremoved and written to CSV.

#time the code
mystart = time.perf_counter()

o=10
ncyc = 30
ncycthresh = 10
#pthresh = 0.01
yorkn = 100
#diffthresh = 0.01
ethresh = 0.50

# Per-locus result tables; concatenated once after the loops
# (DataFrame.append no longer exists in pandas >= 2.0).
locusremover_tables = []

#for p in np.arange(0,1):
for p in np.arange(len(ploidies)):
    #for e in np.arange(0,1):
    for e in np.arange(len(envts)):

        # shared name pieces for this ploidy/environment
        pe = ploidies[p]+'_'+envts[e]
        termcol = pe+'_term'
        lassofile = 'CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+pe+'_'+str(o)+'.txt'

        print(pe)

        # First, import a list of genotypes to predict
        data1 = pd.read_csv(lassofile,
                            sep='\t',names=['genotype',pe+'_Alex prediction',pe+'_s-obs',pe+'_s-obs-err'],skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
        # keep only the rows above the first row whose genotype is missing
        data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts from having same genotype
        # represented multiple times.
        glist = list(OrderedDict.fromkeys(data1['genotype']))

        data2 = pd.DataFrame()

        for g in np.arange(len(glist)):
            tempg = data1.loc[(data1['genotype'] == glist[g])].reset_index(drop=True)
            if len(tempg) == 1:
                data2.at[g,'genotype'] = glist[g]
                data2.at[g,'s'] = tempg.loc[0,pe+'_s-obs']
                data2.at[g,'stderr(s)'] = tempg.loc[0,pe+'_s-obs-err']
            elif len(tempg) > 1:
                data2.at[g,'genotype'] = glist[g]
                data2.at[g,'s'] = tempg[pe+'_s-obs'].mean()
                # between-replicate variance is deliberately left out (set to 0);
                # the combined error is the root of the mean squared standard error
                #my_svar = statistics.variance(tempg['s'])
                my_svar = 0
                mymean_stderr = np.mean(tempg[pe+'_s-obs-err']**2)
                data2.at[g,'stderr(s)'] = np.sqrt(my_svar+mymean_stderr)
            # the prediction is taken from the first occurrence of the genotype
            data2.at[g,'s_pred'] = tempg.loc[0,pe+'_Alex prediction']

        data1 = data2

        # Calculate the difference between observed and predicted, such that s_obs = s_pred + diff
        data1['opdiff'] = data1['s'] - data1['s_pred']

        # Create a column for each locus, coded -1 (digit 0) / +1 (digit 1)
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l].astype(int)
            data1.loc[(data1[floci[l]] == 0),floci[l]] = -1

        # Now import a set of coefficients (same file, read with the column layout of its lower section)
        data0 = pd.read_csv(lassofile,
                            sep='\t',names=['todelete','genotype',termcol,'na'],skiprows=2,skip_blank_lines=False)

        # keep only the rows below the first row whose genotype is missing
        data0 = data0.loc[data0.loc[(data0['genotype'].isnull())].index.tolist()[0]+1:,:].copy()

        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)

        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for l in np.arange(len(floci)):
            # 0/1 membership of each locus in each term
            data0[floci[l]] = data0.loc[:,'genotype'].str[l].astype(int)
            # genotype string with the focal position deleted: the key used to pair backgrounds
            data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

        data0['numMut'] = data0[floci].sum(axis=1)

        # Add a "baseline" column for whether these terms are added or subtracted in the WT:
        # -1 for terms with an odd number of loci, +1 otherwise
        data0['baseline'] = np.where(data0['numMut'] % 2 == 1,-1.0,1.0)

        #for l in np.arange(2,3):
        for l in np.arange(len(floci)):

            # Want to start with "all-epistasis" observed fitnesses in data1
            # Get b_obs, TLS_b_obs, TLS_1 (and intercept values)
            tab0 = data1.loc[(data1[floci[l]] == -1)].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[['genotype','s','stderr(s)','without_'+floci[l]]]
            tab1 = data1.loc[(data1[floci[l]] == 1)].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[['genotype','s','stderr(s)','without_'+floci[l]]]

            # _x columns: focal allele -1, _y columns: focal allele +1, same background
            templ = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

            for i in np.arange(len(floci)):
                templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

            # York regression to get sum of total least squares deviations, S
            myreg = yorkreg_nocorr(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)
            myreg1 = york_slope1(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)

            # Create a subtable - these are stacked together into megaremoverv2allremoved
            locusremover = pd.DataFrame()

            # row 0: nothing removed, so the "inferred" and "main" fits are the same regression
            locusremover.at[0,'num term add'] = 0
            locusremover.at[0,'term added'] = 'na'
            locusremover.at[0,'term order'] = 'na'
            locusremover.at[0,'coefficient'] = 'na'
            locusremover.at[0,'inferred_b'] = myreg[0]
            locusremover.at[0,'inferred_a'] = myreg[1]
            locusremover.at[0,'inferred_S'] = myreg[3]
            locusremover.at[0,'main_b'] = myreg[0]
            locusremover.at[0,'main_a'] = myreg[1]
            locusremover.at[0,'main_S'] = myreg[3]
            locusremover.at[0,'N'] = len(templ)
            locusremover.at[0,'1_a'] = myreg1[0]
            locusremover.at[0,'1_S'] = myreg1[1]

            #plot it
            plt.errorbar(templ['s_x'],templ['s_y'],xerr=templ['stderr(s)_x'],yerr=templ['stderr(s)_y'],alpha=0.7,linestyle='None')
            mainx = np.linspace(-1,1)
            # remember the data-driven limits so the reference lines do not rescale the axes
            xs = plt.gca().get_xlim()
            ys = plt.gca().get_ylim()
            plt.plot(mainx,mainx*myreg[0]+myreg[1],color='xkcd:cerulean',zorder=0)
            plt.plot(mainx,mainx*1+myreg1[0],color='xkcd:orange',zorder=0)
            plt.plot(mainx,mainx,color='k',zorder=0)
            plt.xlim(xs)
            plt.ylim(ys)
            plt.title(ploidies[p]+'-'+envts[e]+'_'+floci[l]+': none removed')
            plt.show()

            # OPTION TO CONSIDER: ADD IN ∆S FORMULATIONS, DIVERGENCE FROM SLOPE OF 0

            # data0adder holds the epistatic terms (more than one locus) that involve locus l,
            # sorted by absolute coefficient
            data0adder = data0.loc[(data0['numMut'] > 1)&(data0[floci[l]] == 1)].copy(deep=True)
            data0adder['abs coefficient'] = abs(data0adder[termcol])
            data0adder = data0adder.sort_values(by='abs coefficient',ascending=False)

            # create a builder dataframe on which changes will be processed
            builder = data0.copy(deep=True)

            # row labels of every epistatic term involving locus l
            allind = data0adder.index

            # remove all epistatic coefficients that involve locus l
            # (.loc, not .at: this assigns to many rows at once)
            builder.loc[allind,termcol] = 0

            # add coefficient info in locusremover
            locusremover.at[1,'num term add'] = 'all'
            locusremover.at[1,'term added'] = 'all'
            locusremover.at[1,'term order'] = 'all'
            locusremover.at[1,'coefficient'] = 'all'

            # Estimate the new predicted value for each genotype.
            # The surviving (non-zero) terms and the wild-type intercept do not depend on the
            # genotype, so they are prepared once rather than once per genotype.
            temp = builder.loc[(builder[termcol] != 0)].reset_index(drop=True)
            term_has_locus = temp[floci].to_numpy() != 0    # terms x loci
            term_vals = temp[termcol].to_numpy()
            term_baseline = temp['baseline'].to_numpy()
            geno_signs = data1[floci].to_numpy()            # genotypes x loci, entries -1/+1
            myintercept = data1.loc[(data1['genotype'] == '0000000000'),'s_pred'].values[0]
            for g in np.arange(len(data1)):
                # per term: product of this genotype's -1/+1 values over the loci in the term
                # (loci not in the term contribute 1; a term with no loci gives 1)
                firstprod = np.where(term_has_locus,geno_signs[g],1).prod(axis=1)
                # subtract out baseline, then multiply by term's value
                temp['tosum'] = (firstprod - term_baseline) * term_vals
                data1.at[g,'new_pred'] = temp['tosum'].sum() + myintercept
            # add each genotype's residual back onto the reduced-model prediction
            data1['new_obs'] = data1['new_pred'] + data1['opdiff']

            # For the new predictions, get the values I want
            tab0 = data1.loc[(data1[floci[l]] == -1)].copy(deep=True).reset_index(drop=True)
            tab0 = tab0[['genotype','new_obs','stderr(s)','without_'+floci[l]]]
            tab1 = data1.loc[(data1[floci[l]] == 1)].copy(deep=True).reset_index(drop=True)
            tab1 = tab1[['genotype','new_obs','stderr(s)','without_'+floci[l]]]

            templ = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

            for i in np.arange(len(floci)):
                templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

            my_x = templ['new_obs_x']
            my_y = templ['new_obs_y']
            my_x_err = templ['stderr(s)_x']
            my_y_err = templ['stderr(s)_y']

            # free fit to the epistasis-removed data
            myregnew = yorkreg_nocorr(my_x,my_y,my_x_err,my_y_err,yorkn)
            locusremover.at[1,'inferred_b'] = myregnew[0]
            locusremover.at[1,'inferred_a'] = myregnew[1]
            locusremover.at[1,'inferred_S'] = myregnew[3]

            # fit with the slope held at the original (row 0) slope
            myreg_main = york_slopeanyb(my_x,my_y,my_x_err,my_y_err,yorkn,myreg[0])
            locusremover.at[1,'main_b'] = myreg[0]
            locusremover.at[1,'main_a'] = myreg_main[0]
            locusremover.at[1,'main_S'] = myreg_main[1]

            # fit with the slope held at 1
            myreg_1 = york_slope1(my_x,my_y,my_x_err,my_y_err,yorkn)
            locusremover.at[1,'1_a'] = myreg_1[0]
            locusremover.at[1,'1_S'] = myreg_1[1]

            locusremover.at[1,'N'] = len(templ)

            #print(abs(locusremover.loc[1:,'coefficient']).sum()/data0adder['abs coefficient'].sum())

            #plot it
            plt.errorbar(my_x,my_y,xerr=my_x_err,yerr=my_y_err,alpha=0.7,linestyle='None')
            mainx = np.linspace(-1,1)
            xs = plt.gca().get_xlim()
            ys = plt.gca().get_ylim()
            plt.plot(mainx,mainx*myreg[0]+myreg_main[0],color='xkcd:cerulean',zorder=0)
            plt.plot(mainx,mainx*1+myreg_1[0],color='xkcd:orange',zorder=0)
            plt.plot(mainx,mainx*myregnew[0]+myregnew[1],color='xkcd:gold',zorder=0)
            plt.plot(mainx,mainx,color='k',zorder=0)
            plt.xlim(xs)
            plt.ylim(ys)
            plt.title(ploidies[p]+'-'+envts[e]+'_'+floci[l]+': all removed')
            plt.show()

            locusremover.insert(0,'main locus', floci[l])
            locusremover.insert(0,'envt',envts[e])
            locusremover.insert(0,'ploidy',ploidies[p])

            locusremover_tables.append(locusremover)
            print(floci[l])

# stack the per-locus tables, keeping each table's own 0/1 row labels
if locusremover_tables:
    megaremoverv2allremoved = pd.concat(locusremover_tables)
else:
    megaremoverv2allremoved = pd.DataFrame()

mystop = time.perf_counter()
elapsed = mystop-mystart
print(str(elapsed)+' seconds to run')

export_csv = megaremoverv2allremoved.to_csv(r'20210712_megaremoverv2_haphom_ALLTERMSREMOVED.csv',index=True,header=True)
            
            


In [68]:
# Goal for now is to get a table, then can figure out a bit how to present best

# Import the normal megaremoverv2 table and the "all removed" one; the latter is cut down
# to its 'all' rows (the rows written after every term was removed)
mr2 = pd.read_csv('20210712_megaremoverv2_haphom.csv').drop(columns=['Unnamed: 0'])

mr2all = pd.read_csv('20210712_megaremoverv2_haphom_ALLTERMSREMOVED.csv').drop(columns=['Unnamed: 0'])
mr2all = mr2all.loc[(mr2all['coefficient'] == 'all')].reset_index(drop=True)

# The same derived columns are added to both tables
for tab in (mr2,mr2all):
    # main_b_-1to1 / inferred_b_-1to1: the slope itself when it is <= 1, otherwise its reciprocal.
    # This will help in partitioning the data to look just at those that have a FCT by our criteria
    for bcol in ('main_b','inferred_b'):
        tab[bcol+'_-1to1'] = tab[bcol].where(tab[bcol] <= 1,1/tab[bcol])

    # Take ratios of the S values
    tab['main/inferred'] = tab['main_S']/tab['inferred_S']
    tab['1/inferred'] = tab['1_S']/tab['inferred_S']
    tab['1/main'] = tab['1_S']/tab['main_S']

# count stuff, getting a proportion
# mr2rep has one row per category of folded slope (all, <= fct_thresh, > fct_thresh) and, for each
# number of removed terms (1-10, then "allremoved"), three columns:
#   <prefix>_denom       number of locus/condition rows in the category
#   <prefix>_num1better  how many of those have SSE(b=1) < SSE(b=main), i.e. '1/main' < 1
#   <prefix>_frac        num1better / denom
mr2rep = pd.DataFrame()

fct_thresh = 0.9

def _tally_b1_better(report, prefix, tab, thresh):
    """Write the denom / num1better / frac columns for `prefix` into `report`, counted from `tab`.

    Every mask is built from `tab` itself, so counts can never be taken from a different table.
    """
    b1_better = tab['1/main'] < 1
    category_masks = [pd.Series(True,index=tab.index),
                      tab['main_b_-1to1'] <= thresh,
                      tab['main_b_-1to1'] > thresh]
    for row,in_category in enumerate(category_masks):
        report.at[row,prefix+'_denom'] = int(in_category.sum())
        report.at[row,prefix+'_num1better'] = int((in_category & b1_better).sum())
    report[prefix+'_frac'] = report[prefix+'_num1better'] / report[prefix+'_denom']

for n in np.arange(1,11):
    _tally_b1_better(mr2rep,str(n),mr2.loc[(mr2['num term add'] == n)],fct_thresh)

# The original '> threshold' count here masked mr2all with mr2['1/main'] (wrong table);
# it is now counted from mr2all like the other two categories.
_tally_b1_better(mr2rep,'allremoved',mr2all,fct_thresh)

mr2rep.insert(0,'category',['all','<='+str(fct_thresh),'>'+str(fct_thresh)])
    
        


In [None]:

# Median (with 25th-75th percentile bars) of SSE(b=1)/SSE(b=global) against the number of
# removed terms, one series per ceiling on the folded global slope.
thresh_list = [1.0,0.9,0.8]
color_list = ['xkcd:cerulean','xkcd:orange','xkcd:wine red']

# small horizontal shifts so the three series do not overlap
offsets = [-0.1,0,0.1]

fig,ax = plt.subplots(nrows=1, ncols=1, figsize=(4*0.65,2*0.58))

for thresh,color,offset in zip(thresh_list,color_list,offsets):
    # twelve groups of ratios: 0-10 terms removed (from mr2), then "all removed" (from mr2all)
    ratio_groups = [mr2.loc[(mr2['num term add'] == n)&(mr2['main_b_-1to1'] <= thresh),'1/main']
                    for n in np.arange(11)]
    ratio_groups.append(mr2all.loc[mr2all['main_b_-1to1'] <= thresh,'1/main'])

    medlist = []
    uplist = []
    downlist = []
    for ratios in ratio_groups:
        # Each group's bars are measured from that group's OWN median. The original reused the
        # median of the 10-terms group for the "all removed" point, which misplaced its bars
        # and could make them negative.
        my_median = np.median(ratios)
        medlist.append(my_median)
        downlist.append(my_median - np.quantile(ratios,0.25))
        uplist.append(np.quantile(ratios,0.75) - my_median)

    my_error = [downlist,uplist]
    markers,caps,bars = ax.errorbar(np.arange(12)+offset,medlist,yerr=my_error,elinewidth=0.8,alpha=0.7,color=color)#,capsize=10)
    for bar in bars:
        bar.set_alpha(0.7)

leg = ax.legend(['all','≤ 0.9','≤ 0.8'],ncol=3,
                handlelength = 1,labelspacing=0.4,columnspacing=1,handletextpad=0.4,frameon=True,
                loc='upper right',bbox_to_anchor=(1.0,1),borderpad=0.3,fontsize=6)
leg._legend_box.align = "left"
leg.set_title('Global $b$',prop={'size':6})
leg.get_frame().set_linewidth(0.5)

ax.set_xlabel('# terms removed (by rank)',labelpad=2)
ax.set_ylabel('\n'.join(wrap('Relative fit ratio, SSE$_{b=1}$ / SSE$_{b=global}$',30)),labelpad=2)

# ratio of 1: both slopes fit equally well
ax.axhline(y=1,lw=0.5,c='xkcd:grey',zorder=0)

ax.set_xticks([0,1,2,3,4,5,6,7,8,9,10,11])
ax.set_xticklabels([0,1,2,3,4,5,6,7,8,9,10,'all'])
# create some more white space vertically
ax.set_ylim(0.3,1.9)

# add text in the white space
ax.text(-0.34,0.35,'$b = 1$ favored',fontsize=6,color='k')
ax.text(-0.34,1.74,'Global $b$ favored',fontsize=6)

# Shade the region below a ratio of 1 (where b = 1 is favored)
xmin = ax.get_xlim()[0]
xmax = ax.get_xlim()[1]
ymin = ax.get_ylim()[0]
ymax = ax.get_ylim()[1]
# freeze the limits so the filled polygon does not rescale the axes
ax.autoscale(False)
ax.fill([xmin,xmin,xmax,xmax], [ymin,1,1,ymin], color='xkcd:light grey', alpha=0.5, edgecolor=None, zorder=0)


fig.savefig('msfigs/Fig3/removal_v02.pdf',bbox_inches='tight')
plt.show()


In [None]:
# Make a subplots version of the above for the SI
# One panel per ceiling on the folded global slope; each panel shows two SSE ratios
# (b=1 and b=global, each relative to the freely fitted slope) against the number of removed terms.
fig,axes = plt.subplots(nrows=3, ncols=1,sharex=True,sharey=False,figsize=(5*0.65,6*0.58),constrained_layout=True)

thresh_list = [1.0,0.9,0.8]

# small horizontal shifts so the two series in a panel do not overlap
offsets = [-0.1,0,0.1]

rlist = ['1/inferred','main/inferred']

for panel,thresh in enumerate(thresh_list):
    for series,ratio_col in enumerate(rlist):
        # twelve groups of ratios: 0-10 terms removed (from mr2), then "all removed" (from mr2all)
        ratio_groups = [mr2.loc[(mr2['num term add'] == n)&(mr2['main_b_-1to1'] <= thresh),ratio_col]
                        for n in np.arange(11)]
        ratio_groups.append(mr2all.loc[mr2all['main_b_-1to1'] <= thresh,ratio_col])

        medlist = []
        uplist = []
        downlist = []
        for ratios in ratio_groups:
            # Bars are measured from the group's OWN median. The original reused the median of
            # the 10-terms group for the "all removed" point.
            my_median = np.median(ratios)
            medlist.append(my_median)
            downlist.append(my_median - np.quantile(ratios,0.25))
            uplist.append(np.quantile(ratios,0.75) - my_median)

        my_error = [downlist,uplist]
        markers,caps,bars = axes[panel].errorbar(np.arange(12)+offsets[series],medlist,yerr=my_error,elinewidth=0.8,alpha=0.7)#,capsize=10)
        for bar in bars:
            bar.set_alpha(0.7)

    if panel == 0:
        # legend only on the top panel, placed outside the axes; drawn once both series exist
        axes[panel].legend(['SSE$_{b=1}$ / SSE$_{min}$','SSE$_{b=global}$ / SSE$_{min}$'],loc='upper right',bbox_to_anchor=(1.7,1))
    axes[panel].axhline(y=1,lw=0.5,c='xkcd:grey',zorder=0)
    axes[panel].text(0.03,0.87,'b ≤ '+str(thresh),transform=axes[panel].transAxes,
                     ha='left',fontsize=7,color='k')

axes[2].set_xlabel('# terms removed (by rank)')
#axes[t].set_ylabel('SSE ratio')
axes[2].set_xticks([0,1,2,3,4,5,6,7,8,9,10,11])
axes[2].set_xticklabels([0,1,2,3,4,5,6,7,8,9,10,'all'])

# one shared y label for the three panels
fig.text(-0.03, 0.5, 'SSE ratio', va='center', rotation='vertical',size=8, fontweight = 'bold')

fig.savefig('msfigs/Fig3/removal_2rats_v02.pdf',bbox_inches='tight')
plt.show()


## PMA1 adding analysis

In [None]:
# Get a figure of adding terms one by one to PMA1 hap_4NQO prediction
# First generate the data
#
# Want to do ABC vs aBC plots now where, instead of subtracting terms by rank,
# we add them by rank.
#
# Outputs used by later cells:
#   locusadder       one row per number of added terms (0 = none): which term, its coefficient,
#                    and the fitted slope/intercept
#   locusadder_full  the paired per-background predictions behind each of those rows
# NOTE(review): the p/e/l ranges below are assumed to select haploid, 4NQO and PMA1 - confirm
# against ploidies, envts and floci.

#time the code
#mystart = time.perf_counter()

#megaadder = pd.DataFrame()
o=10
ncyc = 30
ncycthresh = 2
#pthresh = 0.01
yorkn = 100
diffthresh = 0.01


def _pma1_predict(builder, data1, termcol, predcol):
    """Write data1['new_pred'] from the non-zero terms of `builder`.

    For each genotype and each term, the genotype's -1/+1 values are multiplied over the loci
    in the term, the term's 'baseline' is subtracted, and the result is scaled by the term's
    coefficient. The contributions are summed and added to the wild-type ('0000000000')
    value of `predcol`. data1 is modified in place.
    """
    # nothing here depends on the genotype, so prepare it once
    temp = builder.loc[(builder[termcol] != 0)].reset_index(drop=True)
    term_has_locus = temp[floci].to_numpy() != 0    # terms x loci
    term_vals = temp[termcol].to_numpy()
    term_baseline = temp['baseline'].to_numpy()
    geno_signs = data1[floci].to_numpy()            # genotypes x loci, entries -1/+1
    myintercept = data1.loc[(data1['genotype'] == '0000000000'),predcol].values[0]
    for g in np.arange(len(data1)):
        # loci not in a term contribute 1; a term with no loci gives a product of 1
        firstprod = np.where(term_has_locus,geno_signs[g],1).prod(axis=1)
        temp['tosum'] = (firstprod - term_baseline) * term_vals
        data1.at[g,'new_pred'] = temp['tosum'].sum() + myintercept


def _pma1_pair_backgrounds(data1, focal):
    """Pair genotypes that differ only at `focal`: _x has focal allele -1, _y has +1.

    Also adds one 0/1 column per locus, decoded from the _x genotype string.
    """
    keep = ['genotype','new_pred','without_'+focal]
    tab0 = data1.loc[(data1[focal] == -1),keep].reset_index(drop=True)
    tab1 = data1.loc[(data1[focal] == 1),keep].reset_index(drop=True)
    templ = pd.merge(tab0,tab1,how='inner',on='without_'+focal)
    for i in np.arange(len(floci)):
        templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)
    return templ


for p in np.arange(0,1):
#for p in np.arange(len(ploidies)):
    for e in np.arange(1,2):
    #for e in np.arange(len(envts)):

        # shared name pieces for this ploidy/environment
        pe = ploidies[p]+'_'+envts[e]
        termcol = pe+'_term'
        predcol = pe+'_Alex prediction'
        lassofile = 'CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+pe+'_'+str(o)+'.txt'

        #print(ploidies[p]+'_'+envts[e])

        # First, import a list of genotypes to predict
        data1 = pd.read_csv(lassofile,
                            sep='\t',names=['genotype',predcol,pe+'_s-obs',pe+'_s-obs-err'],skiprows=2,skip_blank_lines=False)

        # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
        # keep only the rows above the first row whose genotype is missing
        data1 = data1.loc[:data1.loc[(data1['genotype'].isnull())].index.tolist()[0]-1,:].copy()

        # Binary style for genotype
        data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

        # Since we're just using the predicted s, remove the obs columns
        data1 = data1.drop(columns=[pe+'_s-obs',pe+'_s-obs-err']).drop_duplicates('genotype').reset_index(drop=True)

        # Create a column for each locus, coded -1 (digit 0) / +1 (digit 1)
        for l in np.arange(len(floci)):
            data1[floci[l]] = data1.loc[:,'genotype'].str[l].astype(int)
            data1.loc[(data1[floci[l]] == 0),floci[l]] = -1

        # Now import a set of coefficients (same file, read with the column layout of its lower section)
        data0 = pd.read_csv(lassofile,
                            sep='\t',names=['todelete','genotype',termcol,'na'],skiprows=2,skip_blank_lines=False)

        # keep only the rows below the first row whose genotype is missing
        data0 = data0.loc[data0.loc[(data0['genotype'].isnull())].index.tolist()[0]+1:,:].copy()

        data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(10)

        data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

        for l in np.arange(len(floci)):
            # 0/1 membership of each locus in each term
            data0[floci[l]] = data0.loc[:,'genotype'].str[l].astype(int)
            # genotype string with the focal position deleted: the key used to pair backgrounds
            data1['without_'+floci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

        data0['numMut'] = data0[floci].sum(axis=1)

        # Add a "baseline" column for whether these terms are added or subtracted in the WT:
        # -1 for terms with an odd number of loci, +1 otherwise
        data0['baseline'] = np.where(data0['numMut'] % 2 == 1,-1.0,1.0)

        for l in np.arange(4,5):
        #for l in np.arange(len(floci)):

            # Want to start with "no-epistasis" predictions in data1
            # builder is where we build up coefficients for fitness predictions
            # Start by setting all epistatic terms INVOLVING THE FOCAL LOCUS to 0
            # (.loc, not .at: this assigns to every row selected by the mask)
            builder = data0.copy(deep=True)
            builder.loc[(builder['numMut'] > 1)&(builder[floci[l]] == 1),termcol] = 0

            _pma1_predict(builder,data1,termcol,predcol)

            locusadder = pd.DataFrame()
            # paired tables, one per number of added terms; concatenated after the cycles
            # (DataFrame.append no longer exists in pandas >= 2.0)
            full_tables = []

            templ = _pma1_pair_backgrounds(data1,floci[l])
            templ['num term add'] = 0
            full_tables.append(templ)

            my_x = templ['new_pred_x']
            my_y = templ['new_pred_y']

            locusadder.at[0,'num term add'] = 0
            locusadder.at[0,'term added'] = np.nan
            locusadder.at[0,'term order'] = np.nan
            locusadder.at[0,'coefficient'] = np.nan
            #locusadder.at[0,'1:1 R2'] = 1-RSS/TSS

            # Fit a line through the paired predictions; the York routine is called with all-zero
            # uncertainties. NOTE(review): the original comment called this a standard
            # least-squares regression - confirm that is what yorkreg_nocorr reduces to.
            myreg = yorkreg_nocorr(my_x,my_y,[0]*len(my_x),[0]*len(my_y),yorkn)
            locusadder.at[0,'slope'] = myreg[0]
            locusadder.at[0,'intercept'] = myreg[1]

            # Begin cycles
            # data0adder: epistatic terms involving locus l, strongest (by absolute value) first
            data0adder = data0.loc[(data0['numMut'] > 1)&(data0[floci[l]] == 1)].copy(deep=True)
            data0adder['abs coefficient'] = abs(data0adder[termcol])
            data0adder = data0adder.sort_values(by='abs coefficient',ascending=False)

            for n in np.arange(5):
            #for n in np.arange(ncyc):
                if len(data0adder) == 0:
                    # every epistatic term involving locus l has already been added back
                    break

                # add the strongest epistatic coefficient that involves locus l
                topind = data0adder.index[0]
                builder.at[topind,termcol] = data0adder.loc[topind,termcol]

                # log coefficient info in locusadder
                locusadder.at[n+1,'num term add'] = n+1
                locusadder.at[n+1,'term added'] = data0adder.loc[topind,'genotype']
                locusadder.at[n+1,'term order'] = data0adder.loc[topind,'numMut']
                locusadder.at[n+1,'coefficient'] = data0adder.loc[topind,termcol]

                # Estimate the new predicted value for each genotype
                _pma1_predict(builder,data1,termcol,predcol)

                # For the new predictions, get the values I want
                templ = _pma1_pair_backgrounds(data1,floci[l])
                templ['num term add'] = n+1
                full_tables.append(templ)

                my_x = templ['new_pred_x']
                my_y = templ['new_pred_y']

                # same zero-uncertainty line fit as for the no-epistasis predictions
                myreg = yorkreg_nocorr(my_x,my_y,[0]*len(my_x),[0]*len(my_y),yorkn)
                locusadder.at[n+1,'slope'] = myreg[0]
                locusadder.at[n+1,'intercept'] = myreg[1]

                # Remove strongest epistatic coefficient from data0adder
                data0adder = data0adder[1:]

                # stop early once the slope has moved by at most diffthresh relative to each of
                # the three preceding rows
                if n >= ncycthresh:
                    diff3 = abs(locusadder.loc[n+1,'slope'] - locusadder.loc[n-2,'slope'])
                    diff2 = abs(locusadder.loc[n+1,'slope'] - locusadder.loc[n-1,'slope'])
                    diff1 = abs(locusadder.loc[n+1,'slope'] - locusadder.loc[n,'slope'])

                    if diff3 <= diffthresh and diff2 <= diffthresh and diff1 <= diffthresh:
                        break

            locusadder_full = pd.concat(full_tables,sort=False)

            # Re-pad the logged term genotypes to 10 binary digits (row 0 stays NaN).
            # The column is made object-typed first so the strings can be stored in it.
            termadded = locusadder.loc[1:,'term added'].astype(int).astype(str).str.zfill(10)
            locusadder['term added'] = locusadder['term added'].astype(object)
            locusadder.loc[1:,'term added'] = termadded
            locusadder.insert(0,'main locus', floci[l])
            locusadder.insert(0,'envt',envts[e])
            locusadder.insert(0,'ploidy',ploidies[p])

#mystop = time.perf_counter()
#elapsed = mystop-mystart
#print(str(elapsed)+' seconds to run')

In [None]:
# Same as immediately above except 2 rows

# Styling shared by every panel of this figure
colorsnow = ['xkcd:cerulean','xkcd:orange','xkcd:bright green','xkcd:indigo','xkcd:pink','xkcd:rust','xkcd:gold','xkcd:bright aqua']
markersize = 0.25
myalpha = 0.5
mylw = 0.6
maincolor = 'xkcd:dark grey'
linecolor = 'xkcd:grey'
mainx=np.linspace(-1,1)
# the three background loci and their eight 0/1 allele combinations; points are coloured by combination
bgl = ['WHI2','RHO5','MKT1']
gs = [[0,0,0],[0,0,1],[0,1,0],[0,1,1],[1,0,0],[1,0,1],[1,1,0],[1,1,1]]
yorkn = 100

# indices into ploidies, envts and floci for the case being plotted
p = 0
e = 1
l = 4

fig,ax = plt.subplots(nrows=2, ncols=4, sharex=True, sharey=True,figsize=(3.5*0.8,2.05*0.85))
fig.subplots_adjust(wspace=0.2*0.85,hspace=0.65)

#figd,axd = plt.subplots(nrows=1,ncols=8,sharex=True,sharey=True,figsize=(5,1.1),constrained_layout=True)

# First panel: the predictions with no terms added, one colour per background combination
noterm_rows = locusadder_full.loc[(locusadder_full['num term add'] == 0)]
for g,combo in enumerate(gs):
    in_combo = (noterm_rows[bgl[0]] == combo[0])&(noterm_rows[bgl[1]] == combo[1])&(noterm_rows[bgl[2]] == combo[2])
    my_x = noterm_rows.loc[in_combo,'new_pred_x']
    my_y = noterm_rows.loc[in_combo,'new_pred_y']

    ax[0][0].scatter(my_x,my_y,s=markersize,alpha=myalpha,color=colorsnow[g])

xs = (-0.5,0.1)
ys = (-0.5,0.1)

# fitted line for the no-terms-added row, plus the 1:1 line
noterm_fit = locusadder.loc[(locusadder['num term add'] == 0)]
thisslope = noterm_fit['slope'].values[0]
thisintercept = noterm_fit['intercept'].values[0]
ax[0][0].plot(mainx,mainx*thisslope+thisintercept,color=linecolor,lw=mylw,zorder=0)
ax[0][0].plot(mainx,mainx,color='k',lw=mylw,zorder=0)

ax[0][0].set_xlim(xs)
ax[0][0].set_ylim(ys)

# Generate the full-model plot (panel ax[1][2]): predicted fitness with floci[l] in state 1 (y)
# against predicted fitness of the same background with floci[l] in state 0 (x).
# NOTE(review): `o`, `ploidies`, `envts`, `floci` and `yorkreg_nocorr` are not defined in this cell;
# they must already exist in the kernel. `o` in particular selects which input file is read -- confirm its value.
predcol = ploidies[p]+'_'+envts[e]+'_Alex prediction'
obscol = ploidies[p]+'_'+envts[e]+'_s-obs'
errcol = ploidies[p]+'_'+envts[e]+'_s-obs-err'

data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+ploidies[p]+'_'+envts[e]+'_'+str(o)+'.txt',
                     sep='\t',names=['genotype',predcol,obscol,errcol],skiprows=2,skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
# keep only the rows above the first blank line. The .copy() makes data1 an independent frame,
# so the column assignments below do not raise SettingWithCopyWarning.
first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
data1 = data1.loc[:first_blank-1,:].copy()

# Binary style for genotype
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

# Since we're just using the predicted s, remove the obs columns (and repeated genotypes)
data1 = data1.drop(columns=[obscol,errcol]).drop_duplicates('genotype').reset_index(drop=True)

# Create a column for each locus (coded -1/+1), and for the genotype with that locus removed
for l1 in np.arange(len(floci)):
    data1[floci[l1]] = data1.loc[:,'genotype'].str[l1].astype(int).replace(0,-1)
    data1['without_'+floci[l1]] = data1.loc[:,'genotype'].str[:l1] + data1.loc[:,'genotype'].str[l1+1:]

# Pair each genotype lacking the focal mutation (_x) with the otherwise identical genotype carrying it (_y)
paircols = ['genotype',predcol,'without_'+floci[l]]
tab0 = data1.loc[(data1[floci[l]] == -1),paircols].reset_index(drop=True)
tab1 = data1.loc[(data1[floci[l]] == 1),paircols].reset_index(drop=True)

templ = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

# 0/1 state of every locus, read from the genotype of the _x member of each pair
for i in np.arange(len(floci)):
    templ[floci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

# Regression over all pairs (no measurement error supplied: all error terms are 0)
my_x = templ[predcol+'_x']
my_y = templ[predcol+'_y']
myreg = yorkreg_nocorr(my_x,my_y,[0]*len(my_x),[0]*len(my_y),yorkn)

# One colour per combination of the three background loci in bgl
for g in np.arange(len(gs)):
    in_background = (templ[bgl[0]] == gs[g][0])&(templ[bgl[1]] == gs[g][1])&(templ[bgl[2]] == gs[g][2])
    my_x = templ.loc[in_background,predcol+'_x']
    my_y = templ.loc[in_background,predcol+'_y']

    ax[1][2].scatter(my_x,my_y,s=markersize,alpha=myalpha,color=colorsnow[g])

# Fitted line (grey; myreg[0] is used as slope, myreg[1] as intercept) and the 1:1 line (black)
ax[1][2].plot(mainx,mainx*myreg[0]+myreg[1],color=linecolor,lw=mylw,zorder=0)
ax[1][2].plot(mainx,mainx,color='k',lw=mylw,zorder=0)

ax[1][2].set_xlim(xs)
ax[1][2].set_ylim(ys)

# Panels for 1..5 added terms: nref[n] is the (row, column) of the panel showing 'num term add' == n+1
nref = [[0,1],[0,2],[0,3],[1,0],[1,1]]
for n in np.arange(5):
    panel = ax[nref[n][0]][nref[n][1]]
    this_step = (locusadder_full['num term add'] == n+1)

    # one colour per combination of the three background loci in bgl
    for g in np.arange(len(gs)):
        in_background = (this_step
                         & (locusadder_full[bgl[0]] == gs[g][0])
                         & (locusadder_full[bgl[1]] == gs[g][1])
                         & (locusadder_full[bgl[2]] == gs[g][2]))
        my_x = locusadder_full.loc[in_background,'new_pred_x']
        my_y = locusadder_full.loc[in_background,'new_pred_y']

        panel.scatter(my_x, my_y, s=markersize, alpha=myalpha, color=colorsnow[g])

    # stored regression line for this step (grey) and the 1:1 line (black)
    step_fit = locusadder.loc[(locusadder['num term add'] == n+1)]
    thisslope = step_fit['slope'].values[0]
    thisintercept = step_fit['intercept'].values[0]

    panel.plot(mainx, mainx*thisslope+thisintercept, color=linecolor, lw=mylw, zorder=0)
    panel.plot(mainx, mainx, color='k', lw=mylw, zorder=0)

    panel.set_xlim(xs)
    panel.set_ylim(ys)
    
# Add a panel with the observed data (panel ax[1][3]), then finish and export the figure.
# NOTE(review): `o`, `ploidies`, `envts`, `floci` and `yorkreg_nocorr` are not defined in this cell;
# they must already exist in the kernel -- confirm `o` before re-running.
scol = ploidies[p]+'_'+envts[e]+'_s-obs'
ecol = ploidies[p]+'_'+envts[e]+'_s-obs-err'

data1 = pd.read_csv('CRISPR_10xmer_BFA_data/7_var_partition/lasso_v2_fa_'+ploidies[p]+'_'+envts[e]+'_'+str(o)+'.txt',
                     sep='\t',names=['genotype','todel',scol,ecol],skiprows=2,skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
# keep only the rows above the first blank line. The .copy() avoids SettingWithCopyWarning below.
first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
data1 = data1.loc[:first_blank-1,:].copy()

# Binary style for genotype
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

# Since we're just using the observed s, remove the pred columns
data1 = data1.drop(columns=['todel']).reset_index(drop=True)

# Average genotypes. Reason for this is to get rid of weird artifacts from having same genotype
# represented multiple times.
#   s          = mean of the replicate s values
#   stderr(s)  = sqrt(mean of the squared replicate standard errors)
# The among-replicate variance is deliberately NOT added: the previous code computed it and then
# overwrote it with 0, so it never contributed. For a genotype that appears once, the formula
# returns its own standard error unchanged.
# Genotypes keep their order of first appearance (sort=False).
pooled = pd.DataFrame({'genotype': data1['genotype'],
                       's': data1[scol].astype(float),
                       'errsq': data1[ecol].astype(float)**2})
data2 = pooled.groupby('genotype',sort=False).mean().reset_index()
data2['stderr(s)'] = np.sqrt(data2.pop('errsq'))

data1 = data2

# Create a column for each locus, and for the genotype with that locus removed
for l1 in np.arange(len(floci)):
    data1[floci[l1]] = data1.loc[:,'genotype'].str[l1]
for l1 in np.arange(len(floci)):
    data1['without_'+floci[l1]] = data1.loc[:,'genotype'].str[:l1] + data1.loc[:,'genotype'].str[l1+1:]

# Styling for the observed-data panel (note: myalpha is changed from here on)
mysize = 1
myalpha = 0.7
elw = 0.5
ealpha = 0.3

# Pair each genotype lacking the focal mutation (_x) with the otherwise identical genotype carrying it (_y)
paircols = ['genotype','s','stderr(s)','without_'+floci[l]]
tab0 = data1.loc[(data1[floci[l]] == '0'),paircols].reset_index(drop=True)
tab1 = data1.loc[(data1[floci[l]] == '1'),paircols].reset_index(drop=True)

temp = pd.merge(tab0,tab1,how='inner',on='without_'+floci[l])

# 0/1 state of every locus, read from the genotype of the _x member of each pair
for i in np.arange(len(floci)):
    temp[floci[i]] = temp.loc[:,'genotype_x'].str[i].astype(int)

# One colour per combination of the three background loci in bgl
for g in np.arange(len(gs)):
    in_background = (temp[bgl[0]] == gs[g][0])&(temp[bgl[1]] == gs[g][1])&(temp[bgl[2]] == gs[g][2])
    my_x = temp.loc[in_background,'s_x']
    my_xerr = temp.loc[in_background,'stderr(s)_x']
    my_y = temp.loc[in_background,'s_y']
    my_yerr = temp.loc[in_background,'stderr(s)_y']

    markers0,caps0,bars0 = ax[1][3].errorbar(my_x,my_y,
                            xerr = my_xerr,
                            yerr = my_yerr,
                            alpha=myalpha,linestyle='None',marker='.',ms=mysize,color=colorsnow[g],elinewidth=elw)

    # error bars are drawn fainter than the markers
    for bar in bars0:
        bar.set_alpha(ealpha)

# Regression over all pairs, using the standard errors on both axes
myreg = yorkreg_nocorr(temp['s_x'],temp['s_y'],temp['stderr(s)_x'],temp['stderr(s)_y'],yorkn)

# Fitted line (grey; myreg[0] is used as slope, myreg[1] as intercept) and the 1:1 line (black)
ax[1][3].plot(mainx,mainx*myreg[0]+myreg[1],color=linecolor,lw=mylw,zorder=0)
ax[1][3].plot(mainx,mainx,color='k',lw=mylw,zorder=0)

ax[1][3].set_xlim(xs)
ax[1][3].set_ylim(ys)

# Same two ticks on every panel of the 2 x 4 grid
for i in np.arange(4):
    for j in np.arange(2):
        ax[j][i].set_xticks([-0.4,0])
        ax[j][i].set_yticks([-0.4,0])

# Shared axis titles, placed in figure coordinates
fig.text(-0.02,0.25,'Fitness, PMA1 234C',fontsize=7,rotation='vertical')
fig.text(0.37,-0.05,'Fitness, PMA1 234S',fontsize=7)

# Save before showing so the export does not depend on what the backend does to the figure on show()
fig.savefig('msfigs/Fig3/PMA14NQOhapadd_wObs_magical_2rows.pdf',bbox_inches='tight',dpi=1000)

plt.show()

## Subsets analysis (for revision)


In [None]:
# First thing: Analyze the Chou data and the Khan data from 2011
# The first step will be to just plot the wt vs mut plots for all 9 mutations, run the regressions, and plot
# these things.
# This part handles the Chou loci; the Khan loci follow further down in the same cell.
# NOTE(review): `yorkreg_nocorr` is defined elsewhere in the notebook.

chouloci = ['fghA','pntAB','gshA','GB']
khanloci = ['rbs','topA','spoT','pykF','glmUS']

yorkn = 100

data1 = pd.read_csv('20211123_subsets-output/Chou_coeffs.txt',sep='\t',names=['genotype','s_pred','s','stderr(s)'],
                    skiprows=2,skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
# keep only the rows above the first blank line. The .copy() avoids SettingWithCopyWarning below.
first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
data1 = data1.loc[:first_blank-1,:].copy()

# Binary style for genotype
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(len(chouloci))

# Since we're just using the obs s, remove the pred columns
data2 = data1.drop(columns=['s_pred'])

# Create a column for each locus, and for the genotype with that locus removed
for l in np.arange(len(chouloci)):
    data2[chouloci[l]] = data2.loc[:,'genotype'].str[l]
for l in np.arange(len(chouloci)):
    data2['without_'+chouloci[l]] = data2.loc[:,'genotype'].str[:l] + data2.loc[:,'genotype'].str[l+1:]

# get ready to plot all 4 mutations: `axes` holds the mut-vs-wt panels, `axesd` the delta-s panels
fig, axes = plt.subplots(nrows=1, ncols=len(chouloci),figsize=(8,1.8))
figd, axesd = plt.subplots(nrows=1, ncols=len(chouloci),figsize=(8,1.8))

for l in np.arange(len(chouloci)):
    # Pair each genotype lacking mutation l (_x) with the otherwise identical genotype carrying it (_y)
    paircols = ['genotype','s','stderr(s)','without_'+chouloci[l]]
    tab0 = data2.loc[(data2[chouloci[l]] == '0'),paircols].reset_index(drop=True)
    tab1 = data2.loc[(data2[chouloci[l]] == '1'),paircols].reset_index(drop=True)

    temp = pd.merge(tab0,tab1,how='inner',on='without_'+chouloci[l])

    # plot the data
    markers,caps,bars = axes[l].errorbar(temp['s_x'],temp['s_y'],
                                          xerr = temp['stderr(s)_x'],yerr = temp['stderr(s)_y'],alpha=0.7,
                                          linestyle='None',elinewidth=0.5,marker='.',ms=5)

    for bar in bars:
        bar.set_alpha(0.5)

    # use one common range on both axes so that x = y is the panel diagonal
    xs = axes[l].get_xlim()
    ys = axes[l].get_ylim()

    lower = min(xs[0],ys[0])
    upper = max(xs[1],ys[1])

    # plot x = y
    xlist = np.linspace(lower,upper)
    axes[l].plot(xlist,xlist,c='k',lw=1,zorder=0)

    # get the regression line and plot it (res[0] is used as slope, res[1] as intercept)
    res = yorkreg_nocorr(temp['s_x'],temp['s_y'],temp['stderr(s)_x'],temp['stderr(s)_y'],yorkn)
    axes[l].plot(xlist,xlist*res[0]+res[1],color='xkcd:blue',alpha=1,zorder=0,lw=0.7)

    axes[l].set_xlim(lower,upper)
    axes[l].set_ylim(lower,upper)

    axes[l].text(upper*0.97,lower*1.15,'b = '+str(round(res[0],3)),ha='right')
    axes[l].text(upper*0.97,lower*1.05,chouloci[l],ha='right')

    # do ∆s plots too
    temp['s_diff'] = temp['s_y'] - temp['s_x']
    # ∆s is a difference of two measurements, so its error combines both standard errors in quadrature
    # (the y error bars previously showed stderr(s)_y alone; the Khan panels already used this formula)
    temp['s_diff-err'] = np.sqrt(temp['stderr(s)_x']**2+temp['stderr(s)_y']**2)

    # plot the data
    markers,caps,bars = axesd[l].errorbar(temp['s_x'],temp['s_diff'],
                                          xerr = temp['stderr(s)_x'],yerr = temp['s_diff-err'],alpha=0.7,
                                          linestyle='None',elinewidth=0.5,marker='.',ms=5)

    for bar in bars:
        bar.set_alpha(0.5)

    axesd[l].axhline(y=0,zorder=0,color='k',lw=0.5)

    # regression line is drawn across the current x range of the ∆s panel
    xs = axesd[l].get_xlim()
    ys = axesd[l].get_ylim()

    # get the regression line and plot it (ordinary least squares, errors ignored)
    res = linregress(temp['s_x'],temp['s_diff'])
    axesd[l].plot(np.linspace(xs[0],xs[1]),np.linspace(xs[0],xs[1])*res.slope+res.intercept,color='xkcd:blue',alpha=1,zorder=0,lw=0.7)


plt.show()

# Same plots for the Khan loci. `khanloci` is defined at the top of this cell.
# NOTE(review): `yorkreg_nocorr` is defined elsewhere in the notebook.
print('************************')
yorkn = 100

data1 = pd.read_csv('20211123_subsets-output/khan_coeffs.txt',sep='\t',names=['genotype','s_pred','s','stderr(s)'],
                    skiprows=2,skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
# keep only the rows above the first blank line. The .copy() avoids SettingWithCopyWarning below.
first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
data1 = data1.loc[:first_blank-1,:].copy()

# Binary style for genotype
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(len(khanloci))

# Since we're just using the obs s, remove the pred columns
data2 = data1.drop(columns=['s_pred'])

# Create a column for each locus, and for the genotype with that locus removed
for l in np.arange(len(khanloci)):
    data2[khanloci[l]] = data2.loc[:,'genotype'].str[l]
for l in np.arange(len(khanloci)):
    data2['without_'+khanloci[l]] = data2.loc[:,'genotype'].str[:l] + data2.loc[:,'genotype'].str[l+1:]

# get ready to plot all mutations: `axes` holds the mut-vs-wt panels, `axesd` the delta-s panels
fig, axes = plt.subplots(nrows=1, ncols=len(khanloci),figsize=(10,1.8))
figd, axesd = plt.subplots(nrows=1, ncols=len(khanloci),figsize=(10,1.8))

for l in np.arange(len(khanloci)):
    # Pair each genotype lacking mutation l (_x) with the otherwise identical genotype carrying it (_y)
    paircols = ['genotype','s','stderr(s)','without_'+khanloci[l]]
    tab0 = data2.loc[(data2[khanloci[l]] == '0'),paircols].reset_index(drop=True)
    tab1 = data2.loc[(data2[khanloci[l]] == '1'),paircols].reset_index(drop=True)

    temp = pd.merge(tab0,tab1,how='inner',on='without_'+khanloci[l])

    # plot the data
    markers,caps,bars = axes[l].errorbar(temp['s_x'],temp['s_y'],
                                          xerr = temp['stderr(s)_x'],yerr = temp['stderr(s)_y'],alpha=0.7,
                                          linestyle='None',elinewidth=0.5,marker='.',ms=5)

    for bar in bars:
        bar.set_alpha(0.5)

    # use one common range on both axes so that x = y is the panel diagonal
    xs = axes[l].get_xlim()
    ys = axes[l].get_ylim()

    lower = min(xs[0],ys[0])
    upper = max(xs[1],ys[1])

    # plot x = y
    xlist = np.linspace(lower,upper)
    axes[l].plot(xlist,xlist,c='k',lw=1,zorder=0)

    # get the regression line and plot it (res[0] is used as slope, res[1] as intercept)
    res = yorkreg_nocorr(temp['s_x'],temp['s_y'],temp['stderr(s)_x'],temp['stderr(s)_y'],yorkn)
    axes[l].plot(xlist,xlist*res[0]+res[1],color='xkcd:blue',alpha=1,zorder=0,lw=0.7)

    axes[l].set_xlim(lower,upper)
    axes[l].set_ylim(lower,upper)

    axes[l].text(upper*0.98,lower*1.1,'b = '+str(round(res[0],3)),ha='right')
    axes[l].text(upper*0.98,lower*1.01,khanloci[l],ha='right')

    # do ∆s plots too; the error of the difference combines both standard errors in quadrature
    temp['s_diff'] = temp['s_y'] - temp['s_x']
    temp['s_diff-err'] = np.sqrt(temp['stderr(s)_x']**2+temp['stderr(s)_y']**2)

    # plot the data
    markers,caps,bars = axesd[l].errorbar(temp['s_x'],temp['s_diff'],
                                          xerr = temp['stderr(s)_x'],yerr = temp['s_diff-err'],alpha=0.7,
                                          linestyle='None',elinewidth=0.5,marker='.',ms=5)

    for bar in bars:
        bar.set_alpha(0.5)

    axesd[l].axhline(y=0,zorder=0,color='k',lw=0.5)

    # regression line is drawn across the current x range of the ∆s panel
    xs = axesd[l].get_xlim()
    ys = axesd[l].get_ylim()

    # get the regression line and plot it (ordinary least squares, errors ignored)
    res = linregress(temp['s_x'],temp['s_diff'])
    axesd[l].plot(np.linspace(xs[0],xs[1]),np.linspace(xs[0],xs[1])*res.slope+res.intercept,color='xkcd:blue',alpha=1,zorder=0,lw=0.7)


plt.show()

In [None]:
# Proceed with Khan data.
# First, without getting into coefficients, how does it look if we color the clouds?
# Grid layout: column l is the focal mutation (x: without it, y: with it); row l2 is a second locus
# whose state colours the points (cerulean: l2 == '0', orange: l2 == '1'). Diagonal panels stay empty.
# NOTE(review): `khanloci` and `yorkreg_nocorr` are defined in other cells.

yorkn = 100

data1 = pd.read_csv('20211123_subsets-output/khan_coeffs.txt',sep='\t',names=['genotype','s_pred','s','stderr(s)'],
                    skiprows=2,skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
# keep only the rows above the first blank line. The .copy() avoids SettingWithCopyWarning below.
first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
data1 = data1.loc[:first_blank-1,:].copy()

# Binary style for genotype
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(len(khanloci))

# Since we're just using the obs s, remove the pred columns
data2 = data1.drop(columns=['s_pred'])

# Create a column for each locus, and for the genotype with that locus removed
for l in np.arange(len(khanloci)):
    data2[khanloci[l]] = data2.loc[:,'genotype'].str[l]
for l in np.arange(len(khanloci)):
    data2['without_'+khanloci[l]] = data2.loc[:,'genotype'].str[:l] + data2.loc[:,'genotype'].str[l+1:]

# plot all mutations w/ 2 color
fig, axes = plt.subplots(nrows=len(khanloci), ncols=len(khanloci),figsize=(8,8))

for l in np.arange(len(khanloci)):
    # All pairs for focal mutation l, regardless of the second locus
    paircols = ['genotype','s','stderr(s)','without_'+khanloci[l]]
    tab0 = data2.loc[(data2[khanloci[l]] == '0'),paircols].reset_index(drop=True)
    tab1 = data2.loc[(data2[khanloci[l]] == '1'),paircols].reset_index(drop=True)

    temp = pd.merge(tab0,tab1,how='inner',on='without_'+khanloci[l])

    # The pooled fit depends only on l, so compute it once per column instead of once per panel
    res_all = yorkreg_nocorr(temp['s_x'],temp['s_y'],temp['stderr(s)_x'],temp['stderr(s)_y'],yorkn)

    for l2 in np.arange(len(khanloci)):
        # Label the first row / first column. This is done before skipping the diagonal so that
        # the first locus also gets its column and row label (its panel in row 0 / column 0 is the diagonal).
        if l2 == 0:
            axes[l2][l].set_xlabel(khanloci[l])

        if l == 0:
            axes[l2][l].set_ylabel(khanloci[l2])

        if l2 == l:
            continue

        # Split the genotypes by the state of the focal locus l and of the colouring locus l2
        tab00 = data2.loc[(data2[khanloci[l]] == '0')&(data2[khanloci[l2]] == '0')].copy(deep=True).reset_index(drop=True)
        tab01 = data2.loc[(data2[khanloci[l]] == '0')&(data2[khanloci[l2]] == '1')].copy(deep=True).reset_index(drop=True)
        tab10 = data2.loc[(data2[khanloci[l]] == '1')&(data2[khanloci[l2]] == '0')].copy(deep=True).reset_index(drop=True)
        tab11 = data2.loc[(data2[khanloci[l]] == '1')&(data2[khanloci[l2]] == '1')].copy(deep=True).reset_index(drop=True)

        # temp0: pairs with l2 in state '0'; temp1: pairs with l2 in state '1'
        temp0 = pd.merge(tab00,tab10,how='inner',on='without_'+khanloci[l])
        temp1 = pd.merge(tab01,tab11,how='inner',on='without_'+khanloci[l])

        # plot the data
        markers,caps,bars = axes[l2][l].errorbar(temp0['s_x'],temp0['s_y'],
                                              xerr = temp0['stderr(s)_x'],yerr = temp0['stderr(s)_y'],alpha=0.7,
                                              linestyle='None',elinewidth=0.5,marker='.',ms=5,color='xkcd:cerulean')

        for bar in bars:
            bar.set_alpha(0.5)

        markers,caps,bars = axes[l2][l].errorbar(temp1['s_x'],temp1['s_y'],
                                              xerr = temp1['stderr(s)_x'],yerr = temp1['stderr(s)_y'],alpha=0.7,
                                              linestyle='None',elinewidth=0.5,marker='.',ms=5,color='xkcd:orange')

        for bar in bars:
            bar.set_alpha(0.5)

        # use one common range on both axes so that x = y is the panel diagonal
        xs = axes[l2][l].get_xlim()
        ys = axes[l2][l].get_ylim()

        lower = min(xs[0],ys[0])
        upper = max(xs[1],ys[1])

        # plot x = y
        xlist = np.linspace(lower,upper)
        axes[l2][l].plot(xlist,xlist,c='k',lw=1,zorder=0)

        # regression lines: all pairs (goldenrod), l2 == '0' pairs (cerulean), l2 == '1' pairs (orange)
        axes[l2][l].plot(xlist,xlist*res_all[0]+res_all[1],color='xkcd:goldenrod',alpha=1,zorder=0,lw=0.7)

        res = yorkreg_nocorr(temp0['s_x'],temp0['s_y'],temp0['stderr(s)_x'],temp0['stderr(s)_y'],yorkn)
        axes[l2][l].plot(xlist,xlist*res[0]+res[1],color='xkcd:cerulean',alpha=1,zorder=0,lw=0.7)

        res = yorkreg_nocorr(temp1['s_x'],temp1['s_y'],temp1['stderr(s)_x'],temp1['stderr(s)_y'],yorkn)
        axes[l2][l].plot(xlist,xlist*res[0]+res[1],color='xkcd:orange',alpha=1,zorder=0,lw=0.7)

        axes[l2][l].set_xlim(lower,upper)
        axes[l2][l].set_ylim(lower,upper)


plt.show()



In [None]:
# Having seen that we get some clustering, let's do a proper "remover" analysis on these bad boys from Khan et al 2011
#
# For every focal locus: epistatic coefficients that involve the locus are set to zero one at a time,
# largest absolute value first. After each removal the fitness of every genotype is re-predicted from the
# remaining coefficients, the original observed-minus-predicted residual is added back, and the mut-vs-wt
# regression is redone. One row per removal step is stored in `locusremoverkhan`; the per-locus tables
# are stacked into `megaremoverkhan` and written to csv.
# NOTE(review): `khanloci`, `yorkreg_nocorr`, `york_slope1` and `york_slopeanyb` are defined in other cells.


def _predict_from_terms(terms, geno_sign, loci, intercept):
    # Predicted fitness of every genotype from the non-zero rows of a coefficient table.
    #   terms     : DataFrame with one 0/1 column per locus (1 = locus is part of the term),
    #               plus 'term' (coefficient value) and 'baseline' columns
    #   geno_sign : array of shape (n_genotypes, n_loci), the genotypes coded -1/+1
    #   loci      : list of the locus column names, in the column order of geno_sign
    #   intercept : value added to every prediction
    # For each genotype and term, the -1/+1 codes of the loci in the term are multiplied together
    # (loci outside the term, where the product with the 0 mask is 0, are skipped), the term's
    # baseline is subtracted, and the result is multiplied by the coefficient. Contributions are
    # summed per genotype, ignoring NaN; with no non-zero terms left the prediction is just the intercept.
    active = terms.loc[terms['term'] != 0]
    term_mask = active[loci].values
    vals = term_mask[np.newaxis,:,:]*geno_sign[:,np.newaxis,:]
    sign_prod = np.where(vals != 0, vals, 1).prod(axis=2)
    contributions = (sign_prod - active['baseline'].values)*active['term'].values
    return np.nansum(contributions,axis=1) + intercept


#time the code
mystart = time.perf_counter()

megaremoverkhan = pd.DataFrame()
ncyc = 30
ncycthresh = 10
#pthresh = 0.01
yorkn = 100
#diffthresh = 0.01
ethresh = 0.50

# First, import a list of genotypes to predict
data1 = pd.read_csv('20211123_subsets-output/khan_coeffs.txt',sep='\t',names=['genotype','s_pred','s','stderr(s)'],
                    skiprows=2,skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
# keep only the rows above the first blank line. The .copy() avoids SettingWithCopyWarning below.
first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
data1 = data1.loc[:first_blank-1,:].copy()

# Binary style for genotype
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(len(khanloci))

# Create a column for each locus, coded -1 (digit 0) / +1 (digit 1)
for l in np.arange(len(khanloci)):
    data1[khanloci[l]] = data1.loc[:,'genotype'].str[l].astype(int)
    data1.loc[(data1[khanloci[l]] == 0),khanloci[l]] = -1

# Calculate the difference between observed and predicted, such that s_obs = s_pred + diff
data1['opdiff'] = data1['s'] - data1['s_pred']

# Now import a set of coefficients: the rows BELOW the first blank line of the same file
data0 = pd.read_csv('20211123_subsets-output/khan_coeffs.txt',sep='\t',names=['todelete','genotype','term','na'],
                    skiprows=2,skip_blank_lines=False)

first_blank = data0.loc[(data0['genotype'].isnull())].index.tolist()[0]
data0 = data0.loc[first_blank+1:,:].copy()

data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(len(khanloci))

data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

# 0/1 membership of each locus in each term; and, in data1, the genotype with each locus removed
for l in np.arange(len(khanloci)):
    data0[khanloci[l]] = data0.loc[:,'genotype'].str[l].astype(int)
    data1['without_'+khanloci[l]] = data1.loc[:,'genotype'].str[:l] + data1.loc[:,'genotype'].str[l+1:]

data0['numMut'] = data0[khanloci].sum(axis=1)

# Add a "baseline" column for whether these terms are added or subtracted in the WT:
# -1 for terms with an odd number of loci, +1 for an even number
data0['baseline'] = np.where(data0['numMut'] % 2 == 1,-1.0,1.0)

# Quantities that do not change during the removal cycles
geno_sign = data1[khanloci].values
wt_genotype = '0'*len(khanloci)
myintercept = data1.loc[(data1['genotype'] == wt_genotype),'s_pred'].values[0]

# Per-locus result tables, stacked once at the end
perlocus_tables = []

for l in np.arange(len(khanloci)):

    # Want to start with "all-epistasis" observed fitnesses in data1
    # Get b_obs, TLS_b_obs, TLS_1 (and intercept values)
    tab0 = data1.loc[(data1[khanloci[l]] == -1)].copy(deep=True).reset_index(drop=True)
    tab0 = tab0[['genotype','s','stderr(s)','without_'+khanloci[l]]]
    tab1 = data1.loc[(data1[khanloci[l]] == 1)].copy(deep=True).reset_index(drop=True)
    tab1 = tab1[['genotype','s','stderr(s)','without_'+khanloci[l]]]

    templ = pd.merge(tab0,tab1,how='inner',on='without_'+khanloci[l])

    for i in np.arange(len(khanloci)):
        templ[khanloci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

    # York regression to get sum of total least squares deviations, S
    # (myreg[0], myreg[1], myreg[3] are stored below as b, a and S; myreg1[0], myreg1[1] as a and S)
    myreg = yorkreg_nocorr(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)
    myreg1 = york_slope1(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)

    # Create a subtable - we'll append these together into megaremoverkhan
    locusremoverkhan = pd.DataFrame()

    # Row 0: the starting point, before any coefficient has been removed
    locusremoverkhan.at[0,'num term add'] = 0
    locusremoverkhan.at[0,'term added'] = 'na'
    locusremoverkhan.at[0,'term order'] = 'na'
    locusremoverkhan.at[0,'coefficient'] = 'na'
    locusremoverkhan.at[0,'inferred_b'] = myreg[0]
    locusremoverkhan.at[0,'inferred_a'] = myreg[1]
    locusremoverkhan.at[0,'inferred_S'] = myreg[3]
    locusremoverkhan.at[0,'main_b'] = myreg[0]
    locusremoverkhan.at[0,'main_a'] = myreg[1]
    locusremoverkhan.at[0,'main_S'] = myreg[3]
    locusremoverkhan.at[0,'N'] = len(templ)
    locusremoverkhan.at[0,'1_a'] = myreg1[0]
    locusremoverkhan.at[0,'1_S'] = myreg1[1]

    # OPTION TO CONSIDER: ADD IN ∆S FORMULATIONS, DIVERGENCE FROM SLOPE OF 0

    # Begin cycles
    # data0adder is the list of candidate coefficients: terms with more than one locus that include
    # locus l, sorted by decreasing absolute value
    data0adder = data0.copy(deep=True).loc[(data0['numMut'] > 1)&(data0[khanloci[l]] == 1)]
    data0adder['abs coefficient'] = abs(data0adder['term'])
    data0adder = data0adder.sort_values(by='abs coefficient',ascending=False)

    # create a builder dataframe on which changes will be processed
    builder = data0.copy(deep=True)

    # Never run more cycles than there are candidate coefficients; otherwise data0adder.index[n]
    # raises IndexError when no zero coefficient ends the loop first
    for n in np.arange(min(ncyc,len(data0adder))):
        # find the top index (i.e., the strongest remaining coefficient)
        topind = data0adder.index[n]

        # remove the strongest epistatic coefficient that involves locus l
        builder.at[topind,'term'] = 0

        # record which coefficient was removed at this step
        locusremoverkhan.at[n+1,'num term add'] = n+1
        locusremoverkhan.at[n+1,'term added'] = data0adder.loc[topind,'genotype']
        locusremoverkhan.at[n+1,'term order'] = data0adder.loc[topind,'numMut']
        locusremoverkhan.at[n+1,'coefficient'] = data0adder.loc[topind,'term']

        # Estimate the new predicted value for each genotype, then add the original residual back
        data1['new_pred'] = _predict_from_terms(builder,geno_sign,khanloci,myintercept)
        data1['new_obs'] = data1['new_pred'] + data1['opdiff']

        # For the new predictions, get the values I want
        tab0 = data1.loc[(data1[khanloci[l]] == -1)].copy(deep=True).reset_index(drop=True)
        tab0 = tab0[['genotype','new_obs','stderr(s)','without_'+khanloci[l]]]
        tab1 = data1.loc[(data1[khanloci[l]] == 1)].copy(deep=True).reset_index(drop=True)
        tab1 = tab1[['genotype','new_obs','stderr(s)','without_'+khanloci[l]]]

        templ = pd.merge(tab0,tab1,how='inner',on='without_'+khanloci[l])

        for i in np.arange(len(khanloci)):
            templ[khanloci[i]] = templ.loc[:,'genotype_x'].str[i].astype(int)

        my_x = templ['new_obs_x']
        my_y = templ['new_obs_y']
        my_x_err = templ['stderr(s)_x']
        my_y_err = templ['stderr(s)_y']

        # free fit to the adjusted data
        myregnew = yorkreg_nocorr(my_x,my_y,my_x_err,my_y_err,yorkn)
        locusremoverkhan.at[n+1,'inferred_b'] = myregnew[0]
        locusremoverkhan.at[n+1,'inferred_a'] = myregnew[1]
        locusremoverkhan.at[n+1,'inferred_S'] = myregnew[3]

        # fit with the slope held at the step-0 value myreg[0]
        myreg_main = york_slopeanyb(my_x,my_y,my_x_err,my_y_err,yorkn,myreg[0])
        locusremoverkhan.at[n+1,'main_b'] = myreg[0]
        locusremoverkhan.at[n+1,'main_a'] = myreg_main[0]
        locusremoverkhan.at[n+1,'main_S'] = myreg_main[1]

        # fit from york_slope1 (NOTE(review): presumably slope fixed at 1 -- confirm against its definition)
        myreg_1 = york_slope1(my_x,my_y,my_x_err,my_y_err,yorkn)
        locusremoverkhan.at[n+1,'1_a'] = myreg_1[0]
        locusremoverkhan.at[n+1,'1_S'] = myreg_1[1]

        locusremoverkhan.at[n+1,'N'] = len(templ)

        # progress: fraction of the total |coefficient| of the candidates removed so far
        print(abs(locusremoverkhan.loc[1:,'coefficient']).sum()/data0adder['abs coefficient'].sum())

        #if n >= ncycthresh:
        #    if (abs(locusremoverkhan.loc[1:,'coefficient']).sum()/data0adder['abs coefficient'].sum()) > ethresh:
        #        break
        # candidates are sorted by |coefficient|, so once a zero is reached nothing is left to remove
        if data0adder.loc[topind,'term'] == 0:
            break

    # Binary style for the removed terms (row 0 holds 'na' and is left alone).
    # .loc is used because .at only addresses a single cell, not a slice.
    locusremoverkhan.loc[1:,'term added'] = locusremoverkhan.loc[1:,'term added'].astype(int).astype(str).str.zfill(len(khanloci))
    locusremoverkhan.insert(0,'main locus', khanloci[l])
    #locusremoverkhan.insert(0,'envt',envts[e])
    #locusremoverkhan.insert(0,'ploidy',ploidies[p])

    perlocus_tables.append(locusremoverkhan)
    print(khanloci[l])

# Stack the per-locus tables once (DataFrame.append is not available in current pandas)
if perlocus_tables:
    megaremoverkhan = pd.concat(perlocus_tables)

mystop = time.perf_counter()
elapsed = mystop-mystart
print(str(elapsed)+' seconds to run')

export_csv = megaremoverkhan.to_csv(r'20210712_megaremoverkhan.csv',index=True,header=True)


In [None]:
## Analyze the megaremoverv2
# Reads the table written by the remover cell and plots, per focal locus, how the ratios of the
# stored S values change with the number of removed terms.
# NOTE(review): `khanloci` is defined in another cell.
mr2 = pd.read_csv('20210712_megaremoverkhan.csv')
mr2 = mr2.drop(columns=['Unnamed: 0'])

# Create a main_b_-1to1 column and inferred_b_-1to1 column
# This will help in partitioning the data to look just at those that have a FCT by our criteria
# Values <= 1 are kept as they are; values > 1 are replaced by their reciprocal.
# NOTE(review): slopes below -1 pass the `<= 1` test and are therefore NOT folded into [-1, 1],
# despite the column name -- confirm whether abs() was intended.
for bcol in ['main_b','inferred_b']:
    mr2[bcol+'_-1to1'] = mr2[bcol].where(mr2[bcol] <= 1, 1/mr2[bcol])

# Take ratios of the S values
mr2['main/inferred'] = mr2['main_S']/mr2['inferred_S']
mr2['1/inferred'] = mr2['1_S']/mr2['inferred_S']
mr2['1/main'] = mr2['1_S']/mr2['main_S']

# Plot ratios: one figure per ratio, one line per focal locus, grey reference line at ratio = 1
for ratio in ['1/main','1/inferred','main/inferred']:
    for l in np.arange(len(khanloci)):
        mr2sub = mr2.loc[(mr2['main locus'] == khanloci[l])].copy(deep=True).reset_index(drop=True)
        plt.plot(mr2sub['num term add'],mr2sub[ratio])
    plt.axhline(y=1,color='xkcd:grey',zorder=0)
    # the locus lines were drawn first, in khanloci order, so the legend labels line up with them
    plt.legend(khanloci)
    plt.xlabel('num term removed')
    plt.ylabel(ratio)
    plt.show()



In [None]:
# do adding analysis, this time with just pykF since it seems to have the strangest behavior
megaadder = pd.DataFrame()   # collects one locusadder table per focal locus
o = 10                       # not referenced anywhere in this cell
ncyc = 30                    # upper bound on the number of add-a-term cycles per locus
ncycthresh = 2               # first cycle index at which the slope-plateau check is made
#pthresh = 0.01
yorkn = 100                  # last argument handed to yorkreg_nocorr (presumably an iteration count -- TODO confirm)
diffthresh = 0.01            # a slope counts as settled when it moves by no more than this


# Top section of the coefficients file: one row per genotype.
data1 = pd.read_csv('20211123_subsets-output/khan_coeffs.txt', sep='\t',
                    names=['genotype', 's_pred', 's', 'stderr(s)'],
                    skiprows=2, skip_blank_lines=False)

# Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
# keep only the rows above the first row whose genotype is missing.
# .copy() makes the result an independent frame, so the column assignments below
# do not act on a slice of the raw table (SettingWithCopyWarning).
first_blank = data1.index[data1['genotype'].isnull()][0]
data1 = data1.loc[:first_blank - 1, :].copy()

# Binary style for genotype: one character per locus, left-padded with zeros
data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(len(khanloci))

# Create a column for each locus holding its state in the genotype,
# recoded so that 0 becomes -1 and every other digit is left as it is.
for position, locus_name in enumerate(khanloci):
    data1[locus_name] = data1['genotype'].str[position].astype(int).replace(0, -1)

# Calculate the difference between observed and predicted, such that s_obs = s_pred + diff
data1['opdiff'] = data1['s'] - data1['s_pred']

# Now import a set of coefficients: same file, read with the column layout of its
# bottom section (the rows after the first row whose second column is missing).
data0 = pd.read_csv('20211123_subsets-output/khan_coeffs.txt', sep='\t',
                    names=['todelete', 'genotype', 'term', 'na'],
                    skiprows=2, skip_blank_lines=False)

# .copy() so that the genotype assignment below does not write into a slice
# of the raw table (SettingWithCopyWarning).
first_blank = data0.index[data0['genotype'].isnull()][0]
data0 = data0.loc[first_blank + 1:, :].copy()

data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(len(khanloci))

data0 = data0.drop(columns=['todelete', 'na']).reset_index(drop=True)

for position, locus_name in enumerate(khanloci):
    # 0/1 flag: does this coefficient involve the locus?
    data0[locus_name] = data0['genotype'].str[position].astype(int)
    # genotype string with this locus' character cut out; genotypes that differ
    # only at this locus share the same value
    data1['without_' + locus_name] = (data1['genotype'].str[:position]
                                      + data1['genotype'].str[position + 1:])

# Number of loci involved in each coefficient
data0['numMut'] = data0[khanloci].sum(axis=1)

# Add a "baseline" column for whether these terms are added or subtracted in the WT:
# -1 for terms involving an odd number of loci, +1 otherwise (floats, as before).
data0['baseline'] = np.where(data0['numMut'] % 2 == 1, -1.0, 1.0)

def _pair_across_locus(table, locus, value_col):
    """Pair up genotypes that differ only at `locus`.

    Rows of `table` with `table[locus] == -1` are inner-merged with rows where it is 1,
    using the 'without_<locus>' column as the key. In the result the pandas suffix `_x`
    marks the -1 member of each pair and `_y` the +1 member.
    """
    key = 'without_' + locus
    keep = ['genotype', value_col, key]
    absent = table.loc[table[locus] == -1].reset_index(drop=True)[keep]
    present = table.loc[table[locus] == 1].reset_index(drop=True)[keep]
    return pd.merge(absent, present, how='inner', on=key)


def _predict_from_terms(terms, genotypes, loci, intercept):
    """Predict a value for every row of `genotypes` from the non-zero rows of `terms`.

    For each term the genotype's locus states (the `loci` columns of `genotypes`) are
    multiplied over the loci flagged in that term, the term's 'baseline' is subtracted,
    and the result is scaled by the term's coefficient ('term'). The contributions are
    summed and `intercept` is added. Returns a list with one prediction per genotype row,
    in row order.

    The term table is filtered once, instead of being deep-copied and filtered again
    for every genotype.
    """
    loci = list(loci)
    active = terms.loc[terms['term'] != 0].reset_index(drop=True)
    term_masks = active[loci].to_numpy()
    coeffs = active['term'].to_numpy(dtype=float)
    baselines = active['baseline'].to_numpy(dtype=float)

    predictions = []
    for states in genotypes[loci].to_numpy():
        raw = term_masks * states
        # zeros mark loci that are not part of the term; they must not enter the product
        products = np.where(raw != 0, raw, 1).prod(axis=1)
        # summed through a pandas Series so the total matches the former column .sum()
        predictions.append(pd.Series((products - baselines) * coeffs).sum() + intercept)
    return predictions


def _record_york_fits(log, row, xs, ys):
    """Run the three zero-error York regressions on (xs, ys) and store them in `log` at `row`.

    'abc-original'     : ys against xs
    'deltas-original'  : (ys - xs) against xs
    'deltas-reversion' : (xs - ys) against ys
    Element 0 of each yorkreg_nocorr result is stored as the slope and element 1 as the intercept.
    """
    fits = (('abc-original', xs, ys),
            ('deltas-original', xs, ys - xs),
            ('deltas-reversion', ys, xs - ys))
    for label, fit_x, fit_y in fits:
        fit = yorkreg_nocorr(fit_x, fit_y, [0] * len(xs), [0] * len(ys), yorkn)
        log.at[row, label + '_slope'] = fit[0]
        log.at[row, label + '_intercept'] = fit[1]


# Only the locus at position 3 is analysed for now; the loop over all loci is kept for reference.
# The index is not called `l` because that name is the `numpy.linalg` alias imported at the top.
for focal_idx in np.arange(3, 4):
#for focal_idx in np.arange(len(khanloci)):
    focal = khanloci[focal_idx]

    # plot to start out, as sanity check: York slope of the full predictions across the focal locus
    templ = _pair_across_locus(data1, focal, 's_pred')
    my_x = templ['s_pred_x']
    my_y = templ['s_pred_y']

    myreg = yorkreg_nocorr(my_x, my_y, [0] * len(my_x), [0] * len(my_y), yorkn)
    print('york slope = ' + str(myreg[0]))

    # Want to start with "no-epistasis" predictions in data1
    # builder is where we build up coefficients for fitness predictions
    # Start by setting all epistatic terms INVOLVING THE FOCAL LOCUS to 0
    builder = data0.copy(deep=True)
    # .loc, not .at: the row selector is a boolean mask and .at only accepts scalar labels
    builder.loc[(builder['numMut'] > 1) & (builder[focal] == 1), 'term'] = 0

    # Intercept = predicted value of the all-zeros genotype. The key is derived from
    # the number of loci, which is also the width the genotype strings were padded to.
    wt_genotype = '0' * len(khanloci)
    myintercept = data1.loc[(data1['genotype'] == wt_genotype), 's_pred'].values[0]

    data1['noEp_pred'] = _predict_from_terms(builder, data1, khanloci, myintercept)

    locusadder = pd.DataFrame()

    templ = _pair_across_locus(data1, focal, 'noEp_pred')
    for i in np.arange(len(khanloci)):
        templ[khanloci[i]] = templ.loc[:, 'genotype_x'].str[i].astype(int)

    my_x = templ['noEp_pred_x']
    my_y = templ['noEp_pred_y']

    # Row 0 is the state before any epistatic term has been added
    locusadder.at[0, 'num term add'] = 0
    locusadder.at[0, 'term added'] = np.nan
    locusadder.at[0, 'term order'] = np.nan
    locusadder.at[0, 'coefficient'] = np.nan
    _record_york_fits(locusadder, 0, my_x, my_y)

    # Begin cycles
    # data0adder holds the epistatic terms involving the focal locus, strongest first.
    # .copy() after the selection gives an independent frame to add a column to.
    data0adder = data0.loc[(data0['numMut'] > 1) & (data0[focal] == 1)].copy()
    data0adder['abs coefficient'] = abs(data0adder['term'])
    data0adder = data0adder.sort_values(by='abs coefficient', ascending=False)

    for n in np.arange(ncyc):
        # Stop once every epistatic term involving the focal locus has been added;
        # without this guard data0adder.index[0] raises IndexError when ncyc exceeds
        # the number of available terms.
        if len(data0adder) == 0:
            break

        # add the strongest epistatic coefficient that involves the focal locus
        topind = data0adder.index[0]
        builder.at[topind, 'term'] = data0adder.loc[topind, 'term']

        # log coefficient info in locusadder
        locusadder.at[n + 1, 'num term add'] = n + 1
        locusadder.at[n + 1, 'term added'] = data0adder.loc[topind, 'genotype']
        locusadder.at[n + 1, 'term order'] = data0adder.loc[topind, 'numMut']
        locusadder.at[n + 1, 'coefficient'] = data0adder.loc[topind, 'term']

        # Estimate the new predicted value for each genotype
        data1['new_pred'] = _predict_from_terms(builder, data1, khanloci, myintercept)

        # For the new predictions, get the values I want
        templ = _pair_across_locus(data1, focal, 'new_pred')
        for i in np.arange(len(khanloci)):
            templ[khanloci[i]] = templ.loc[:, 'genotype_x'].str[i].astype(int)

        my_x = templ['new_pred_x']
        my_y = templ['new_pred_y']
        _record_york_fits(locusadder, n + 1, my_x, my_y)

        # Remove strongest epistatic coefficient from data0adder
        data0adder = data0adder[1:]

        # Plateau check: stop when each of the three slopes differs from its values
        # 1, 2 and 3 cycles back by at most diffthresh. Needs ncycthresh >= 2 so that
        # row n-2 exists in locusadder.
        if n >= ncycthresh:
            settled = all(
                abs(locusadder.loc[n + 1, col] - locusadder.loc[n + 1 - back, col]) <= diffthresh
                for col in ('abc-original_slope', 'deltas-original_slope', 'deltas-reversion_slope')
                for back in (3, 2, 1)
            )
            if settled:
                break

    # Re-pad the logged term labels to one character per locus.
    # .loc, not .at: the row selector is a slice.
    locusadder.loc[1:, 'term added'] = (locusadder.loc[1:, 'term added']
                                        .astype(int).astype(str).str.zfill(len(khanloci)))
    locusadder.insert(0, 'main locus', focal)
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row labels
    megaadder = pd.concat([megaadder, locusadder])
    print(focal)

In [6]:
# Build the sorted list of subset numbers. Each number is read from the 'filename'
# column: everything from character 15 up to the first '.' is parsed as an integer.
ssnames = pd.read_csv('20211123_subsets-output/subsets_filenames.txt')
subset_labels = ssnames['filename'].str.slice(15).str.split('.').str.get(0)
ssnums = subset_labels.astype(int).sort_values().reset_index(drop=True)
ssnums = list(ssnums)

In [None]:
# Do the removal analysis now with the subsets data
#
# Want to do ABC vs aBC plots now where, instead of adding terms by rank to form predicted values,
# we subtract them by rank to adjust observed values.
#
# For every subset file and every locus present in it, the epistatic coefficients involving
# that locus are zeroed one at a time (largest |coefficient| first). After each removal the
# observed values are rebuilt as new prediction + original residual, and three York fits across
# the focal locus are logged: free slope ("inferred"), slope fixed at the original fit ("main"),
# and slope fixed at 1 ("1").
#time the code
mystart = time.perf_counter()

megaremoverss = pd.DataFrame()
#ss stands for subset

o = 10            # not referenced in this cell
ncyc = 7          # maximum number of coefficients removed per locus
ncycthresh = 10   # only used by the disabled early-stop rule (see end of the cycle loop)
#pthresh = 0.01
yorkn = 100       # forwarded as the iteration-count-like argument of the York helpers (TODO confirm meaning)
#diffthresh = 0.01
ethresh = 0.50    # only used by the disabled early-stop rule

#for ss in np.arange(0,1):
for ss in np.arange(len(ssnums)):

    ## for now, just do this for one in every 5!
    #if ss % 5 == 1:

    megaremoverv2 = pd.DataFrame()
    myssnum = ssnums[ss]
    print(str(myssnum))

    for p in np.arange(0, 1):
    #for p in np.arange(len(ploidies)):
        for e in np.arange(1, 2):
        #for e in np.arange(len(envts)):

            # Names that depend only on the ploidy/environment pair
            prefix = ploidies[p] + '_' + envts[e]
            pred_col = prefix + '_Alex prediction'
            obs_col = prefix + '_s-obs'
            err_col = prefix + '_s-obs-err'
            term_col = prefix + '_term'
            subset_path = '20211123_subsets-output/subsets/' + prefix + '_subset_' + str(myssnum) + '.txt'

            # First, import a list of genotypes to predict
            data1 = pd.read_csv(subset_path, sep='\t',
                                names=['genotype', pred_col, obs_col, err_col],
                                skiprows=2, skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations
            # (.copy() so the assignment below does not write into a slice of the raw table)
            first_blank = data1.index[data1['genotype'].isnull()][0]
            data1 = data1.loc[:first_blank - 1, :].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts
            # from having same genotype represented multiple times.
            # Built as a list of records (one per distinct genotype, in order of first appearance)
            # rather than by growing a DataFrame cell by cell.
            records = []
            for genotype, tempg in data1.groupby('genotype', sort=False):
                if len(tempg) == 1:
                    s_val = tempg[obs_col].iloc[0]
                    s_err = tempg[err_col].iloc[0]
                else:
                    s_val = tempg[obs_col].mean()
                    # between-replicate variance is deliberately left out of the error
                    #my_svar = statistics.variance(tempg['s'])
                    my_svar = 0
                    mymean_stderr = np.mean(tempg[err_col] ** 2)
                    s_err = np.sqrt(my_svar + mymean_stderr)
                records.append({'genotype': genotype,
                                's': s_val,
                                'stderr(s)': s_err,
                                # the prediction is taken from the first replicate
                                's_pred': tempg[pred_col].iloc[0]})

            data1 = pd.DataFrame(records, columns=['genotype', 's', 'stderr(s)', 's_pred'])

            # Calculate the difference between observed and predicted, such that s_obs = s_pred + diff
            data1['opdiff'] = data1['s'] - data1['s_pred']

            # Create a column for each locus (0 recoded to -1)
            for position, locus_name in enumerate(floci):
                data1[locus_name] = data1['genotype'].str[position].astype(int).replace(0, -1)

            # Loci that are mutated in at least one genotype of this subset
            locihere = [locus_name for locus_name in floci if (data1[locus_name] == 1).any()]

            # Now import a set of coefficients (bottom section of the same file)
            data0 = pd.read_csv(subset_path, sep='\t',
                                names=['todelete', 'genotype', term_col, 'na'],
                                skiprows=2, skip_blank_lines=False)

            first_blank = data0.index[data0['genotype'].isnull()][0]
            data0 = data0.loc[first_blank + 1:, :].copy()

            # Coefficient labels have one character per locus in locihere
            data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(len(locihere))

            data0 = data0.drop(columns=['todelete', 'na']).reset_index(drop=True)

            for position, locus_name in enumerate(locihere):
                data0[locus_name] = data0['genotype'].str[position].astype(int)
                # data1 genotypes span all of floci, so cut at the locus' position in floci
                full_pos = floci.index(locus_name)
                data1['without_' + locus_name] = (data1['genotype'].str[:full_pos]
                                                  + data1['genotype'].str[full_pos + 1:])

            data0['numMut'] = data0[locihere].sum(axis=1)

            # Add a "baseline" column for whether these terms are added or subtracted in the WT
            data0['baseline'] = np.where(data0['numMut'] % 2 == 1, -1.0, 1.0)

            for focal in locihere:
                pair_key = 'without_' + focal
                locus_cols = ['genotype', 's', 'stderr(s)', pair_key]

                # Want to start with "all-epistasis" observed fitnesses in data1
                # Get b_obs, TLS_b_obs, TLS_1 (and intercept values)
                tab0 = data1.loc[(data1[focal] == -1)].copy(deep=True).reset_index(drop=True)[locus_cols]
                tab1 = data1.loc[(data1[focal] == 1)].copy(deep=True).reset_index(drop=True)[locus_cols]

                templ = pd.merge(tab0, tab1, how='inner', on=pair_key)

                # Per-locus state of the background genotype. The character position is the
                # locus' position in floci (the genotype string spans all of floci), matching
                # how the 'without_' keys are built above.
                for locus_name in locihere:
                    templ[locus_name] = templ.loc[:, 'genotype_x'].str[floci.index(locus_name)].astype(int)

                # York regression to get sum of total least squares deviations, S
                myreg = yorkreg_nocorr(templ['s_x'], templ['s_y'], templ['stderr(s)_x'], templ['stderr(s)_y'], yorkn)
                myreg1 = york_slope1(templ['s_x'], templ['s_y'], templ['stderr(s)_x'], templ['stderr(s)_y'], yorkn)

                # Create a subtable - we'll append these together into megaremoverv2
                locusremover = pd.DataFrame()

                # Row 0: fits on the untouched observations (nothing removed yet)
                locusremover.at[0, 'num term add'] = 0
                locusremover.at[0, 'term added'] = 'na'
                locusremover.at[0, 'term order'] = 'na'
                locusremover.at[0, 'coefficient'] = 'na'
                locusremover.at[0, 'inferred_b'] = myreg[0]
                locusremover.at[0, 'inferred_a'] = myreg[1]
                locusremover.at[0, 'inferred_S'] = myreg[3]
                locusremover.at[0, 'main_b'] = myreg[0]
                locusremover.at[0, 'main_a'] = myreg[1]
                locusremover.at[0, 'main_S'] = myreg[3]
                locusremover.at[0, 'N'] = len(templ)
                locusremover.at[0, '1_a'] = myreg1[0]
                locusremover.at[0, '1_S'] = myreg1[1]

                # Begin cycles
                # data0adder serves as the sorted databank for (non-zero) coefficients
                data0adder = data0.loc[(data0['numMut'] > 1) & (data0[focal] == 1)].copy()
                data0adder['abs coefficient'] = abs(data0adder[term_col])
                data0adder = data0adder.sort_values(by='abs coefficient', ascending=False)

                # create a builder dataframe on which changes will be processed
                builder = data0.copy(deep=True)

                # Locus states of every genotype; does not change between cycles
                genotype_states = data1[locihere].to_numpy()

                for n in np.arange(ncyc):
                #for n in np.arange(0,1):
                    # find the top index (i.e., the strongest coefficient)
                    if n > len(data0adder) - 1:
                        break

                    topind = data0adder.index[n]

                    # if no epistasis, end
                    if data0adder.loc[topind, term_col] == 0:
                        break

                    # remove the strongest epistatic coefficient that involves the focal locus
                    builder.at[topind, term_col] = 0

                    # add coefficient info in locusremover
                    locusremover.at[n + 1, 'num term add'] = n + 1
                    locusremover.at[n + 1, 'term added'] = data0adder.loc[topind, 'genotype']
                    locusremover.at[n + 1, 'term order'] = data0adder.loc[topind, 'numMut']
                    locusremover.at[n + 1, 'coefficient'] = data0adder.loc[topind, term_col]

                    # Estimate the new predicted value for each genotype.
                    # The filtered term table and the intercept are the same for every genotype,
                    # so they are prepared once per cycle.
                    # to make life faster, remove all zero values
                    active = builder.loc[(builder[term_col] != 0)].reset_index(drop=True)
                    # occasionally, we are in all-zero-epistasis world (edge case), so if that's the case just use all the terms
                    if len(active) == 0:
                        active = builder
                    term_masks = active[locihere].to_numpy()
                    coeffs = active[term_col].to_numpy(dtype=float)
                    baselines = active['baseline'].to_numpy(dtype=float)
                    myintercept = data1.loc[(data1['genotype'] == '0000000000'), 's_pred'].values[0]

                    new_pred = []
                    for states in genotype_states:
                        raw = term_masks * states
                        # zeros mark loci outside the term; leave them out of the product
                        products = np.where(raw != 0, raw, 1).prod(axis=1)
                        # subtract out baseline, multiply by the term's value, then total
                        # (summed through a pandas Series, like the former 'tosum' column)
                        totalsum = pd.Series((products - baselines) * coeffs).sum()
                        new_pred.append(totalsum + myintercept)
                    data1['new_pred'] = new_pred
                    data1['new_obs'] = data1['new_pred'] + data1['opdiff']

                    # For the new predictions, get the values I want
                    cycle_cols = ['genotype', 'new_obs', 'stderr(s)', pair_key]
                    tab0 = data1.loc[(data1[focal] == -1)].copy(deep=True).reset_index(drop=True)[cycle_cols]
                    tab1 = data1.loc[(data1[focal] == 1)].copy(deep=True).reset_index(drop=True)[cycle_cols]

                    templ = pd.merge(tab0, tab1, how='inner', on=pair_key)

                    my_x = templ['new_obs_x']
                    my_y = templ['new_obs_y']
                    my_x_err = templ['stderr(s)_x']
                    my_y_err = templ['stderr(s)_y']

                    # free-slope fit
                    myregnew = yorkreg_nocorr(my_x, my_y, my_x_err, my_y_err, yorkn)
                    locusremover.at[n + 1, 'inferred_b'] = myregnew[0]
                    locusremover.at[n + 1, 'inferred_a'] = myregnew[1]
                    locusremover.at[n + 1, 'inferred_S'] = myregnew[3]

                    # fit with the slope held at the value from the untouched observations
                    myreg_main = york_slopeanyb(my_x, my_y, my_x_err, my_y_err, yorkn, myreg[0])
                    locusremover.at[n + 1, 'main_b'] = myreg[0]
                    locusremover.at[n + 1, 'main_a'] = myreg_main[0]
                    locusremover.at[n + 1, 'main_S'] = myreg_main[1]

                    # fit with the slope held at 1
                    myreg_1 = york_slope1(my_x, my_y, my_x_err, my_y_err, yorkn)
                    locusremover.at[n + 1, '1_a'] = myreg_1[0]
                    locusremover.at[n + 1, '1_S'] = myreg_1[1]

                    locusremover.at[n + 1, 'N'] = len(templ)

                    # fraction of the total |coefficient| mass removed so far
                    print(abs(locusremover.loc[1:, 'coefficient']).sum() / data0adder['abs coefficient'].sum())

                    # disabled early stop:
                    #if n >= ncycthresh:
                    #    if (abs(locusremover.loc[1:,'coefficient']).sum()/data0adder['abs coefficient'].sum()) > ethresh:
                    #        break

                # Re-pad the logged term labels. .loc, not .at: the row selector is a slice
                # and .at only accepts scalar labels.
                locusremover.loc[1:, 'term added'] = (locusremover.loc[1:, 'term added']
                                                      .astype(int).astype(str).str.zfill(len(locihere)))
                locusremover.insert(0, 'main locus', focal)
                locusremover.insert(0, 'envt', envts[e])
                locusremover.insert(0, 'ploidy', ploidies[p])

                # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row labels
                megaremoverv2 = pd.concat([megaremoverv2, locusremover])
                print(focal)
    megaremoverv2.insert(0, 'subset', myssnum)
    megaremoverss = pd.concat([megaremoverss, megaremoverv2])
    # save as I go so I don't lose if computer crashes
    megaremoverss.to_csv(r'20210712_megaremoverss_hap_4nqo-ncyc7.csv', index=True, header=True)

mystop = time.perf_counter()
elapsed = mystop - mystart
print(str(elapsed) + ' seconds to run')

In [None]:
# Do the same megaremoverss thing as above except take a shortcut: only interested in residuals (endpoint)
# Should go much much faster and can give us data on all subsets everywhere

# Do the removal analysis now with the subsets data

# Want to do ABC vs aBC plots now where, instead of adding terms by rank to form predicted values,
# we subtract them by rank to adjust observed values.
#time the code

THIS ACTUALLY TAKES QUITE A LONG TIME TO RUN

mystart = time.perf_counter()

megaremoverss = pd.DataFrame()
#ss stands for subset

o=10
#ncyc = 30
#ncycthresh = 10
#pthresh = 0.01
yorkn = 100
#diffthresh = 0.01
#ethresh = 0.50


# For every ploidy x environment x subset, and for every locus that is mutated
# in at least one genotype of the subset:
#   row 0 of `locusremover`: York fits on the observed fitnesses
#                            (mutant at the locus vs. wild type at the locus);
#   row 1 of `locusremover`: the same fits after every term of order > 1 that
#                            involves the locus has been set to zero.
# Relies on names defined in earlier cells: ploidies, envts, ssnums, floci,
# yorkreg_nocorr, york_slope1, york_slopeanyb, megaremoverss, mystart, yorkn.

#for p in np.arange(0,2):
for p in np.arange(len(ploidies)):
    #for e in np.arange(0,2):
    for e in np.arange(len(envts)):
        print(ploidies[p]+'_'+envts[e])

        # Column names used for this ploidy/environment
        pred_col = ploidies[p]+'_'+envts[e]+'_Alex prediction'
        obs_col = ploidies[p]+'_'+envts[e]+'_s-obs'
        err_col = ploidies[p]+'_'+envts[e]+'_s-obs-err'
        term_col = ploidies[p]+'_'+envts[e]+'_term'

        #for ss in np.arange(0,2):
        for ss in np.arange(len(ssnums)):
            # Per-locus tables of this subset; concatenated once after the locus loop
            locus_tables = []

            myssnum = ssnums[ss]
            print(str(myssnum))

            subset_file = '20211123_subsets-output/subsets/'+ploidies[p]+'_'+envts[e]+'_subset_'+str(myssnum)+'.txt'

            # First, import a list of genotypes to predict
            data1 = pd.read_csv(subset_file,
                                sep='\t',names=['genotype',pred_col,obs_col,err_col],
                                skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
            # keep only the rows above the first row with an empty genotype
            first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
            data1 = data1.loc[:first_blank-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts from having same genotype
            # represented multiple times.
            glist = list(OrderedDict.fromkeys(data1['genotype']))

            averaged_rows = []
            for genotype in glist:
                tempg = data1.loc[(data1['genotype'] == genotype)].reset_index(drop=True)
                if len(tempg) == 1:
                    s_value = tempg.loc[0,obs_col]
                    s_stderr = tempg.loc[0,err_col]
                else:
                    s_value = tempg[obs_col].mean()
                    # between-replicate variance is deliberately left out (it was fixed at 0)
                    s_stderr = np.sqrt(np.mean(tempg[err_col]**2))
                averaged_rows.append({'genotype': genotype,
                                      's': s_value,
                                      'stderr(s)': s_stderr,
                                      's_pred': tempg.loc[0,pred_col]})

            data1 = pd.DataFrame(averaged_rows,columns=['genotype','s','stderr(s)','s_pred'])

            # Calculate the difference between observed and predicted, such that s_obs = s_pred + diff
            data1['opdiff'] = data1['s'] - data1['s_pred']

            # Create a column for each locus: +1 where the genotype digit is 1, -1 where it is 0
            for pos, locus_name in enumerate(floci):
                allele = data1['genotype'].str[pos].astype(int)
                data1[locus_name] = allele.where(allele != 0, -1)

            # Loci that are mutated in at least one genotype of this subset
            locihere = [locus_name for locus_name in floci if (data1[locus_name] == 1).any()]

            # Now import a set of coefficients (the rows below the first blank line of the same file)
            data0 = pd.read_csv(subset_file,
                                sep='\t',names=['todelete','genotype',term_col,'na'],
                                skiprows=2,skip_blank_lines=False)

            first_blank = data0.loc[(data0['genotype'].isnull())].index.tolist()[0]
            data0 = data0.loc[first_blank+1:,:].copy()

            data0['genotype'] = data0['genotype'].astype(int).astype(str).str.zfill(len(locihere))

            data0 = data0.drop(columns=['todelete','na']).reset_index(drop=True)

            for pos, locus_name in enumerate(locihere):
                # Term genotypes have one digit per locus in locihere
                data0[locus_name] = data0['genotype'].str[pos].astype(int)
                # Full genotypes have one digit per locus in floci; drop this locus' digit
                full_pos = floci.index(locus_name)
                data1['without_'+locus_name] = data1['genotype'].str[:full_pos] + data1['genotype'].str[full_pos+1:]

            data0['numMut'] = data0[locihere].sum(axis=1)

            # Add a "baseline" column for whether these terms are added or subtracted in the WT
            data0['baseline'] = np.where(data0['numMut'] % 2 == 1, -1.0, 1.0)

            # +/-1 state of every locus in locihere for every genotype (rows follow data1)
            geno_signs = data1[locihere].to_numpy()

            #for l in np.arange(0,1):
            for l in np.arange(len(locihere)):
                locus = locihere[l]
                pair_key = 'without_'+locus

                # Want to start with "all-epistasis" observed fitnesses in data1
                # Get b_obs, TLS_b_obs, TLS_1 (and intercept values)
                tab0 = data1.loc[(data1[locus] == -1),['genotype','s','stderr(s)',pair_key]].reset_index(drop=True)
                tab1 = data1.loc[(data1[locus] == 1),['genotype','s','stderr(s)',pair_key]].reset_index(drop=True)

                # Pair each background without the mutation (_x) with the same background carrying it (_y).
                # (Per-locus columns that used to be added to templ here were never read and sliced
                # the 10-digit genotype by position in locihere instead of floci, so they are gone.)
                templ = pd.merge(tab0,tab1,how='inner',on=pair_key)

                # York regression to get sum of total least squares deviations, S
                myreg = yorkreg_nocorr(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)
                myreg1 = york_slope1(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)

                # Create a subtable - we'll append these together into megaremoverv2
                locusremover = pd.DataFrame()

                # Row 0: nothing removed yet
                locusremover.at[0,'num term add'] = 0
                locusremover.at[0,'term added'] = 'na'
                locusremover.at[0,'term order'] = 'na'
                locusremover.at[0,'coefficient'] = 'na'
                locusremover.at[0,'inferred_b'] = myreg[0]
                locusremover.at[0,'inferred_a'] = myreg[1]
                locusremover.at[0,'inferred_S'] = myreg[3]
                locusremover.at[0,'main_b'] = myreg[0]
                locusremover.at[0,'main_a'] = myreg[1]
                locusremover.at[0,'main_S'] = myreg[3]
                locusremover.at[0,'N'] = len(templ)
                locusremover.at[0,'1_a'] = myreg1[0]
                locusremover.at[0,'1_S'] = myreg1[1]

                # Now remove all terms of order > 1 that involve this locus.
                # (.loc with a boolean mask; .at is only meant for single cells.)
                removed = (data0['numMut'] > 1)&(data0[locus] == 1)
                builder = data0.copy(deep=True)
                builder.loc[removed,term_col] = 0

                # Row 1: everything removed
                locusremover.at[1,'num term add'] = int(removed.sum())
                locusremover.at[1,'term added'] = 'all'
                locusremover.at[1,'term order'] = 'na'
                locusremover.at[1,'coefficient'] = 'na'

                # Estimate the new predicted value for each genotype.
                # For a term, the product runs over the loci it contains (its 0/1 digit is 1) of the
                # genotype's +/-1 state; loci that are not in the term contribute a factor of 1.
                # This is the arithmetic of the former genotype x term x locus .loc loops, done one
                # genotype at a time with arrays. It also no longer rebinds `t`, which the first cell
                # imports from scipy.stats.
                term_loci = builder[locihere].to_numpy()
                term_baseline = builder['baseline'].to_numpy()
                term_values = builder[term_col].to_numpy()
                myintercept = data1.loc[(data1['genotype'] == '0000000000'),'s_pred'].values[0]

                new_pred = np.empty(len(data1))
                for g in range(len(data1)):
                    factors = term_loci * geno_signs[g]
                    factors = np.where(factors != 0, factors, 1)
                    firstprod = factors.prod(axis=1)
                    # subtract out baseline, multiply by the term's value, sum (NaN terms are skipped)
                    totalsum = np.nansum((firstprod - term_baseline) * term_values)
                    new_pred[g] = totalsum + myintercept
                data1['new_pred'] = new_pred
                data1['new_obs'] = data1['new_pred'] + data1['opdiff']

                # For the new predictions, get the values I want
                tab0 = data1.loc[(data1[locus] == -1),['genotype','new_obs','stderr(s)',pair_key]].reset_index(drop=True)
                tab1 = data1.loc[(data1[locus] == 1),['genotype','new_obs','stderr(s)',pair_key]].reset_index(drop=True)

                templ = pd.merge(tab0,tab1,how='inner',on=pair_key)

                my_x = templ['new_obs_x']
                my_y = templ['new_obs_y']
                my_x_err = templ['stderr(s)_x']
                my_y_err = templ['stderr(s)_y']

                myregnew = yorkreg_nocorr(my_x,my_y,my_x_err,my_y_err,yorkn)
                locusremover.at[1,'inferred_b'] = myregnew[0]
                locusremover.at[1,'inferred_a'] = myregnew[1]
                locusremover.at[1,'inferred_S'] = myregnew[3]

                # Fit with the slope fixed at the original (row 0) slope
                myreg_main = york_slopeanyb(my_x,my_y,my_x_err,my_y_err,yorkn,myreg[0])
                locusremover.at[1,'main_b'] = myreg[0]
                locusremover.at[1,'main_a'] = myreg_main[0]
                locusremover.at[1,'main_S'] = myreg_main[1]

                # Fit with the slope fixed at 1
                myreg_1 = york_slope1(my_x,my_y,my_x_err,my_y_err,yorkn)
                locusremover.at[1,'1_a'] = myreg_1[0]
                locusremover.at[1,'1_S'] = myreg_1[1]

                locusremover.at[1,'N'] = len(templ)

                # NOTE(review): this overwrites the 'all' label written above with 'na', exactly as
                # the previous `.at[1:, ...]` assignment did - confirm that is what is wanted.
                locusremover.loc[1:,'term added'] = 'na'
                locusremover.insert(0,'main locus', locus)
                locusremover.insert(0,'envt',envts[e])
                locusremover.insert(0,'ploidy',ploidies[p])

                locus_tables.append(locusremover)
                print(locus)

            # DataFrame.append no longer exists in pandas >= 2.0; pd.concat is the replacement
            megaremoverv2 = pd.concat(locus_tables) if locus_tables else pd.DataFrame()
            megaremoverv2.insert(0,'subset',myssnum)
            megaremoverss = pd.concat([megaremoverss,megaremoverv2])

        #export_csv = megaremoverss.to_csv(r'20211218_megaremoverss_allremoved-checkpoint'+ploidies[p]+'_'+envts[e]+'.csv',index=True,header=True) 

mystop = time.perf_counter()
elapsed = mystop-mystart
print(str(elapsed)+' seconds to run')

In [None]:
# plotting things properly, subplots style, for publication

# Make the plot of subsets for the SI

# Loci whose folded global slope is <= this value are treated as FCTs below
fct_thresh = 0.9

# Output of the "all epistasis removed" subset analysis
ssbank = pd.read_csv('20211218_megaremoverss_allremoved-hap_37C-4NQO.csv').drop(columns=['Unnamed: 0'])

# I want to know which mutations are or are not FCTs for plotting purposes
getalls = pd.read_csv('20210712_megaremoverv2_haphom.csv').drop(columns=['Unnamed: 0'])

# Rows with no terms removed ('num term add' == 0), for the first ploidy and the first two environments
totalbank_parts = []
for p in np.arange(0,1):
    for e in np.arange(0,2):
        totalbank_parts.append(getalls.loc[(getalls['ploidy'] == ploidies[p])&(getalls['envt'] == envts[e])&(getalls['num term add'] == 0)])

# DataFrame.append no longer exists in pandas >= 2.0; concatenate once instead
totalbank = pd.concat(totalbank_parts).reset_index(drop=True)

# Fold the slope: keep b when b <= 1, otherwise use 1/b
totalbank['inferred_b_-1to1'] = np.where(totalbank['inferred_b'] <= 1,
                                         totalbank['inferred_b'],
                                         1/totalbank['inferred_b'])


# add the num_mut column: number of rows of each subset divided by 4
# NOTE(review): the 4 presumably stands for 2 environments x 2 rows (before/after removal) per locus - confirm
rows_per_subset = ssbank['subset'].value_counts()
nm = [rows_per_subset.get(subset_id,0)/4 for subset_id in ssnums]
nmadder = pd.DataFrame()
nmadder['subset'] = ssnums
nmadder['num_mut'] = nm

ssbank = pd.merge(ssbank,nmadder,on='subset',how='left')
    

# One two-panel figure per environment (first ploidy only):
#   ax[0]: 1_S/main_S after removal, per subset, against subset size, with the median per size;
#   ax[1]: fraction of subsets with 1_S/main_S <= 1 after removal, against subset size.
# Needs ssbank, totalbank, fct_thresh (previous statements of this cell) and ploidies, envts, floci.
for p in np.arange(0,1):
    for e in np.arange(0,2):

        mr2ss = ssbank.loc[ssbank['envt'] == envts[e]].copy(deep=True).reset_index(drop=True)

        # Create a main_b_-1to1 column and inferred_b_-1to1 column (b when b <= 1, otherwise 1/b)
        # This will help in partitioning the data to look just at those that have a FCT by our criteria
        mr2ss['main_b_-1to1'] = np.where(mr2ss['main_b'] <= 1, mr2ss['main_b'], 1/mr2ss['main_b'])
        mr2ss['inferred_b_-1to1'] = np.where(mr2ss['inferred_b'] <= 1, mr2ss['inferred_b'], 1/mr2ss['inferred_b'])

        # Take ratios of the S values
        mr2ss['main/inferred'] = mr2ss['main_S']/mr2ss['inferred_S']
        mr2ss['1/inferred'] = mr2ss['1_S']/mr2ss['inferred_S']
        mr2ss['1/main'] = mr2ss['1_S']/mr2ss['main_S']

        # Create table to analyze decipherability

        detabss = pd.DataFrame()

        # first, start with all combinations of subsets and loci (in order of first appearance)
        ssl = mr2ss[['subset','main locus']].drop_duplicates().values.tolist()

        myind = 0
        for i in np.arange(len(ssl)):
            temp = mr2ss.loc[(mr2ss['subset'] == ssl[i][0])&(mr2ss['main locus'] == ssl[i][1])].reset_index(drop=True)

            detabss.at[myind,'subset'] = ssl[i][0]
            detabss.at[myind,'ploidy'] = temp.loc[0,'ploidy']
            detabss.at[myind,'envt'] =  temp.loc[0,'envt']
            detabss.at[myind,'main locus'] = ssl[i][1]

            detabss.at[myind,'N'] = temp.loc[0,'N']
            detabss.at[myind,'num_mut'] = temp.loc[0,'num_mut']
            detabss.at[myind,'inferred_b_-1to1_original'] = temp.loc[0,'inferred_b_-1to1']

            # Second row when there is one, otherwise the only row
            final_row = 0 if len(temp) == 1 else 1

            detabss.at[myind,'1/main_final'] = temp.loc[final_row,'1/main']
            detabss.at[myind,'inferred_b_-1to1_final'] = temp.loc[final_row,'inferred_b_-1to1']

            myind = myind+1

        # export
        export_csv = detabss.to_csv(r'20211211_hap_'+envts[e]+'_detabss_allrem.csv',index=True,header=True)

        # now that we have the table, we can do some analysis:
        # per locus and subset size, the fraction of subsets whose final 1/main is <= 1
        fct_rows = detabss.loc[(detabss['N'] > 2)&(detabss['inferred_b_-1to1_original'] <= fct_thresh)].copy(deep=True)
        track_rows = []
        for l in np.arange(len(floci)):
            for n in np.arange(3,11):
                templn = fct_rows.loc[(fct_rows['main locus'] == floci[l])&(fct_rows['num_mut'] == n)]
                countall = len(templn)
                count1mainlessthan1 = len(templn.loc[templn['1/main_final'] <= 1])
                track_rows.append({'ploidy': ploidies[p],
                                   'envt': envts[e],
                                   'main locus': floci[l],
                                   'num_mut': n,
                                   'count total': countall,
                                   'count 1/main <= 1': count1mainlessthan1,
                                   'fraction <= 1': np.nan if countall == 0 else count1mainlessthan1 / countall})
        mytrack = pd.DataFrame(track_rows)

        # SET UP FIGURE
        fig,ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False,figsize=(5,2),constrained_layout=True)

        fctloci = list(totalbank.loc[(totalbank['envt'] == envts[e])&(totalbank['inferred_b_-1to1'] <= fct_thresh)]['main locus'])
        # now plot the results, just for guys that are FCTs
        for l in np.arange(len(fctloci)):
            templ = mytrack.loc[(mytrack['ploidy'] == ploidies[p])&(mytrack['envt'] == envts[e])&(mytrack['main locus'] == fctloci[l])]
            ax[1].plot(np.arange(3,11),templ['fraction <= 1'])
        ax[1].set_xlabel('subset size')
        ax[1].set_ylabel('\n'.join(wrap('Fraction subsets with SSE$_{b=1}$/SSE$_{b=global}$ ≤ 1, all epistasis removed',36)),labelpad=2)

        # offset loci, for visibility (cycled, so more than five loci no longer raise IndexError)
        loff = [-0.2,-0.1,0,0.1,0.2]

        # actual next step - look at 1/main final instead of fraction ≤ 1
        for l in np.arange(len(fctloci)):
            xshift = loff[l % len(loff)]
            templ = fct_rows.loc[(fct_rows['main locus'] == fctloci[l])]
            ax[0].scatter(templ['num_mut']+xshift,templ['1/main_final'],alpha=0.7,s=2)
            mymeds = []
            for n in np.arange(3,11):
                templn = templ.loc[(templ['num_mut'] == n)]
                mymeds = mymeds + [np.median(templn['1/main_final'])]
            # Label the median line itself: legend(list_of_labels) pairs labels with artists purely
            # by order, and scatters and lines are interleaved on this axes.
            ax[0].plot(np.arange(3,11)+xshift,mymeds,label=fctloci[l])
        ax[0].set_xlabel('subset size')
        ax[0].set_ylabel('\n'.join(wrap('Final SSE$_{b=1}$ / SSE$_{b=global}$, all epistasis removed',40)),labelpad=2)
        # do custom y scales
        if envts[e] == '37C':
            ax[0].set_ylim(-0.5,10) #cuts off some points
        else:
            ax[0].set_ylim(-0.5,15) #cuts off some points
        leg = ax[0].legend()
        leg._legend_box.align = "left"
        leg.set_title('Locus')
        ax[0].axhline(y=1,lw=0.5,color='k',zorder=0)

        #plt.savefig("msfigs/SIfigs/residual-ratio_"+ploidies[p]+'_'+envts[e]+".pdf",bbox_inches='tight',dpi=300)

        plt.show()


In [None]:
# now just do the dumb thing of getting all the global slopes for all the loci in all the subsets:
# for every subset, environment and locus, York-regress the fitness with the mutation on the
# fitness of the same background without it (no epistasis removal).
# Relies on names from earlier cells: ssnums, ploidies, envts, floci, yorkreg_nocorr, york_slope1.

o=10

yorkn = 100

# One dict per (subset, ploidy, envt, locus); turned into a DataFrame once at the end rather than
# growing the frame cell by cell with .at
bglobal_rows = []

for ss in np.arange(len(ssnums)):
#for ss in np.arange(0,1):
    myssnum = ssnums[ss]

    print(str(myssnum))

    for p in np.arange(0,1):
    #for p in np.arange(len(ploidies)):
        for e in np.array([1,2,4]):
        #for e in np.arange(len(envts)):

            pred_col = ploidies[p]+'_'+envts[e]+'_Alex prediction'
            obs_col = ploidies[p]+'_'+envts[e]+'_s-obs'
            err_col = ploidies[p]+'_'+envts[e]+'_s-obs-err'

            # First, import a list of genotypes to predict
            data1 = pd.read_csv('20211123_subsets-output/subsets/'+ploidies[p]+'_'+envts[e]+'_subset_'+str(myssnum)+'.txt',
                                sep='\t',names=['genotype',pred_col,obs_col,err_col],
                                skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
            # keep only the rows above the first row with an empty genotype
            first_blank = data1.loc[(data1['genotype'].isnull())].index.tolist()[0]
            data1 = data1.loc[:first_blank-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Average genotypes, propagating error. Reason for this is to get rid of weird artifacts from having same genotype
            # represented multiple times.
            glist = list(OrderedDict.fromkeys(data1['genotype']))

            averaged_rows = []
            for genotype in glist:
                tempg = data1.loc[(data1['genotype'] == genotype)].reset_index(drop=True)
                if len(tempg) == 1:
                    s_value = tempg.loc[0,obs_col]
                    s_stderr = tempg.loc[0,err_col]
                else:
                    s_value = tempg[obs_col].mean()
                    # between-replicate variance is deliberately left out (it was fixed at 0)
                    s_stderr = np.sqrt(np.mean(tempg[err_col]**2))
                averaged_rows.append({'genotype': genotype,
                                      's': s_value,
                                      'stderr(s)': s_stderr,
                                      's_pred': tempg.loc[0,pred_col]})

            data1 = pd.DataFrame(averaged_rows,columns=['genotype','s','stderr(s)','s_pred'])

            # Create a column for each locus: +1 where the genotype digit is 1, -1 where it is 0
            for pos, locus_name in enumerate(floci):
                allele = data1['genotype'].str[pos].astype(int)
                data1[locus_name] = allele.where(allele != 0, -1)

            # Loci that are mutated in at least one genotype of this subset
            locihere = [locus_name for locus_name in floci if (data1[locus_name] == 1).any()]

            # Genotype with this locus' digit removed: identifies the genetic background
            for locus_name in locihere:
                full_pos = floci.index(locus_name)
                data1['without_'+locus_name] = data1['genotype'].str[:full_pos] + data1['genotype'].str[full_pos+1:]

            #for l in np.arange(0,1):
            for l in np.arange(len(locihere)):
                locus = locihere[l]
                pair_key = 'without_'+locus

                # Want to start with "all-epistasis" observed fitnesses in data1
                # Get b_obs, TLS_b_obs, TLS_1 (and intercept values)
                tab0 = data1.loc[(data1[locus] == -1),['genotype','s','stderr(s)',pair_key]].reset_index(drop=True)
                tab1 = data1.loc[(data1[locus] == 1),['genotype','s','stderr(s)',pair_key]].reset_index(drop=True)

                # Pair each background without the mutation (_x) with the same background carrying it (_y).
                # (Per-locus columns that used to be added to templ here were never read and sliced
                # the 10-digit genotype by position in locihere instead of floci, so they are gone.)
                templ = pd.merge(tab0,tab1,how='inner',on=pair_key)

                # York regression to get sum of total least squares deviations, S
                myreg = yorkreg_nocorr(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)
                myreg1 = york_slope1(templ['s_x'],templ['s_y'],templ['stderr(s)_x'],templ['stderr(s)_y'],yorkn)

                bglobal_rows.append({'subset': myssnum,
                                     'num_mut': len(locihere),
                                     'ploidy': ploidies[p],
                                     'envt': envts[e],
                                     'main locus': locus,
                                     'inferred_b': myreg[0],
                                     'inferred_a': myreg[1],
                                     'inferred_S': myreg[3],
                                     'N': len(templ),
                                     '1_a': myreg1[0],
                                     '1_S': myreg1[1]})

bglobalss = pd.DataFrame(bglobal_rows)
myind = len(bglobal_rows)

#export_csv = bglobalss.to_csv(r'20211207_subsets_bglobal-values_hap_3envs.csv',index=True,header=True)

In [None]:
# Global slopes of every locus in every subset (single-environment file): how does the slope,
# and its spread between subsets, change with subset size?
# Relies on names from earlier cells: ploidies, envts, floci.
bglobalss = pd.read_csv('20211207_subsets_bglobal-values_hap_37C.csv')
bglobalss = bglobalss.drop(columns=['Unnamed: 0'])

# Only used for the figure labels below
# NOTE(review): assumes ploidies[0]/envts[0] are the ploidy/environment of the file read above - confirm
p=0
e=0

# Create an inferred_b_-1to1 column (b when b <= 1, otherwise 1/b)
# This will help in partitioning the data to look just at those that have a FCT by our criteria
# (the same row-by-row loop used to be run twice; once is enough)
bglobalss['inferred_b_-1to1'] = np.where(bglobalss['inferred_b'] <= 1,
                                         bglobalss['inferred_b'],
                                         1/bglobalss['inferred_b'])

# Subset sizes shown on the x axes
subset_sizes = np.arange(3,10)

# this is how it looks with all the data
for l in np.arange(len(floci)):
#for l in np.arange(0,1):
    fig,ax = plt.subplots(figsize=(2.5,2.5))
    means = []
    stds = []
    for n in subset_sizes:
        tempss = bglobalss.loc[(bglobalss['main locus'] == floci[l])&(bglobalss['num_mut'] == n)]
        ax.scatter([n]*len(tempss),tempss['inferred_b'])
        means = means + [tempss['inferred_b'].mean()]
        stds = stds + [tempss['inferred_b'].std()]
    ax.plot(subset_sizes,means,color='k')
    ax.plot(subset_sizes,np.add(means,stds),color='xkcd:grey')
    ax.plot(subset_sizes,np.subtract(means,stds),color='xkcd:grey')
    ax.set_xlabel('subset size')
    ax.set_ylabel('global b')
    ax.axhline(y=1,zorder=0,color='xkcd:goldenrod',lw=0.5)
    fig.text(0,1,ploidies[p]+'_'+envts[e]+'_'+floci[l])
    plt.show()


# Do same plots, except for a random set of 9 subsets across the board
for l in np.arange(len(floci)):
    fig,ax = plt.subplots(figsize=(2.5,2.5))
    means = []
    stds = []
    for n in subset_sizes:
        temp = bglobalss.loc[(bglobalss['num_mut'] == n)&(bglobalss['main locus'] == floci[l])].copy(deep=True)
        tempss = list(OrderedDict.fromkeys(list(temp['subset'])))
        # never ask for more subsets than exist (sample() raises ValueError otherwise)
        my9 = sample(tempss,min(9,len(tempss)))
        tempss = temp[temp['subset'].isin(my9)]
        ax.scatter([n]*len(tempss),tempss['inferred_b'])
        means = means + [tempss['inferred_b'].mean()]
        stds = stds + [tempss['inferred_b'].std()]
        print(my9)
    ax.plot(subset_sizes,means,color='k')
    ax.plot(subset_sizes,np.add(means,stds),color='xkcd:grey')
    ax.plot(subset_sizes,np.subtract(means,stds),color='xkcd:grey')
    ax.set_xlabel('subset size')
    ax.set_ylabel('global b')
    ax.axhline(y=1,zorder=0,color='xkcd:goldenrod')
    fig.text(0,1,ploidies[p]+'_'+envts[e]+'_'+floci[l])
    plt.show()

# Now we want to see what the standard deviations are like across the board:
# repeat the random draw `iterations` times and average the standard deviations
iterations = 50

mlist = ['mean_'+str(n) for n in subset_sizes]
slist = ['std_'+str(n) for n in subset_sizes]
clist = mlist + slist

locus_std_means = []

for l in np.arange(len(floci)):
    # The rows and subset ids available for each size do not change between iterations
    per_size = []
    for n in subset_sizes:
        temp = bglobalss.loc[(bglobalss['num_mut'] == n)&(bglobalss['main locus'] == floci[l])].copy(deep=True)
        per_size.append((temp,list(OrderedDict.fromkeys(list(temp['subset'])))))

    stdbank_rows = []
    for i in np.arange(iterations):
        means = []
        stds = []
        for temp, subset_ids in per_size:
            my9 = sample(subset_ids,min(9,len(subset_ids)))
            tempss = temp[temp['subset'].isin(my9)]
            # (an ax.scatter call copied from the plotting loop used to sit here; it only drew
            # onto the axes of the last figure above, which has already been shown)
            means = means + [tempss['inferred_b'].mean()]
            stds = stds + [tempss['inferred_b'].std()]
        stdbank_rows.append(means+stds)

    # DataFrame.append no longer exists in pandas >= 2.0; build each table in one go
    stdbank = pd.DataFrame(stdbank_rows,columns=clist)
    locus_std_means.append(stdbank[slist].mean())

# One row per locus (same order as floci), one column per subset size
locusbank = pd.DataFrame(locus_std_means).reset_index(drop=True)
locusbank.insert(0,'locus',floci)

# plot average standard deviations for all loci, across all subset numbers
for l in np.arange(len(floci)):
    plt.plot(subset_sizes,locusbank.iloc[l,1:],label=floci[l])
plt.xlabel('subset size')
plt.ylabel('average standard deviation for N = 9')
plt.ylim(-0.5,2)
plt.legend()
plt.show()

# do same plot but just for neutral loci (first a short list, then a longer one).
# Each line carries its own label: legend(myneutrals) paired names with lines by position,
# which is only right when myneutrals is in the same order as floci.
for myneutrals in (['FAS1','NCS2','SCH9'],
                   ['FAS1','NCS2','PMA1','SCH9','RPI1','AKL1']):
    for l in np.arange(len(floci)):
        if floci[l] in myneutrals:
            plt.plot(subset_sizes,locusbank.iloc[l,1:],label=floci[l])
    plt.xlabel('subset size')
    plt.ylabel('average standard deviation for N = 9')
    plt.legend()
    plt.show()


In [None]:
# Same look at the global slopes as in the previous cell, for the three-environment file.
# Relies on names from earlier cells: ploidies, envts, floci, mr2.
bglobalss = pd.read_csv('20211207_subsets_bglobal-values_hap_3envs.csv')
bglobalss = bglobalss.drop(columns=['Unnamed: 0'])

# Only used for the figure labels below
p = 0

# Create an inferred_b_-1to1 column (b when b <= 1, otherwise 1/b)
# This will help in partitioning the data to look just at those that have a FCT by our criteria
# (the same row-by-row loop used to be run twice; once is enough)
bglobalss['inferred_b_-1to1'] = np.where(bglobalss['inferred_b'] <= 1,
                                         bglobalss['inferred_b'],
                                         1/bglobalss['inferred_b'])

# Subset sizes shown on the x axes
subset_sizes = np.arange(3,10)

# this is how it looks with all the data
for e in np.array([1,2,4]):
    for l in np.arange(len(floci)):
    #for l in np.arange(0,1):
        fig,ax = plt.subplots(figsize=(2.5,2.5))
        means = []
        stds = []
        for n in subset_sizes:
            tempss = bglobalss.loc[(bglobalss['main locus'] == floci[l])&(bglobalss['num_mut'] == n)&(bglobalss['envt'] == envts[e])]
            ax.scatter([n]*len(tempss),tempss['inferred_b'])
            means = means + [tempss['inferred_b'].mean()]
            stds = stds + [tempss['inferred_b'].std()]
        ax.plot(subset_sizes,means,color='k')
        ax.plot(subset_sizes,np.add(means,stds),color='xkcd:grey')
        ax.plot(subset_sizes,np.subtract(means,stds),color='xkcd:grey')
        ax.set_xlabel('subset size')
        ax.set_ylabel('global b')
        ax.axhline(y=1,zorder=0,color='xkcd:goldenrod',lw=0.5)
        fig.text(0,1,ploidies[p]+'_'+envts[e]+'_'+floci[l])
        plt.show()


# Do same plots, except for a random set of 9 subsets across the board

for e in np.array([1,2,4]):
    for l in np.arange(len(floci)):
        fig,ax = plt.subplots(figsize=(2.5,2.5))
        means = []
        stds = []
        for n in subset_sizes:
            temp = bglobalss.loc[(bglobalss['num_mut'] == n)&(bglobalss['main locus'] == floci[l])&(bglobalss['envt'] == envts[e])].copy(deep=True)
            tempss = list(OrderedDict.fromkeys(list(temp['subset'])))
            # never ask for more subsets than exist (sample() raises ValueError otherwise)
            my9 = sample(tempss,min(9,len(tempss)))
            tempss = temp[temp['subset'].isin(my9)]
            ax.scatter([n]*len(tempss),tempss['inferred_b'])
            means = means + [tempss['inferred_b'].mean()]
            stds = stds + [tempss['inferred_b'].std()]
        ax.plot(subset_sizes,means,color='k')
        ax.plot(subset_sizes,np.add(means,stds),color='xkcd:grey')
        ax.plot(subset_sizes,np.subtract(means,stds),color='xkcd:grey')
        ax.set_xlabel('subset size')
        ax.set_ylabel('global b')
        ax.axhline(y=1,zorder=0,color='xkcd:goldenrod')
        fig.text(0,1,ploidies[p]+'_'+envts[e]+'_'+floci[l])
        plt.show()

# Now we want to see what the standard deviations are like across the board:
# repeat the random draw `iterations` times and average the standard deviations
iterations = 50

mlist = ['mean_'+str(n) for n in subset_sizes]
slist = ['std_'+str(n) for n in subset_sizes]
clist = mlist + slist


for e in np.array([1,2,4]):
    locus_std_means = []
    for l in np.arange(len(floci)):
        # The rows and subset ids available for each size do not change between iterations
        per_size = []
        for n in subset_sizes:
            temp = bglobalss.loc[(bglobalss['num_mut'] == n)&(bglobalss['main locus'] == floci[l])&(bglobalss['envt'] == envts[e])].copy(deep=True)
            per_size.append((temp,list(OrderedDict.fromkeys(list(temp['subset'])))))

        stdbank_rows = []
        for i in np.arange(iterations):
            means = []
            stds = []
            for temp, subset_ids in per_size:
                my9 = sample(subset_ids,min(9,len(subset_ids)))
                tempss = temp[temp['subset'].isin(my9)]
                # (an ax.scatter call copied from the plotting loop used to sit here; it only drew
                # onto the axes of a figure that has already been shown)
                means = means + [tempss['inferred_b'].mean()]
                stds = stds + [tempss['inferred_b'].std()]
            stdbank_rows.append(means+stds)

        # DataFrame.append no longer exists in pandas >= 2.0; build each table in one go
        stdbank = pd.DataFrame(stdbank_rows,columns=clist)
        locus_std_means.append(stdbank[slist].mean())

    # One row per locus (same order as floci), one column per subset size
    locusbank = pd.DataFrame(locus_std_means).reset_index(drop=True)
    locusbank.insert(0,'locus',floci)

    # plot average standard deviations for all neutral loci, across all subset numbers.
    # A fresh figure is created explicitly: `fig` used to still point at the last figure of the
    # loop above, so the ploidy/environment text never landed on this plot.
    fig,ax = plt.subplots()
    myl = []
    for l in np.arange(len(floci)):
        # neutral here = folded slope with no terms removed ('num term add' == 0) above 0.95 in mr2
        if mr2.loc[(mr2['ploidy'] == 'hap')&(mr2['envt'] == envts[e])&(mr2['main locus'] == floci[l])&(mr2['num term add'] == 0),'inferred_b_-1to1'].values[0] > 0.95:
            ax.plot(subset_sizes,locusbank.iloc[l,1:],label=floci[l])
            myl = myl + [floci[l]]
    ax.set_xlabel('subset size')
    ax.set_ylabel('average standard deviation for N = 9')
    ax.set_ylim(-0.5,4)
    fig.text(0,1,ploidies[p]+'_'+envts[e])
    ax.legend()
    plt.show()

# do same plot but just for neutral loci
# first for 4nqo
#myneutrals = ['BUL2','FAS1','MKT1','NCS2','RPI1']

#for e in np.arange(1,2):
#    for l in np.arange(len(floci)):
#        if floci[l] in myneutrals:
#            plt.plot(np.arange(3,10),locusbank.iloc[l,1:])
#            plt.xlabel('subset size')
#            plt.ylabel('average standard deviation for N = 9')
#    plt.legend(myneutrals)
#    plt.show()


In [11]:
## Analyze the megaremoverv2 variants

# The subset results are read from two files ("1in5" and "4in5") and stacked,
# so we get all the subsets, not just 1 in 5.
mr2ss1 = pd.read_csv('20210712_megaremoverss_1in5.csv')
mr2ss1 = mr2ss1.drop(columns=['Unnamed: 0'])

mr2ss4 = pd.read_csv('20210712_megaremoverss_4in5.csv')
mr2ss4 = mr2ss4.drop(columns=['Unnamed: 0'])

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
mr2ss = pd.concat([mr2ss1,mr2ss4]).reset_index(drop=True)

# Create a main_b_-1to1 column and inferred_b_-1to1 column
# This will help in partitioning the data to look just at those that have a FCT by our criteria
# Values <= 1 are kept as they are, values > 1 are replaced by their reciprocal
# (NaN stays NaN). Done column-wise instead of one row at a time.
for bcol in ['main_b','inferred_b']:
    mr2ss[bcol+'_-1to1'] = mr2ss[bcol].where(mr2ss[bcol] <= 1, 1/mr2ss[bcol])

# Take ratios of the S values
mr2ss['main/inferred'] = mr2ss['main_S']/mr2ss['inferred_S']
mr2ss['1/inferred'] = mr2ss['1_S']/mr2ss['inferred_S']
mr2ss['1/main'] = mr2ss['1_S']/mr2ss['main_S']



# For each ploidy-envt-locus, plot ratios
# Start by doing just those below the FCT threshold b, can toggle

fct_thresh = 0.9

#for p in np.arange(2):
#    for e in np.arange(len(envts)):
#        tempfloci = []
#        for l in np.arange(len(floci)):
#            mr2sssub = mr2ss.loc[(mr2ss['ploidy'] == ploidies[p])&(mr2ss['envt'] == envts[e])&(mr2ss['main locus'] == floci[l])].copy(deep=True).reset_index(drop=True)
#            if mr2sssub.loc[0,'main_b_-1to1'] <= fct_thresh:
#                tempfloci = tempfloci + [floci[l]]
#                plt.plot(mr2sssub['num term add'],mr2sssub['1/main'])
#                #plt.plot(mr2sssub['num term add'],mr2sssub['main/inferred'])
#                #plt.plot(mr2sssub['num term add'],mr2sssub['1/inferred'])
#                #plt.title(ploidies[p]+'-'+envts[e]+'_'+floci[l])
#                #plt.axhline(y=1,color='xkcd:grey',zorder=0)
#                #plt.show()
#        plt.title(ploidies[p]+'-'+envts[e])
#        plt.axhline(y=1,color='xkcd:grey',zorder=0)
#        plt.legend(tempfloci)
#        plt.show()


# Same preparation as for mr2ss, for the "fullco" results file
mr2ssfullco = pd.read_csv('20210712_megaremoverssfullco_1in5.csv')
mr2ssfullco = mr2ssfullco.drop(columns=['Unnamed: 0'])

# Create a main_b_-1to1 column and inferred_b_-1to1 column
# This will help in partitioning the data to look just at those that have a FCT by our criteria
# Values <= 1 are kept as they are, values > 1 are replaced by their reciprocal
# (NaN stays NaN). Done column-wise instead of one row at a time.
for bcol in ['main_b','inferred_b']:
    mr2ssfullco[bcol+'_-1to1'] = mr2ssfullco[bcol].where(mr2ssfullco[bcol] <= 1, 1/mr2ssfullco[bcol])

# Take ratios of the S values
mr2ssfullco['main/inferred'] = mr2ssfullco['main_S']/mr2ssfullco['inferred_S']
mr2ssfullco['1/inferred'] = mr2ssfullco['1_S']/mr2ssfullco['inferred_S']
mr2ssfullco['1/main'] = mr2ssfullco['1_S']/mr2ssfullco['main_S']

# For each ploidy-envt-locus, plot ratios
# Start by doing just those below the FCT threshold b, can toggle

fct_thresh = 0.9

#for p in np.arange(2):
#    for e in np.arange(len(envts)):
#        tempfloci = []
#        for l in np.arange(len(floci)):
#            mr2ssfullcosub = mr2ssfullco.loc[(mr2ssfullco['ploidy'] == ploidies[p])&(mr2ssfullco['envt'] == envts[e])&(mr2ssfullco['main locus'] == floci[l])].copy(deep=True).reset_index(drop=True)
#            if mr2ssfullcosub.loc[0,'main_b_-1to1'] <= fct_thresh:
#                tempfloci = tempfloci + [floci[l]]
#                plt.plot(mr2ssfullcosub['num term add'],mr2ssfullcosub['1/main'])
#                #plt.plot(mr2ssfullcosub['num term add'],mr2ssfullcosub['main/inferred'])
#                #plt.plot(mr2ssfullcosub['num term add'],mr2ssfullcosub['1/inferred'])
#                #plt.title(ploidies[p]+'-'+envts[e]+'_'+floci[l])
#                #plt.axhline(y=1,color='xkcd:grey',zorder=0)
#                #plt.show()
#        plt.title(ploidies[p]+'-'+envts[e])
#        plt.axhline(y=1,color='xkcd:grey',zorder=0)
#        plt.legend(tempfloci)
#        plt.show()


In [12]:
# Forgot to get subset sizes, extract now
# For every subset number: read that subset's output file, keep only the genotype
# rows, and count the loci that are mutated (== 1) in at least one genotype.
# That count is stored as the subset's num_mut.
sizeframes = []
for ss in np.arange(len(ssnums)):

    myssnum = ssnums[ss]

    # only the first ploidy and the first environment are read here
    for p in np.arange(0,1):
        for e in np.arange(0,1):
            data1 = pd.read_csv('20211123_subsets-output/subsets/'+ploidies[p]+'_'+envts[e]+'_subset_'+str(myssnum)+'.txt',
                                        sep='\t',names=['genotype',ploidies[p]+'_'+envts[e]+'_Alex prediction',ploidies[p]+'_'+envts[e]+'_s-obs',ploidies[p]+'_'+envts[e]+'_s-obs-err'],
                                        skiprows=2,skip_blank_lines=False)

            # Chuck out the bottom lines that estimate fitness effects of specific combos of mutations:
            # keep everything above the first row with an empty genotype (IndexError if there is none).
            # Copy the slice so that the column assignments below act on an independent frame.
            blankrows = data1.index[data1['genotype'].isnull()]
            data1 = data1.loc[:blankrows[0]-1,:].copy()

            # Binary style for genotype
            data1['genotype'] = data1['genotype'].astype(int).astype(str).str.zfill(10)

            # Create a column for each locus (character l of the genotype string)
            for l in np.arange(len(floci)):
                data1[floci[l]] = data1.loc[:,'genotype'].str[l].astype(int)

            # loci that are mutated in at least one genotype of this subset
            locihere = [locus for locus in floci if (data1[locus] == 1).any()]

            temp = pd.DataFrame()
            temp.at[0,'subset'] = myssnum
            temp.at[0,'num_mut'] = len(locihere)
            sizeframes.append(temp)

# DataFrame.append was removed in pandas 2.0; concatenate all one-row frames at once
sssize = pd.concat(sizeframes).reset_index(drop=True) if sizeframes else pd.DataFrame()

# Drop any num_mut left by a previous run of this cell so that re-running it does
# not produce num_mut_x / num_mut_y columns.
mr2ssfullco = pd.merge(mr2ssfullco.drop(columns=['num_mut'],errors='ignore'),sssize,on='subset',how='left')
mr2ss = pd.merge(mr2ss.drop(columns=['num_mut'],errors='ignore'),sssize,on='subset',how='left')

# export
export_csv = mr2ss.to_csv(r'20211211_hap_37C_mr2ss.csv',index=True,header=True)
#export
export_csv = mr2ssfullco.to_csv(r'20211211_hap_37C_mr2ssfullco.csv',index=True,header=True)

In [None]:
# Create table to analyze decipherability
# One row per (subset, main locus) pair of mr2ss, holding the values of the pair's
# first row ("original") and of its final row after term removal ("final").
# The same table for the full-coefficient data (mr2ssfullco -> detabssfc) is
# currently disabled; detabssfc is left as an empty frame.

detabssfc = pd.DataFrame()

# first, start with all combinations of subsets and loci
# groupby(sort=False) visits the pairs in order of first appearance and hands over
# each pair's rows directly, so mr2ss is not re-filtered once per pair.
ssl = []
detabrows = []
for (thissubset, thislocus), pairrows in mr2ss.groupby(['subset','main locus'],sort=False):
    ssl.append([thissubset, thislocus])
    temp = pairrows.reset_index(drop=True)

    if len(temp) == 1:
        finalrow = 0
        thisnumtermadd = temp.loc[0,'num term add']
    else:
        # The final row is looked up by label, using the last row's 'num term add'
        # value as the row number.
        # NOTE(review): this presumes row k of a pair has 'num term add' == k - confirm.
        thisnumtermadd = temp.iloc[-1]['num term add']
        finalrow = int(thisnumtermadd)

    detabrows.append({
        'subset': thissubset,
        'ploidy': temp.loc[0,'ploidy'],
        'envt': temp.loc[0,'envt'],
        'main locus': thislocus,
        'N': temp.loc[0,'N'],
        'num_mut': temp.loc[0,'num_mut'],
        'inferred_b_-1to1_original': temp.loc[0,'inferred_b_-1to1'],
        '1/main_final': temp.loc[finalrow,'1/main'],
        'num term add': thisnumtermadd,
        'inferred_b_-1to1_final': temp.loc[finalrow,'inferred_b_-1to1'],
    })

# Build the table once from the collected rows rather than growing it cell by cell.
# Numeric columns keep the dtype of the values they were read from.
detabss = pd.DataFrame(detabrows)

# export
export_csv = detabss.to_csv(r'20211211_hap_37C_detabss.csv',index=True,header=True)
#export
#export_csv = detabssfc.to_csv(r'20211211_hap_37C_detabssfc.csv',index=True,header=True)


#now that we have the table, we can do some analysis
# For every main locus and subset size (3-9), count the rows of detabss with N > 2
# and an original inferred_b_-1to1 <= 0.9, and the fraction of those whose final
# 1/main is <= 1.
trackrows = []
temp = detabss.loc[(detabss['N'] > 2)&(detabss['inferred_b_-1to1_original'] <= 0.9)].copy(deep=True)
# NOTE(review): temp is not filtered by ploidy or envt; the p/e loops only supply the
# labels, which presumes detabss holds a single ploidy-environment - confirm.
for p in np.arange(0,1):
    for e in np.arange(0,1):
        for l in np.arange(len(floci)):
            for n in np.arange(3,10):
                templn = temp.loc[(temp['main locus'] == floci[l])&(temp['num_mut'] == n)]
                countall = len(templn)
                count1mainlessthan1 = int((templn['1/main_final'] <= 1).sum())
                trackrows.append({
                    'ploidy': ploidies[p],
                    'envt': envts[e],
                    'main locus': floci[l],
                    'num_mut': n,
                    'count total': countall,
                    'count 1/main <= 1': count1mainlessthan1,
                    # undefined when no row matches this locus and subset size
                    'fraction <= 1': count1mainlessthan1 / countall if countall else np.nan,
                })
# build the table once from the collected rows rather than growing it cell by cell
mytrack = pd.DataFrame(trackrows)

# now plot the results
for p in np.arange(0,1):
    for e in np.arange(0,1):
        for l in np.arange(len(floci)):
            templ = mytrack.loc[(mytrack['ploidy'] == ploidies[p])&(mytrack['envt'] == envts[e])&(mytrack['main locus'] == floci[l])]
            plt.plot(np.arange(3,10),templ['fraction <= 1'])
        plt.xlabel('subset size')
        plt.ylabel('fraction where 1/main <= 1 after removal')
        plt.title(ploidies[p]+'_'+envts[e])
        plt.legend(floci)
    plt.show()

# Do the same for the "fullco" (full data coefficients) table now.
# NOTE(review): the fullco analysis that follows is commented out, so the two
# assignments below only reset the row counter and create an empty frame.

# now that we have the table, we can do some analysis
myind=0
mytrackfc = pd.DataFrame()
#temp = detabssfc.loc[(detabssfc['N'] > 2)&(detabssfc['inferred_b_-1to1_original'] <= 0.9)].copy(deep=True)
#for p in np.arange(0,1):
#    for e in np.arange(0,1):
#        for l in np.arange(len(floci)):
#            for n in np.arange(3,10):
#                templn = temp.loc[(temp['main locus'] == floci[l])&(temp['num_mut'] == n)].copy(deep=True)
#                count1mainlessthan1 = len(templn.loc[templn['1/main_final'] <= 1])
#                countall = len(templn)
#                mytrackfc.at[myind,'ploidy'] = ploidies[p]
#                mytrackfc.at[myind,'envt'] = envts[e]
#                mytrackfc.at[myind,'main locus'] = floci[l]
#                mytrackfc.at[myind,'num_mut'] = n
#                mytrackfc.at[myind,'count total'] = countall
#                mytrackfc.at[myind,'count 1/main <= 1'] = count1mainlessthan1
#                if len(templn) == 0:
#                    mytrackfc.at[myind,'fraction <= 1'] = np.nan
#                else:
#                    mytrackfc.at[myind,'fraction <= 1'] = count1mainlessthan1 / countall
#                myind=myind+1
                
# now plot the results
#for p in np.arange(0,1):
#    for e in np.arange(0,1):
#        for l in np.arange(len(floci)):
#            templ = mytrackfc.loc[(mytrackfc['ploidy'] == ploidies[p])&(mytrackfc['envt'] == envts[e])&(mytrackfc['main locus'] == floci[l])]
#            plt.plot(np.arange(3,10),templ['fraction <= 1'])
#        plt.xlabel('subset size')
#        plt.ylabel('fraction where 1/main <= 1 after removal')
#        plt.title(ploidies[p]+'_'+envts[e])
#        plt.legend(floci)
#    plt.show()

    
# how many go to 1/main <= 1 with fullco vs non-fullco version of the removal analysis?
# actually, don't do this analysis now, not top priority

# actual next step - look at 1/main final instead of fraction ≤ 1
# Per main locus: scatter the final 1/main against subset size and overlay the
# median for each subset size (3-9), using the rows of detabss with N > 2 and an
# original inferred_b_-1to1 <= 0.9.
temp = detabss.loc[(detabss['N'] > 2)&(detabss['inferred_b_-1to1_original'] <= 0.9)].copy(deep=True)
for p in np.arange(0,1):
    for e in np.arange(0,1):
        for l in np.arange(len(floci)):
            templ = temp.loc[(temp['main locus'] == floci[l])]
            plt.scatter(templ['num_mut'],templ['1/main_final'])
            #myreg = linregress(templ['num_mut'],templ['1/main_final'])
            #plt.plot(np.arange(3,10),np.arange(3,10)*myreg.slope+myreg.intercept)
            # np.median gives NaN for a subset size with no rows
            mymeds = [np.median(templ.loc[templ['num_mut'] == n,'1/main_final']) for n in np.arange(3,10)]
            # Label the median line itself: each locus adds both a scatter and a line,
            # and a labels-only legend call pairs labels with artists purely by order.
            plt.plot(np.arange(3,10),mymeds,label=floci[l])
        plt.xlabel('subset size')
        plt.ylabel('final SSE b=1 / b=global')
        plt.ylim(-0.5,10) #I know this cuts off some points
        plt.title(ploidies[p]+'_'+envts[e])
        plt.legend()
        plt.axhline(y=1,lw=0.5,color='k',zorder=0)
    plt.show()

# now with full data coefficients
# actual next step - look at 1/main final instead of fraction ≤ 1
#temp = detabssfc.loc[(detabssfc['N'] > 2)&(detabssfc['inferred_b_-1to1_original'] <= 0.9)].copy(deep=True)
#for p in np.arange(0,1):
#    for e in np.arange(0,1):
#        for l in np.arange(len(floci)):
#            templ = temp.loc[(temp['main locus'] == floci[l])]
#            plt.scatter(templ['num_mut'],templ['1/main_final'])
#            #myreg = linregress(templ['num_mut'],templ['1/main_final'])
#            #plt.plot(np.arange(3,10),np.arange(3,10)*myreg.slope+myreg.intercept)
#            mymeds = []
#            for n in np.arange(3,10):
#                templn = templ.loc[(templ['num_mut'] == n)]
#                mymeds = mymeds + [np.median(templn['1/main_final'])]
#            plt.plot(np.arange(3,10),mymeds)
#        plt.xlabel('subset size')
#        plt.ylabel('final SSE b=1 / b=global')
#        plt.ylim(-0.5,10) #I know this cuts off some points
#        plt.title(ploidies[p]+'_'+envts[e])
#        plt.legend(floci)
#        plt.axhline(y=1,lw=0.5,color='k',zorder=0)
#    plt.show()
    
    