In [18]:
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import glob

In [19]:
# Load the raw F2 genotype table from its parquet file into `df`.
# NOTE(review): the path is specific to one user's home directory — consider
# moving it to a configurable data directory.
df = pd.read_parquet(
    "~/Documents/UMontana/Research/YNP/AHQsd/AHQsd_analyses/AHQsd_F2_genotypes_raw.parquet"
)

In [20]:
# Recode the -1 placeholder as NaN so that pandas' missing-data methods
# (isna, notna, ...) used in the later filters recognise it.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace(-1, np.nan)

In [21]:
# Keep only individuals (rows) with more than 400 non-missing genotype calls.
# Calls are counted with notna() rather than summed: a sum of genotype codes
# depends on the genotypes themselves (an individual whose calls are all 0 sums
# to 0 no matter how many SNPs were called), and summing across the string
# `indv` column raises TypeError on pandas >= 2.0, where `numeric_only`
# defaults to False.
# NOTE(review): a previous version of this comment said 1000 SNPs while the
# code used 400; the threshold from the code is kept — confirm the cutoff.
MIN_CALLS_PER_INDIVIDUAL = 400
calls_per_individual = df.drop(columns="indv").notna().sum(axis=1)
df_indFilt = df[calls_per_individual > MIN_CALLS_PER_INDIVIDUAL]

In [22]:
# Relabel the positive-control individuals with descriptive names.
# The sample IDs are values of the `indv` column (not column labels), so the
# mapping is applied to that column only, in a single pass, instead of
# scanning every genotype column once per control.
POS_CONTROL_LABELS = {
    "AHQsd_5.11C": "T_plate5_pos_control",
    "AHQsd_6.05B": "N_plate6_pos_control",
    "AHQsd_1.08A": "F1C_plate1_pos_control",
    "AHQsd_1.08E": "N_plate1_pos_control",
}
df_indFilt = df_indFilt.assign(indv=df_indFilt["indv"].replace(POS_CONTROL_LABELS))

In [23]:
# Index the filtered table by individual ID, then flip it so each row is a
# site and each column is an individual.
tdf = df_indFilt.set_index("indv").transpose()

In [24]:
# Keep only sites (rows) with fewer than 200 missing genotype calls across
# the retained individuals.
# NOTE(review): 200 is an absolute count of missing calls, so the minimum
# number of called individuals per site depends on how many individuals
# survived the individual-level filter — confirm this is the intended cutoff.
missing_per_site = tdf.isna().sum(axis=1)
tdf_siteFilt = tdf.loc[missing_per_site < 200]

In [25]:
# Pull the rows of the four relabelled positive controls into their own table
# so their genotypes can be inspected for segregating sites.
pos_control_ids = [
    "T_plate5_pos_control",
    "N_plate6_pos_control",
    "F1C_plate1_pos_control",
    "N_plate1_pos_control",
]
df_pos_controls = df_indFilt[df_indFilt["indv"].isin(pos_control_ids)]

In [13]:
# Flip the positive-control table: one row per site, one column per control.
df_pos_controls_T = df_pos_controls.set_index("indv").transpose()

In [14]:
# Keep only SNPs that are missing in fewer than 2 of the positive controls,
# i.e. at most one control may lack a call at the site.
# NOTE(review): this is stricter than "present in at least 2 controls" when
# all four controls are in the table — confirm which rule is intended.
missing_per_snp = df_pos_controls_T.isna().sum(axis=1)
pos_controls_filt = df_pos_controls_T.loc[missing_per_snp < 2]

In [15]:
# Tabulate, for every individual (column), how many sites carry each genotype
# value; the result has one row per distinct value and one column per individual.
filt_df_indvs = tdf_siteFilt.apply(lambda calls: calls.value_counts(), axis="index")

In [215]:
# Tabulate, for every site (row), how many individuals carry each genotype
# value; the result has one row per site and one column per distinct value.
filt_df_sites = tdf_siteFilt.apply(lambda calls: calls.value_counts(), axis="columns")


In [26]:
# Remove sites where any of the T or N positive controls is heterozygous
# (genotype 1.0). The three conditions are combined into one mask: filtering
# `tdf_siteFilt` afresh on each line would let only the last condition take
# effect. Sites where a control has no call (NaN) are kept, exactly as a
# `!= 1.0` comparison would keep them.
parental_controls = [
    "T_plate5_pos_control",
    "N_plate6_pos_control",
    "N_plate1_pos_control",
]
any_control_het = tdf_siteFilt[parental_controls].eq(1.0).any(axis=1)
AHQsd_genotypes_filt = tdf_siteFilt[~any_control_het]

In [27]:
# Write the filtered genotype table to a parquet file in the current
# working directory.
AHQsd_genotypes_filt.to_parquet(path="AHQsd_genotypes_filt.parquet")