In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import glob

In [2]:
# Load the genotype table from its parquet file.
genotype_path = "~/Documents/UMontana/Research/YNP/AHQsd/AHQsd_analyses/AHQsd_genotypes.parquet"
df = pd.read_parquet(genotype_path)

In [3]:
# Recode every -1 entry as a missing value (NaN).
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0
# and raises AttributeError there.
df = df.replace(-1, np.nan)

In [20]:
# Keep individuals (rows) whose genotype codes sum to more than 10000 across sites.
# numeric_only=True keeps the string 'indv' ID column out of the row sum; without
# it pandas >= 2.0 raises TypeError, whereas older pandas silently skipped
# non-numeric columns, so the selected rows are the same as before.
# NOTE(review): this cell was originally described as "at least 1000 SNPs", but a
# sum of genotype codes is not a count of called sites (an individual with many
# 0-coded calls scores low) -- confirm the intended rule and threshold.
df_indFilt = df[df.sum(axis=1, numeric_only=True) > 10000]

In [22]:
# Relabel the four positive-control samples: swap their plate/well IDs for
# descriptive names wherever those ID strings occur in the table.
pos_control_labels = {
    'AHQsd_5.11C': "T_plate5_pos_control",
    'AHQsd_6.05B': "N_plate6_pos_control",
    'AHQsd_1.08A': "F1C_plate1_pos_control",
    'AHQsd_1.08E': "N_plate1_pos_control",
}
df_indFilt = df_indFilt.replace(pos_control_labels)

In [24]:
# Index by individual ID, then transpose so each row is a site and each column
# is an individual.
indexed_by_indv = df_indFilt.set_index('indv')
tdf = indexed_by_indv.transpose()

In [25]:
# Keep only sites (rows) with fewer than 200 missing genotypes across individuals.
# NOTE(review): this was described as keeping sites with "at least 100" calls;
# that only matches a 200-missing cutoff for a particular number of individuals
# -- confirm against the column count of tdf.
n_missing_per_site = tdf.isna().sum(axis=1)
tdf_siteFilt = tdf[n_missing_per_site < 200]

In [26]:
# Build a table holding only the positive-control samples, used to explore
# which sites are segregating.
pos_control_ids = [
    "T_plate5_pos_control",
    "N_plate6_pos_control",
    "F1C_plate1_pos_control",
    "N_plate1_pos_control",
]
is_pos_control = df_indFilt['indv'].isin(pos_control_ids)
df_pos_controls = df_indFilt[is_pos_control]

In [27]:
# Transpose the control table: rows become sites, columns become control samples.
pos_controls_by_indv = df_pos_controls.set_index('indv')
df_pos_controls_T = pos_controls_by_indv.transpose()

In [28]:
# Keep only SNPs genotyped (non-missing) in at least 2 of the positive controls.
# Present calls are counted directly so the rule holds however many controls are
# in the table; a missing-call cutoff such as `isna().sum(axis=1) < 2` instead
# demands 3 present calls whenever all four controls are in the table.
MIN_POS_CONTROLS_CALLED = 2
n_controls_called = df_pos_controls_T.notna().sum(axis=1)
pos_controls_filt = df_pos_controls_T[n_controls_called >= MIN_POS_CONTROLS_CALLED]

In [11]:
# use apply to make a table with the lists of genotype counts per individual
# filt_df_indvs = tdf_siteFilt.apply(pd.Series.value_counts, axis=0)

In [12]:
# use apply to make a table with the lists of genotype counts per site
# filt_df_sites = tdf_siteFilt.apply(pd.Series.value_counts, axis=1)


In [30]:
# Remove sites where the T or the N positive control is a heterozygote (coded 1.0).
# Both conditions are combined into one mask: filtering tdf_siteFilt in two
# separate assignments makes the second overwrite the first, so the T-control
# filter would be silently lost.
# Sites where a control is missing (NaN) compare as != 1.0 and are kept.
t_control_not_het = tdf_siteFilt['T_plate5_pos_control'] != 1.0
n_control_not_het = tdf_siteFilt['N_plate6_pos_control'] != 1.0
AHQsd_genotypes_filt = tdf_siteFilt[t_control_not_het & n_control_not_het]
# Disabled: additionally filter on the plate-1 N control.
#AHQsd_genotypes_filt = AHQsd_genotypes_filt[AHQsd_genotypes_filt['N_plate1_pos_control'] != 1.0]

In [31]:
# Display the filtered genotype table (sites as rows, individuals as columns).
AHQsd_genotypes_filt

indv,AHQsd_5.01B,AHQsd_5.01C,AHQsd_5.01D,AHQsd_5.01F,AHQsd_5.01G,AHQsd_5.01H,AHQsd_5.02A,AHQsd_5.02B,AHQsd_5.02C,AHQsd_5.02E,...,AHQsd_3.10H,AHQsd_3.11B,AHQsd_3.11C,AHQsd_3.11D,AHQsd_3.11E,AHQsd_3.11G,AHQsd_3.11H,AHQsd_3.12D,AHQsd_3.12E,AHQsd_3.12G
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14_16128,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
14_16156,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,,0.0,0.0,,,,2.0
14_16421,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,2.0
14_16423,,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,,,0.0,0.0,
14_22971,0.0,0.0,1.0,,2.0,2.0,2.0,2.0,1.0,1.0,...,2.0,2.0,2.0,1.0,2.0,,2.0,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11_25506274,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,,2.0,,2.0,,,2.0,,
11_25506288,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,,2.0,,2.0,,,2.0,,
11_25506289,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,,2.0,,2.0,,,2.0,,
11_25508897,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,,2.0,,2.0,,,2.0,,


In [27]:
# Save the filtered genotype table as a parquet file (path is relative to the
# current working directory).
filtered_parquet_path = "AHQsd_genotypes_filt.parquet"
AHQsd_genotypes_filt.to_parquet(filtered_parquet_path)