In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import glob

In [2]:
# read in the data frame, drop the unnecessary index row & transpose it so rows are loci and columns are individuals.
df = pd.read_parquet("~/Documents/UMontana/Research/YNP/AHQsd/AHQsd_analyses/AHQsd_genotypes_filt.parquet")

In [3]:
df = df.reset_index()

In [4]:
# label by what linkage group the loci belong to
tmpDF = pd.DataFrame(columns=['chr','site'])
tmpDF[['chr','site']] = df['index'].str.split('_', expand=True)

In [5]:
df['chr'] = tmpDF['chr']

In [6]:
df['site'] = tmpDF['site']

In [7]:
# split them
for chromo in range(1,15):
    lg_chromo = df[df.chr == str(chromo)]
    lg_chromo.to_csv(f'data/lg_chromo_{chromo}.csv',index=False)

In [8]:
tdf = df.T

In [151]:
def list_name_indivs(ds):
    return ds.columns.to_list()[2:323] 

def create_bins(subset_ds):
    # create bins of 18 SNPs each in each linkage group
    bins = [i for i in np.arange(subset_ds.index.min(), subset_ds.index.max(), 25)] + [np.inf]
    return bins

def create_stats_template(ds, chromo_number):
    # create dataframes for the loop to write to. 
    return pd.DataFrame(columns=["indv_name","count","mean",f"num_of_nans_chr{chromo_number}"])

def cut_by_bins(subset_ds, bins):
    return pd.cut(subset_ds.index,bins=bins)

def create_stats_dataframe(small_list, ds, bins, chromo_number):
    stats_dataframe = create_stats_template(ds, chromo_number)
    chromo_number = ds['chr'].iloc[0]
    for col in small_list:
        mdf = ds[['index',col]]
        grouped_df = mdf.groupby(bins)[col].agg(['mean','count'])
        grouped_df[f"num_of_nans_chr{chromo_number}"] = 25 - grouped_df["count"]
        grouped_df["indv_name"] = col
        stats_dataframe  = stats_dataframe.append(grouped_df)
    return stats_dataframe

def remove_high_NaNs(stats_dataframe, chromo_number):
    stats_dataframe.loc[stats_dataframe[f"num_of_nans_chr{chromo_number}"] > 17, 'mean'] = np.nan
    return stats_dataframe

def narrow(filt_stats_dataframe): 
    filt_stats_narrow = filt_stats_dataframe[['indv_name', 'mean']]
    filt_stats_narrow['bin'] = filt_stats_narrow.index
    filt_stats_wide = filt_stats_narrow.pivot_table(index='bin', columns='indv_name', values='mean', aggfunc='first', dropna=False)
    return filt_stats_wide

#def cleanup_cols_names(df):
    #df.columns = df.columns.str.split('processed/').str[1].tolist()   
    #return df

def create_bin_template():
    col_names = ['level_0', 'first','last']
    return pd.DataFrame(columns=col_names)

def create_append_template():
    col_names = df.columns[2:323]
    return pd.DataFrame(columns=col_names)

def convert_to_genotypes(template_df):
    choiceList = ['BB', 'AA', 'AB']
    all_indvs = template_df.columns
    for i in all_indvs:
        condList = [(template_df[i] > 1.5), (template_df[i] < .5), ((template_df[i] < 1.5) & (template_df[i] > .5))]
        template_df[i] = np.select(condList, choiceList)
    for i in template_df.columns:
        template_df[i].replace('0',np.nan,inplace=True)
    return template_df

def rename_SNP_bins(ds):
    bin_rename = pd.DataFrame(ds['index'].groupby(cut_bins).agg(['first', 'last']).stack())
    bin_rename = bin_rename.reset_index()
    bin_rename = bin_rename.pivot_table(index='level_0', columns='level_1', values =0,aggfunc='first', dropna=False)
    bin_rename = bin_rename.reset_index()
    return bin_rename


In [165]:
subset_file_list = glob.glob('data/*.csv')
ind_list = list_name_indivs(df)

template_df = create_append_template()
bin_df = create_bin_template()

for fil in subset_file_list:
    print(fil)
    #load subset of dataset that was subset by chromosome
    ds = pd.read_csv(fil)
    chromo_number = ds['chr'].iloc[0]
    #create bins
    bins = create_bins(ds)
    #create empty stats template
    stats_template_df = create_stats_template(ds, chromo_number)
    # cut each linkage group into bins of X SNPs each.
    cut_bins = cut_by_bins(ds, bins)
    # start renaming SNP bins
    SNP_bin_rename = rename_SNP_bins(ds)
    # create dataframe of stats
    stats_dataframe = create_stats_dataframe(ind_list, ds, cut_bins, chromo_number)
    # remove dfs with lots of NAs 
    filt_stats_dataframe = remove_high_NaNs(stats_dataframe, chromo_number)
    # pivot magic
    filt_stats_wide = narrow(filt_stats_dataframe)
    #cleanup col names
    #clean_df = cleanup_cols_names(filt_stats_wide)
    template_df = template_df.append(filt_stats_wide,ignore_index=True)
    bin_df = bin_df.append(SNP_bin_rename, ignore_index=True)
#     break
#     cleaned_ds.to_csv('final_data')

data/lg_chromo_9.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filt_stats_narrow['bin'] = filt_stats_narrow.index


data/lg_chromo_8.csv
data/lg_chromo_5.csv
data/lg_chromo_14.csv
data/lg_chromo_4.csv
data/lg_chromo_6.csv
data/lg_chromo_7.csv
data/lg_chromo_3.csv
data/lg_chromo_12.csv
data/lg_chromo_13.csv
data/lg_chromo_2.csv
data/lg_chromo_11.csv
data/lg_chromo_10.csv
data/lg_chromo_1.csv


In [166]:
# label the bins by the means of their SNP positions

tmpDF1 = pd.DataFrame(columns=['chr','site1', 'chr_', 'site2'])
tmpDF1[['chr','site1']] = bin_df['first'].str.split('_', expand=True)
tmpDF1[['chr_','site2']] = bin_df['last'].str.split('_', expand=True)

tmpDF1['sum'] = pd.to_numeric(tmpDF1['site1']) + pd.to_numeric(tmpDF1['site2'])

tmpDF1['mean'] = tmpDF1['sum'] / 2

tmpDF1['bin_mean'] = tmpDF1['chr']+"_"+tmpDF1['mean'].astype(str)

#add the means to the whole spreadsheet
template_df.insert(loc=0, column='bin_mean', value=tmpDF1['bin_mean'])


In [167]:
# add a chr column
template_df.insert(loc=0, column='chr', value=tmpDF1['chr'])

In [168]:
# add a pos column
template_df.insert(loc=0, column='pos', value=tmpDF1['mean'])

In [169]:
#i have figured out almost everything except how to replace the values w/AA AB BB, 
# and I know there is a way to do it without for loops but my brain can't do it right now
template_df.to_parquet("AHQsd_F2_SNPs_windowed.parquet")

In [173]:
td = pd.read_parquet("AHQsd_F2_SNPs_windowed.parquet")

In [174]:
td_geno = convert_to_genotypes(td.iloc[:, 3:324])

In [176]:
td_geno.insert(loc=0, column='bin_mean', value=td['bin_mean'])
td_geno.insert(loc=0, column='chr', value=td['chr'])
td_geno.insert(loc=0, column='pos', value=td['pos'])

In [178]:
td_geno.to_parquet("AHQsd_F2_genotypes_windowed.parquet")