In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, random_split, DataLoader

# Set a random seed for reproducibility
torch.manual_seed(42)

# Load your data into a pandas DataFrame
data = pd.read_csv('./concatenated_snps.csv', index_col=0)


In [2]:
data

Unnamed: 0,chr2_192381934_C_T,chr8_129199566_G_A,"chr4_126752992_A_AAT,AATAT",chr8_29509616_A_C,chr2_121089731_T_C,chr15_75750383_T_C,chr18_24330269_A_<CN0>,chr18_24337424_C_G,chr5_131640536_A_G,chr16_4008542_CAAAAA_C,...,chr12_223587_A_<CN2>,chr12_266463_A_<CN2>,chr12_292172_T_<CN0>,chr12_293626_A_G,chr16_54682064_G_A,chr9_110303808_TAA_T,chr5_158180107_T_<CN2>,chr5_158244083_C_T,chr11_42844441_C_T,chr12_120832146_C_T
HG00096,1|1,0|1,0|1,1|1,1|0,1|0,0|0,1|1,0|1,1|1,...,0|0,0|0,0|0,0|0,1|1,0|0,0|0,0|1,0|0,0|0
HG00097,1|1,1|0,1|1,1|1,0|0,0|0,0|0,0|1,1|1,1|1,...,0|0,0|0,0|0,1|0,0|0,0|0,0|0,1|0,0|0,1|0
HG00099,1|1,0|0,0|1,0|0,0|1,0|0,0|0,1|1,0|1,1|1,...,0|0,0|0,0|0,1|0,1|1,0|0,0|0,0|1,0|0,0|0
HG00100,0|1,0|0,1|0,0|1,0|0,0|0,0|0,0|1,1|0,1|1,...,0|0,0|0,0|0,1|0,1|1,0|0,0|0,1|0,1|1,0|1
HG00101,1|1,0|0,1|0,1|1,1|0,1|1,0|0,1|1,1|1,1|1,...,0|0,0|0,0|0,0|0,0|1,0|0,0|0,1|1,0|0,0|1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,1|1,0|0,0|0,1|1,0|0,0|0,0|0,0|1,0|1,0|1,...,0|0,0|0,0|0,1|0,1|1,0|1,0|0,1|1,0|0,0|0
NA21141,1|1,0|0,1|1,0|0,0|0,0|1,0|0,1|1,1|1,1|1,...,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,0|1,0|0
NA21142,1|0,1|0,0|0,0|1,0|0,1|0,0|0,0|1,0|1,1|1,...,0|0,0|0,0|0,0|1,0|0,0|0,0|0,1|0,0|1,1|0
NA21143,1|1,0|0,1|0,1|1,0|0,1|1,0|0,0|1,0|1,1|1,...,0|0,0|0,0|0,1|0,1|1,0|0,0|0,1|1,1|0,0|0


In [3]:
# Get the chromosome positions in PRS313 from ./positions/locations.txt
PRS313_positions = pd.read_excel('../PRS313.xlsx')

# Rename the SNPa column to have a "chr" at the beginning
PRS313_positions['SNPa'] = 'chr' + PRS313_positions['SNPa'].astype(str)



In [4]:
# Filter the data to only include the positions in PRS313 

excluded_data = data.loc[:, ~data.columns.isin(PRS313_positions['SNPa'])]


data = data.loc[:, data.columns.isin(PRS313_positions['SNPa'])]

data


Unnamed: 0,chr2_192381934_C_T,chr8_129199566_G_A,chr8_29509616_A_C,chr2_121089731_T_C,chr15_75750383_T_C,chr18_24337424_C_G,chr5_131640536_A_G,chr16_4008542_CAAAAA_C,chr16_10706580_G_A,chr9_110885479_C_T,...,chr1_51467096_CT_C,chr5_345109_T_C,chr6_152023191_G_A,chr12_96027759_A_G,chr12_293626_A_G,chr16_54682064_G_A,chr9_110303808_TAA_T,chr5_158244083_C_T,chr11_42844441_C_T,chr12_120832146_C_T
HG00096,1|1,0|1,1|1,1|0,1|0,1|1,0|1,1|1,0|0,1|1,...,1|1,0|0,0|0,0|0,0|0,1|1,0|0,0|1,0|0,0|0
HG00097,1|1,1|0,1|1,0|0,0|0,0|1,1|1,1|1,0|0,1|0,...,1|1,0|0,0|0,0|0,1|0,0|0,0|0,1|0,0|0,1|0
HG00099,1|1,0|0,0|0,0|1,0|0,1|1,0|1,1|1,0|0,1|0,...,0|0,0|0,1|0,1|0,1|0,1|1,0|0,0|1,0|0,0|0
HG00100,0|1,0|0,0|1,0|0,0|0,0|1,1|0,1|1,0|0,1|1,...,0|0,0|0,1|0,0|0,1|0,1|1,0|0,1|0,1|1,0|1
HG00101,1|1,0|0,1|1,1|0,1|1,1|1,1|1,1|1,0|0,0|1,...,0|0,0|0,1|0,0|1,0|0,0|1,0|0,1|1,0|0,0|1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,1|1,0|0,1|1,0|0,0|0,0|1,0|1,0|1,0|0,1|1,...,1|0,0|0,0|1,1|1,1|0,1|1,0|1,1|1,0|0,0|0
NA21141,1|1,0|0,0|0,0|0,0|1,1|1,1|1,1|1,0|0,1|1,...,0|0,0|0,0|1,1|0,0|0,1|0,0|0,0|0,0|1,0|0
NA21142,1|0,1|0,0|1,0|0,1|0,0|1,0|1,1|1,1|0,1|1,...,1|1,0|0,0|0,0|1,0|1,0|0,0|0,1|0,0|1,1|0
NA21143,1|1,0|0,1|1,0|0,1|1,0|1,0|1,1|1,0|0,1|1,...,1|0,0|0,0|1,1|1,1|0,1|1,0|0,1|1,1|0,0|0


In [5]:
data.max().max()

'1|1'

In [6]:
excluded_data

Unnamed: 0,"chr4_126752992_A_AAT,AATAT",chr18_24330269_A_<CN0>,chr16_10699603_G_<CN0>,chr17_39054216_A_<CN2>,chr17_39237570_G_<CN0>,"chr17_39242768_T_<CN0>,<CN2>",chr2_217955891_T_<CN0>,chr22_45319942_GGGAGGGAGGGGAGGGAAGAGGGGAGGAAGGAGGAGA_G,chr6_130169321_A_<CN2>,chr15_100851435_G_<CN2>,...,"chr10_22861490_A_C,T",chr11_46198107_A_<CN2>,chr11_46202442_T_<CN2>,chr11_46249492_A_<CN2>,chr11_46317010_G_<CN2>,chr5_325948_A_<CN2>,chr12_223587_A_<CN2>,chr12_266463_A_<CN2>,chr12_292172_T_<CN0>,chr5_158180107_T_<CN2>
HG00096,0|1,0|0,0|0,0|0,0|0,0|0,0|0,1|1,0|0,0|0,...,1|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00097,1|1,0|0,0|0,0|0,0|0,0|0,1|0,1|1,0|0,0|0,...,1|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00099,0|1,0|0,0|0,0|0,0|0,0|0,0|0,1|1,0|0,0|0,...,1|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00100,1|0,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,...,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00101,1|0,0|0,0|0,0|0,0|0,0|0,0|0,1|1,0|0,0|0,...,1|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,0|0,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,...,1|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
NA21141,1|1,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,...,1|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
NA21142,0|0,0|0,0|0,0|0,0|0,0|0,0|0,1|1,0|0,0|0,...,1|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
NA21143,1|0,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,...,2|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [7]:
# Find the positions in PRS313 that are not in the data
missing_positions = PRS313_positions.loc[~PRS313_positions['SNPa'].isin(data.columns)]

missing_positions

Unnamed: 0,SNPa,Chromosome,Positionb,Reference Allele,Effect Allele,EAFc,Overall Breast Cancerd,ER-positivee,ER-negativef,hybrid ER-positiveg,hybrid ER-negativeh
63,chr3_63887449_T_TTG,3,63887449,T,TTG,0.129725,0.0648,0.0627,0.043,0.0648,0.0648
68,chr4_126752992_A_AAT,4,126752992,A,AAT,0.516677,-0.0377,-0.0361,-0.0638,-0.0377,-0.0377
73,chr4_187503758_A_T,4,187503758,A,T,0.447053,0.0357,0.0352,0.0194,0.0357,0.0357
75,chr4_84370124_TAA_TA,4,84370124,TAA,TA,0.532408,-0.0464,-0.0438,-0.0489,-0.0464,-0.0464
90,chr5_176134882_T_C,5,176134882,T,C,0.542245,0.0363,0.0368,0.0262,0.0363,0.0363
99,chr5_52679539_C_CA,5,52679539,C,CA,0.099758,0.0571,0.0663,0.042,0.0571,0.0571
142,chr7_91459189_A_ATT,7,91459189,A,ATT,0.32858,0.0452,0.0439,0.0486,0.0452,0.0452
190,chr10_22861490_A_C,10,22861490,A,C,0.936979,0.0875,0.096,0.0201,0.096,0.0201
307,chr22_38583315_AAAAG_AAAAGAAAG,22,38583315,AAAAG,AAAAGAAAG,0.280467,-0.0471,-0.0608,0.0079,-0.0608,0.0079


In [8]:
data

Unnamed: 0,chr2_192381934_C_T,chr8_129199566_G_A,chr8_29509616_A_C,chr2_121089731_T_C,chr15_75750383_T_C,chr18_24337424_C_G,chr5_131640536_A_G,chr16_4008542_CAAAAA_C,chr16_10706580_G_A,chr9_110885479_C_T,...,chr1_51467096_CT_C,chr5_345109_T_C,chr6_152023191_G_A,chr12_96027759_A_G,chr12_293626_A_G,chr16_54682064_G_A,chr9_110303808_TAA_T,chr5_158244083_C_T,chr11_42844441_C_T,chr12_120832146_C_T
HG00096,1|1,0|1,1|1,1|0,1|0,1|1,0|1,1|1,0|0,1|1,...,1|1,0|0,0|0,0|0,0|0,1|1,0|0,0|1,0|0,0|0
HG00097,1|1,1|0,1|1,0|0,0|0,0|1,1|1,1|1,0|0,1|0,...,1|1,0|0,0|0,0|0,1|0,0|0,0|0,1|0,0|0,1|0
HG00099,1|1,0|0,0|0,0|1,0|0,1|1,0|1,1|1,0|0,1|0,...,0|0,0|0,1|0,1|0,1|0,1|1,0|0,0|1,0|0,0|0
HG00100,0|1,0|0,0|1,0|0,0|0,0|1,1|0,1|1,0|0,1|1,...,0|0,0|0,1|0,0|0,1|0,1|1,0|0,1|0,1|1,0|1
HG00101,1|1,0|0,1|1,1|0,1|1,1|1,1|1,1|1,0|0,0|1,...,0|0,0|0,1|0,0|1,0|0,0|1,0|0,1|1,0|0,0|1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,1|1,0|0,1|1,0|0,0|0,0|1,0|1,0|1,0|0,1|1,...,1|0,0|0,0|1,1|1,1|0,1|1,0|1,1|1,0|0,0|0
NA21141,1|1,0|0,0|0,0|0,0|1,1|1,1|1,1|1,0|0,1|1,...,0|0,0|0,0|1,1|0,0|0,1|0,0|0,0|0,0|1,0|0
NA21142,1|0,1|0,0|1,0|0,1|0,0|1,0|1,1|1,1|0,1|1,...,1|1,0|0,0|0,0|1,0|1,0|0,0|0,1|0,0|1,1|0
NA21143,1|1,0|0,1|1,0|0,1|1,0|1,0|1,1|1,0|0,1|1,...,1|0,0|0,0|1,1|1,1|0,1|1,0|0,1|1,1|0,0|0


In [9]:
import numpy as np

def replace_non_index(value, index):
    
    alleles = value.split('|')
    
    processed_alleles = []
    for allele in alleles:
        if int(allele) == 0 or int(allele) != int(index):
            processed_alleles.append("0")
        else:
            processed_alleles.append("1")
    

    if (int(alleles[0]) > 1):
        print("PRS313 SNP: ", index)
        print("Original Data: ",alleles,"Processed Data: ", '|'.join(processed_alleles))

    return '|'.join(processed_alleles)

# Loop through the missing positions and add them to the data with the values in excluded_data
for index, row in missing_positions.iterrows():


    chromosome = row.Chromosome
    position = row.Positionb
    snp = 'chr' + str(chromosome) + '_' + str(position)
    PRS313_alternate_alleles = row["Effect Allele"]

    # Find in excluded data the columns with names that begin with snp
    snp_data = excluded_data.filter(like=snp)

    excluded_data_snp = snp_data.columns[0]

    excluded_data_snp_alternate_alleles = excluded_data_snp.split('_')[3].split(',')
    
    # Find index of PRS313_alternate_alleles in excluded_data_snp_alternate_alleles

    index = excluded_data_snp_alternate_alleles.index(PRS313_alternate_alleles) + 1

    # print("PRS313 SNP: ", PRS313_alternate_alleles)
    # print("Excluded data SNP: ", excluded_data_snp_alternate_alleles)
    # print("Alternate Allele value of PRS313 SNP in 1000 Genomes VCF: ",index)

    snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
            
    
    # Add snp_data to data
    data = pd.concat([data, snp_data], axis=1)

PRS313 SNP:  1
Original Data:  ['2', '1'] Processed Data:  0|1
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '1'] Processed Data:  0|1
PRS313 SNP:  1
Original Data:  ['2', '1'] Processed Data:  0|1
PRS313 SNP:  1
Original Data:  ['2', '1'] Processed Data:  0|1
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '1'] Processed Data:  0|1
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '2'] Processed Data:  0|0
PRS313 SNP:  1
Original Data:  ['2', '1'] Processed Data:  0|1
PRS313 SNP:  1
Original Data:  ['2', '1'] Processed Data:  0|1
PRS313 SNP:  1
Original Data:  ['2', '0'] Processed Dat

  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))
  snp_data = snp_data.applymap(lambda x: replace_non_index(x, index))


In [10]:
data

Unnamed: 0,chr2_192381934_C_T,chr8_129199566_G_A,chr8_29509616_A_C,chr2_121089731_T_C,chr15_75750383_T_C,chr18_24337424_C_G,chr5_131640536_A_G,chr16_4008542_CAAAAA_C,chr16_10706580_G_A,chr9_110885479_C_T,...,chr12_120832146_C_T,"chr3_63887449_T_TTG,TTGTG","chr4_126752992_A_AAT,AATAT","chr4_187503758_A_G,T","chr4_84370124_TAA_TA,T","chr5_176134882_T_A,C","chr5_52679539_C_CA,CAA","chr7_91459189_A_AT,ATT","chr10_22861490_A_C,T","chr22_38583315_AAAAG_AAAAGAAAG,AAAAGAAAGAAAG,A"
HG00096,1|1,0|1,1|1,1|0,1|0,1|1,0|1,1|1,0|0,1|1,...,0|0,0|0,0|1,0|1,0|1,0|1,0|0,1|0,1|1,0|0
HG00097,1|1,1|0,1|1,0|0,0|0,0|1,1|1,1|1,0|0,1|0,...,1|0,0|1,1|1,0|1,1|0,1|0,0|0,1|0,1|1,0|0
HG00099,1|1,0|0,0|0,0|1,0|0,1|1,0|1,1|1,0|0,1|0,...,0|0,0|0,0|1,1|1,0|0,0|0,0|0,1|1,1|1,1|1
HG00100,0|1,0|0,0|1,0|0,0|0,0|1,1|0,1|1,0|0,1|1,...,0|1,1|0,1|0,1|0,1|1,1|1,0|0,0|1,0|1,0|0
HG00101,1|1,0|0,1|1,1|0,1|1,1|1,1|1,1|1,0|0,0|1,...,0|1,0|1,1|0,1|0,1|0,1|0,0|0,0|0,1|1,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,1|1,0|0,1|1,0|0,0|0,0|1,0|1,0|1,0|0,1|1,...,0|0,0|0,0|0,0|1,1|0,1|0,0|0,0|0,1|1,1|0
NA21141,1|1,0|0,0|0,0|0,0|1,1|1,1|1,1|1,0|0,1|1,...,0|0,0|0,1|1,0|0,1|0,1|1,0|0,1|1,1|1,0|1
NA21142,1|0,1|0,0|1,0|0,1|0,0|1,0|1,1|1,1|0,1|1,...,1|0,0|1,0|0,0|1,0|1,1|1,0|0,0|1,1|1,0|1
NA21143,1|1,0|0,1|1,0|0,1|1,0|1,0|1,1|1,0|0,1|1,...,0|0,0|0,1|0,0|1,1|1,0|0,1|0,1|0,0|1,1|0


In [11]:
data.to_csv('./concatenated_snps_processed.csv')

In [312]:
# Split into maternal/paternal columns
new_cols = []
for col in data.columns:
    maternal_col = f"{col.replace('_', ':')}_maternal"
    paternal_col = f"{col.replace('_', ':')}_paternal"
    new_cols.extend([maternal_col, paternal_col])

# Update the new columns with the desired values
for col in data.columns:
    maternal_col = f"{col.replace('_', ':')}_maternal"
    paternal_col = f"{col.replace('_', ':')}_paternal"
    print(data[col])
    data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
    data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]

# Reorder the columns
data = data[new_cols]

HG00096    1|1
HG00097    1|1
HG00099    1|1
HG00100    0|1
HG00101    1|1
          ... 
NA21137    1|1
NA21141    1|1
NA21142    1|0
NA21143    1|1
NA21144    1|1
Name: chr2_192381934_C_T, Length: 2504, dtype: object
HG00096    0|1
HG00097    1|0
HG00099    0|0
HG00100    0|0
HG00101    0|0
          ... 
NA21137    0|0
NA21141    0|0
NA21142    1|0
NA21143    0|0
NA21144    0|0
Name: chr8_129199566_G_A, Length: 2504, dtype: object
HG00096    1|1
HG00097    1|1
HG00099    0|0
HG00100    0|1
HG00101    1|1
          ... 
NA21137    1|1
NA21141    0|0
NA21142    0|1
NA21143    1|1
NA21144    0|1
Name: chr8_29509616_A_C, Length: 2504, dtype: object
HG00096    1|0
HG00097    0|0
HG00099    0|1
HG00100    0|0
HG00101    1|0
          ... 
NA21137    0|0
NA21141    0|0
NA21142    0|0
NA21143    0|0
NA21144    0|0
Name: chr2_121089731_T_C, Length: 2504, dtype: object
HG00096    1|0
HG00097    0|0
HG00099    0|0
HG00100    0|0
HG00101    1|1
          ... 
NA21137    0|0
NA21141    0|1
NA211

  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [i

HG00096    0|1
HG00097    0|1
HG00099    1|1
HG00100    1|0
HG00101    1|0
          ... 
NA21137    0|1
NA21141    0|0
NA21142    0|1
NA21143    0|1
NA21144    0|0
Name: chr4_187503758_A_G,T, Length: 2504, dtype: object
HG00096    0|1
HG00097    1|0
HG00099    0|0
HG00100    1|1
HG00101    1|0
          ... 
NA21137    1|0
NA21141    1|0
NA21142    0|1
NA21143    1|1
NA21144    1|0
Name: chr4_84370124_TAA_TA,T, Length: 2504, dtype: object
HG00096    0|1
HG00097    1|0
HG00099    0|0
HG00100    1|1
HG00101    1|0
          ... 
NA21137    1|0
NA21141    1|1
NA21142    1|1
NA21143    0|0
NA21144    1|1
Name: chr5_176134882_T_A,C, Length: 2504, dtype: object
HG00096    0|0
HG00097    0|0
HG00099    0|0
HG00100    0|0
HG00101    0|0
          ... 
NA21137    0|0
NA21141    0|0
NA21142    0|0
NA21143    1|0
NA21144    0|0
Name: chr5_52679539_C_CA,CAA, Length: 2504, dtype: object
HG00096    1|0
HG00097    1|0
HG00099    1|1
HG00100    0|1
HG00101    0|0
          ... 
NA21137    0|0
NA21141

  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [int(allele.split("|")[1]) for allele in data[col]]
  data[maternal_col] = [int(allele.split("|")[0]) for allele in data[col]]
  data[paternal_col] = [i

In [313]:
data

Unnamed: 0,chr2:192381934:C:T_maternal,chr2:192381934:C:T_paternal,chr8:129199566:G:A_maternal,chr8:129199566:G:A_paternal,chr8:29509616:A:C_maternal,chr8:29509616:A:C_paternal,chr2:121089731:T:C_maternal,chr2:121089731:T:C_paternal,chr15:75750383:T:C_maternal,chr15:75750383:T:C_paternal,...,"chr5:176134882:T:A,C_maternal","chr5:176134882:T:A,C_paternal","chr5:52679539:C:CA,CAA_maternal","chr5:52679539:C:CA,CAA_paternal","chr7:91459189:A:AT,ATT_maternal","chr7:91459189:A:AT,ATT_paternal","chr10:22861490:A:C,T_maternal","chr10:22861490:A:C,T_paternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_maternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_paternal"
HG00096,1,1,0,1,1,1,1,0,1,0,...,0,1,0,0,1,0,1,1,0,0
HG00097,1,1,1,0,1,1,0,0,0,0,...,1,0,0,0,1,0,1,1,0,0
HG00099,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,1,1,1,1
HG00100,0,1,0,0,0,1,0,0,0,0,...,1,1,0,0,0,1,0,1,0,0
HG00101,1,1,0,0,1,1,1,0,1,1,...,1,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,1,1,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,1,1,1,0
NA21141,1,1,0,0,0,0,0,0,0,1,...,1,1,0,0,1,1,1,1,0,1
NA21142,1,0,1,0,0,1,0,0,1,0,...,1,1,0,0,0,1,1,1,0,1
NA21143,1,1,0,0,1,1,0,0,1,1,...,0,0,1,0,1,0,0,1,1,0


In [314]:
# Get the number of rows where a value of greater than 1 appeared
data.loc[(data > 1).any(axis=1)]


Unnamed: 0,chr2:192381934:C:T_maternal,chr2:192381934:C:T_paternal,chr8:129199566:G:A_maternal,chr8:129199566:G:A_paternal,chr8:29509616:A:C_maternal,chr8:29509616:A:C_paternal,chr2:121089731:T:C_maternal,chr2:121089731:T:C_paternal,chr15:75750383:T:C_maternal,chr15:75750383:T:C_paternal,...,"chr5:176134882:T:A,C_maternal","chr5:176134882:T:A,C_paternal","chr5:52679539:C:CA,CAA_maternal","chr5:52679539:C:CA,CAA_paternal","chr7:91459189:A:AT,ATT_maternal","chr7:91459189:A:AT,ATT_paternal","chr10:22861490:A:C,T_maternal","chr10:22861490:A:C,T_paternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_maternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_paternal"


In [315]:
# Get the number of columns where a value of greater than 1 appeared

data.loc[:, (data > 1).any()]

HG00096
HG00097
HG00099
HG00100
HG00101
...
NA21137
NA21141
NA21142
NA21143
NA21144


In [316]:
# Convert all the values to booleans
data = data.astype(bool)
data.dtypes

chr2:192381934:C:T_maternal                                bool
chr2:192381934:C:T_paternal                                bool
chr8:129199566:G:A_maternal                                bool
chr8:129199566:G:A_paternal                                bool
chr8:29509616:A:C_maternal                                 bool
                                                           ... 
chr7:91459189:A:AT,ATT_paternal                            bool
chr10:22861490:A:C,T_maternal                              bool
chr10:22861490:A:C,T_paternal                              bool
chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_maternal    bool
chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_paternal    bool
Length: 626, dtype: object

In [320]:
data

Unnamed: 0,chr2:192381934:C:T_maternal,chr2:192381934:C:T_paternal,chr8:129199566:G:A_maternal,chr8:129199566:G:A_paternal,chr8:29509616:A:C_maternal,chr8:29509616:A:C_paternal,chr2:121089731:T:C_maternal,chr2:121089731:T:C_paternal,chr15:75750383:T:C_maternal,chr15:75750383:T:C_paternal,...,"chr5:176134882:T:A,C_maternal","chr5:176134882:T:A,C_paternal","chr5:52679539:C:CA,CAA_maternal","chr5:52679539:C:CA,CAA_paternal","chr7:91459189:A:AT,ATT_maternal","chr7:91459189:A:AT,ATT_paternal","chr10:22861490:A:C,T_maternal","chr10:22861490:A:C,T_paternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_maternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_paternal"
HG00096,True,True,False,True,True,True,True,False,True,False,...,False,True,False,False,True,False,True,True,False,False
HG00097,True,True,True,False,True,True,False,False,False,False,...,True,False,False,False,True,False,True,True,False,False
HG00099,True,True,False,False,False,False,False,True,False,False,...,False,False,False,False,True,True,True,True,True,True
HG00100,False,True,False,False,False,True,False,False,False,False,...,True,True,False,False,False,True,False,True,False,False
HG00101,True,True,False,False,True,True,True,False,True,True,...,True,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,True,True,False,False,True,True,False,False,False,False,...,True,False,False,False,False,False,True,True,True,False
NA21141,True,True,False,False,False,False,False,False,False,True,...,True,True,False,False,True,True,True,True,False,True
NA21142,True,False,True,False,False,True,False,False,True,False,...,True,True,False,False,False,True,True,True,False,True
NA21143,True,True,False,False,True,True,False,False,True,True,...,False,False,True,False,True,False,False,True,True,False


In [321]:
# Save the dataframe
data.to_csv('./concatenated_snps_processed.csv')

In [None]:
# Sort