In [1]:
import boto3
import collections

import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:

# Initialize S3 client
s3 = boto3.client('s3', region_name='us-west-2')

# Define bucket and prefix
bucket_name = 'human-pangenomics'
prefix = 'working/HPRC/'

# Function to recursively list all files in the specified prefix
def list_s3_files(bucket, prefix):
    files = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        if 'Contents' in page:
            for obj in page['Contents']:
                files.append(obj['Key'])
    return files

# List files
all_files = list_s3_files(bucket_name, prefix)

# Print all files with the desired s3 path format
s3_paths = [f"s3://{bucket_name}/{file_path}" for file_path in all_files]
illumina_working = [illumina for illumina in s3_paths if 'Illumina' in illumina]

illumina_df = pd.DataFrame(illumina_working, columns=['path'])

# Extract sample_id from file paths and create a new column
illumina_df['sample_ID'] = illumina_df['path'].str.extract(r'/HPRC/([^/]+)/')
illumina_df = illumina_df[~illumina_df['path'].str.endswith(('.final.cram.crai', '.final.cram.md5'))]

# child_paths = illumina_df[illumina_df['path'].str.contains('/child/')]
# parent_paths = illumina_df[illumina_df['path'].str.contains('/parents/')]
# parent_paths['parent_ID'] = [parent.split('/')[-1].split('.')[0] for parent in parent_paths['path'].tolist()]
# parent_paths.drop(columns=['sample_ID'],inplace=True)
# parent_paths.rename(columns={'parent_ID':'sample_ID'},inplace=True)

In [3]:
pedigree_df = pd.read_csv('/private/groups/hprc/human-pangenomics/hprc-synapse-1/HPRC_metadata/data/IGSR/20130606_g1k.ped',sep='\t')

In [4]:
illumina_table = pd.read_csv('Illumina_final_table.csv')
print(illumina_table.sample_ID.nunique())

281


In [5]:
trio_list = list()
for k,v in collections.Counter(illumina_df['sample_ID'].tolist()).items():
	if v != 1:
		trio_list.append(k)

In [31]:
len(collections.Counter(illumina_df['sample_ID'].tolist()).keys())

270

In [32]:
illumina_table.shape # 16 samples are HPRC+

(281, 20)

In [33]:
# aws working trios
illumina_trios = illumina_df[illumina_df['sample_ID'].isin(trio_list)]
illumina_trios['trio'] = [sample.split('/')[-1].split('.')[0] for sample in illumina_trios['path'].tolist()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  illumina_trios['trio'] = [sample.split('/')[-1].split('.')[0] for sample in illumina_trios['path'].tolist()]


In [34]:
# Trio Children
illumina_trio_table = illumina_table[illumina_table['sample_ID'].isin(illumina_trios[illumina_trios['sample_ID'] == illumina_trios['trio']].sample_ID.tolist())]


In [35]:
pd.merge(illumina_trio_table,
		 illumina_trios[illumina_trios['trio'].isin(illumina_trio_table['sample_ID'])][['path','sample_ID']],
		 on='sample_ID',
		 how='inner').to_csv('HPRC_Illumina_Child.sample.index.csv',index=False)

In [36]:
# subset to maternal and paternal
illumina_parental_trios = illumina_trios[illumina_trios['sample_ID'] != illumina_trios['trio']]

# subset the trio children
illumina_parental_trios = illumina_table[illumina_table['sample_ID'].isin(illumina_parental_trios['trio'].unique())]

In [37]:
illumina_parental_trios.shape

(0, 20)

In [30]:
# no parental data
# illumina_table[illumina_table['sample_ID'].isin(illumina_trios['trio'].unique())]

In [39]:
illumina_remain_table = illumina_table[~illumina_table['sample_ID'].isin(illumina_trio_table['sample_ID'].tolist())]

In [41]:
illumina_remain_table.shape

(249, 20)

In [43]:
# # samples that are mother father, but no children data available
# illumina_remain_table[illumina_remain_table['Relationship'].isin(['mother','father'])].shape

In [44]:
# # samples that are mother father, but no children data available
illumina_remain_table[illumina_remain_table['Relationship'].isin(['mother','father'])].shape

(67, 20)

In [45]:
illumina_sample_table = pd.merge(illumina_remain_table,illumina_df[~illumina_df['sample_ID'].isin(trio_list)], on='sample_ID', how='inner')

In [47]:
illumina_sample_table.shape

(233, 21)

In [99]:
illumina_sample_table.to_csv('HPRC_Illumina.sample.index.csv',index=False)

In [87]:
# Filter out rows in illumina_df where sample_ID is in trio_list
filtered_illumina_df = illumina_df[~illumina_df['sample_ID'].isin(trio_list)]

# Check if sample_IDs in illumina_remain_table are not in the filtered illumina_df
result = illumina_remain_table[~illumina_remain_table['sample_ID'].isin(filtered_illumina_df['sample_ID'])]

result.shape


(16, 20)

In [49]:
illumina_remain_table.shape

(249, 20)

In [91]:
illumina_remain_table.shape[0] - 16

233

In [None]:
# assembly index table
assembly_index = pd.read_csv('/private/groups/hprc/human-pangenomics/hprc-data-index/hprc_intermediate_assembly/data_tables/assemblies_pre_release_v0.2.index.csv')


In [52]:
illumina_child_table = pd.merge(illumina_trio_table,
		 illumina_trios[illumina_trios['trio'].isin(illumina_trio_table['sample_ID'])][['path','sample_ID']],
		 on='sample_ID',
		 how='inner')

In [58]:
assembly_index[assembly_index['sample_id'].isin(illumina_child_table['sample_ID'].tolist())].shape

(30, 12)

In [61]:
illumina_sample_table[illumina_sample_table['sample_ID'].isin(assembly_index['sample_id'].tolist())].to_csv('/private/groups/hprc/human-pangenomics/hprc-data-index/hprc_intermediate_assembly/data_tables/data_Illumina_pre_release.index.csv',index=False)

Unnamed: 0,sample_ID,total_bp,coverage,filetype,instrument_model,library_construction_protocol,library_layout,library_strategy,read_length,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,Third Order,Other Comments,path
0,HG00097,114209267700,36.84,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00097,0,0,2,0,GBR,unrel,0,0,0,0,s3://human-pangenomics/working/HPRC/HG00097/ra...
3,HG00126,139636375200,45.04,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00126,0,0,1,0,GBR,unrel,0,0,0,0,s3://human-pangenomics/working/HPRC/HG00126/ra...
4,HG00128,147460895100,47.57,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00128,0,0,2,0,GBR,unrel,0,0,0,0,s3://human-pangenomics/working/HPRC/HG00128/ra...
5,HG00133,170378106600,54.96,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00133,0,0,2,0,GBR,unrel,0,0,0,0,s3://human-pangenomics/working/HPRC/HG00133/ra...
6,HG00146,147092152200,47.45,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,GBR002,0,0,2,0,GBR,child,HG00147,0,0,0,s3://human-pangenomics/working/HPRC/HG00146/ra...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,HG02976,114203101500,36.84,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,NG24,0,0,2,0,ESN,mother,0,0,0,0,s3://human-pangenomics/working/HPRC/HG02976/ra...
229,HG02135,117794844000,38.00,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,VN081,HG02137,HG02136,1,0,KHV,child,0,0,0,0,s3://human-pangenomics/working/HPRC/HG02135/ra...
230,HG02071,119457197700,38.53,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,VN065,HG02073,HG02072,1,0,KHV,child,0,0,0,0,s3://human-pangenomics/working/HPRC/HG02071/ra...
231,HG03710,115625752800,37.30,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,PK62,HG03708,HG03709,1,0,PJL,child,0,0,0,0,s3://human-pangenomics/working/HPRC/HG03710/ra...


In [47]:
[sample for sample in illumina_trios['trio'].tolist() if sample in illumina_table['sample_ID'].tolist()]

[]

In [39]:
len(set([sample for sample in illumina_trios['sample_ID'].tolist() + illumina_trios['trio'].tolist() if sample in illumina_table['sample_ID'].tolist()]))

32

In [11]:
# print(illumina_df.shape)
# print(parent_paths.shape)
# print(child_paths.shape)

In [10]:
assembly_index.head()

Unnamed: 0,sample_id,phasing,hap1_genbank_accession,hap2_genbank_accession,hap2_fa_gz_md5,hap2_fa_gz_fai,hap2_fa_gz_gzi,hap2_fa_gz,hap1_fa_gz_md5,hap1_fa_gz_fai,hap1_fa_gz_gzi,hap1_fa_gz
0,HG00408,trio,GCA_041900255.1,GCA_041900245.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...
1,HG00597,trio,GCA_041900365.1,GCA_041900265.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...
2,HG01192,trio,GCA_041900145.1,GCA_041900275.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...
3,HG01261,trio,GCA_041900235.1,GCA_041899995.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...
4,HG02015,trio,GCA_041900165.1,GCA_041900105.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...


In [174]:
# subset to illumina trios in assembly index
illumina_trio = illumina_table[illumina_table['sample_ID'].isin(assembly_index[assembly_index['phasing'].isin(['trio'])].sample_id.tolist())] 

In [2]:
child = pd.read_csv('HPRC_Illumina_Child.sample.index.csv')
parent = pd.read_csv('HPRC_Illumina_Parental.sample.index.csv')
nore = pd.read_csv('HPRC_Illumina.sample.index.csv')