# Generating trio-specific files

## Load and reformat data

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Pedigree table of the 1000 Genomes trios, read from the project data directory.
trios_csv = "../../data/1000Genomes/1000_genomes_triad_pedigrees.csv"
df_trios = pd.read_csv(trios_csv)

# Overview of columns, dtypes and non-null counts.
df_trios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1806 entries, 0 to 1805
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     1806 non-null   int64 
 1   triadID        1806 non-null   int64 
 2   sampleID       1806 non-null   object
 3   role           1806 non-null   object
 4   pgx_id         1806 non-null   object
 5   sex            1806 non-null   object
 6   ENA_FILE_PATH  1806 non-null   object
dtypes: int64(2), object(5)
memory usage: 98.9+ KB


In [3]:
# Preview the first rows of the pedigree table.
df_trios.head()

Unnamed: 0.1,Unnamed: 0,triadID,sampleID,role,pgx_id,sex,ENA_FILE_PATH
0,0,0,HG00403,father,onekgbs-HG00403,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
1,1,0,HG00404,mother,onekgbs-HG00404,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
2,2,0,HG00405,child,onekgbs-HG00405,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR398/ERR398...
3,3,1,HG00406,father,onekgbs-HG00406,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
4,4,1,HG00407,mother,onekgbs-HG00407,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...


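The Progenetix output file needs some reformatting before it can be merged with the pedigree table: prefix the chromosome names with `chr`, extract the sample ID from `biosample_id` (the second `-`-separated field), and rename the `info.cn_count`, `location.start` and `location.end` columns to `cn`, `start` and `end`.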

In [4]:
# Load the Progenetix CNV export and reshape it into the columns needed for
# the per-sample BED files: cn, start, end, chromosome, sampleID.
df_cnvs = (
    pd.read_csv("../../data/1000Genomes/2024-04-02_trio_cn_variants.csv")
    .assign(
        # Prefix every chromosome label with "chr" and store it as a categorical.
        chromosome=lambda raw: pd.Categorical(
            [f"chr{chrom}" for chrom in raw["location.chromosome"]]
        ),
        # Keep the second hyphen-separated field of biosample_id as the sample ID.
        sampleID=lambda raw: [
            fields[1] for fields in raw["biosample_id"].str.split("-")
        ],
    )
    .rename(
        columns={
            "info.cn_count": "cn",
            "location.start": "start",
            "location.end": "end",
        }
    )
    # The source columns are no longer needed once their derived versions exist.
    .drop(columns=["location.chromosome", "biosample_id"])
)

df_cnvs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104925 entries, 0 to 1104924
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype   
---  ------      --------------    -----   
 0   cn          1104925 non-null  int64   
 1   start       1104925 non-null  int64   
 2   end         1104925 non-null  int64   
 3   chromosome  1104925 non-null  category
 4   sampleID    1104925 non-null  object  
dtypes: category(1), int64(3), object(1)
memory usage: 34.8+ MB


In [5]:
# Summary statistics of the numeric columns (cn, start, end).
df_cnvs.describe()

Unnamed: 0,cn,start,end
count,1104925.0,1104925.0,1104925.0
mean,1.943783,74321920.0,74362520.0
std,9.577667,56284100.0,56257260.0
min,0.0,11534.0,13775.0
25%,1.0,29055190.0,29124870.0
50%,1.0,59419460.0,59430020.0
75%,1.0,112148900.0,112161900.0
max,589.0,248926000.0,248930600.0


In [6]:
# Preview the first rows of the reformatted CNV table.
df_cnvs.head()

Unnamed: 0,cn,start,end,chromosome,sampleID
0,1,14108578,14111579,chr1,HG00403
1,0,2650426,2651463,chr1,HG00403
2,1,93821799,93825244,chr1,HG00403
3,1,84246288,84250383,chr1,HG00403
4,1,103362729,103365638,chr1,HG00403


## Merge dataframes, iterate over trios and write to files

In [7]:
output_dir = "../../data/1000Genomes/cnvs_per_trio/"

# Write one headerless, tab-separated BED file per (trio, sample) pair:
#   <output_dir>/<triadID>/<sampleID>_cnvs.bed
# The left merge keeps every pedigree row, so a sample without any CNVs shows
# up as a row whose CNV columns are NaN; such samples are reported and skipped.
for (triad_id, sample_id), data in (
    df_trios.merge(df_cnvs, on="sampleID", how="left")
    .groupby(["triadID", "sampleID"])
):
    if data["cn"].isna().any():
        print(f"No CNVs for {sample_id}")
        continue

    # Each trio gets its own sub-directory. Create it on demand so the cell
    # also runs on a fresh data directory (to_csv does not create parents).
    trio_dir = os.path.join(output_dir, str(triad_id))
    os.makedirs(trio_dir, exist_ok=True)
    output_file = os.path.join(trio_dir, f"{sample_id}_cnvs.bed")

    # The NaNs introduced by the left merge upcast the integer columns to
    # float; cast back so the BED file contains integers.
    bed = data[["chromosome", "start", "end", "cn"]].astype(
        {"start": int, "end": int, "cn": int}
    )

    bed.to_csv(output_file, index=False, header=False, sep="\t")

No CNVs for NA20358


Check: there should be 1805 BED files: the pedigree table has 1806 rows (one per sample per trio), minus the one sample (NA20358) for which there were no CNVs. It is unclear whether this is because this sample is perfectly CN 2 or because it was excluded from Progenetix somehow.

**Note:** some bed files will actually be duplicates since some samples belong to more than one trio.  

In [9]:
# Count the *_cnvs.bed files found under the per-trio output directory.
!find ../../data/1000Genomes/cnvs_per_trio/ -name "*_cnvs.bed" | wc -l

    1805
