# Reformatting 1000 Genomes triads pedigree file

Original file here: [http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/1kGP.3202_samples.pedigree_info.txt](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/1kGP.3202_samples.pedigree_info.txt)

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_pedigree = (pd.read_csv("../../data/1000Genomes/1kGP.3202_samples.pedigree_info.txt", sep=" ")
               .assign(sex = lambda x: ['M' if i == 1 else 'F' for i in x["sex"]]))
df_pedigree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3202 entries, 0 to 3201
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sampleID  3202 non-null   object
 1   fatherID  3202 non-null   object
 2   motherID  3202 non-null   object
 3   sex       3202 non-null   object
dtypes: object(4)
memory usage: 100.2+ KB


In [3]:
df_pedigree.head()

Unnamed: 0,sampleID,fatherID,motherID,sex
0,HG00096,0,0,M
1,HG00097,0,0,F
2,HG00099,0,0,F
3,HG00100,0,0,F
4,HG00101,0,0,M


In [4]:
value_map = {"sampleID": "child", "fatherID": "father", "motherID": "mother"}
df_triads = (df_pedigree[["sampleID", "fatherID", "motherID"]]
             .query("fatherID != '0' and motherID != '0'")             
             .assign(triadID = lambda x: np.arange(0, x.shape[0], 1))
             .melt(id_vars="triadID")
             .assign(
                 role = lambda x: [value_map[i] for i in x["variable"]],
                 pgx_id = lambda x: [f"onekgbs-{i}" for i in x["value"]])
             .drop("variable", axis=1)             
             .sort_values(["triadID", "value"])
             .reset_index(drop=True))

df_triads = df_triads.rename(columns={"value": "sampleID", "variable": "role"}).merge(df_pedigree[["sampleID", "sex"]], on="sampleID")

df_triads

Unnamed: 0,triadID,sampleID,role,pgx_id,sex
0,0,HG00403,father,onekgbs-HG00403,M
1,0,HG00404,mother,onekgbs-HG00404,F
2,0,HG00405,child,onekgbs-HG00405,F
3,1,HG00406,father,onekgbs-HG00406,M
4,1,HG00407,mother,onekgbs-HG00407,F
...,...,...,...,...,...
1801,600,NA20279,child,onekgbs-NA20279,M
1802,600,NA20282,mother,onekgbs-NA20282,F
1803,601,NA20356,father,onekgbs-NA20356,M
1804,601,NA20357,mother,onekgbs-NA20357,F


### Parents with multiple children

Looks like some parents have more than one child that is part of the 1000 Genomes data, or a child itself has a child that then also becomes part of the 1000 Genomes data. This means some samples show up in more than one triad. I don't think this is an issue, as long as we keep it in mind when we're e.g. querying Progenetix.

In [5]:
print(df_triads.shape[0], df_triads["sampleID"].nunique())
df_triads["sampleID"].value_counts().sort_values()[df_triads["sampleID"].value_counts().sort_values() > 1].sort_index()

1806 1793


sampleID
HG00656    2
HG00657    2
HG00702    2
HG03642    2
HG03679    2
HG03943    2
HG03944    2
NA19660    2
NA19661    2
NA19675    2
NA19678    2
NA19679    2
NA19685    2
Name: count, dtype: int64

In [6]:
df_triads.query("sampleID == 'HG03642'")

Unnamed: 0,triadID,sampleID,role,pgx_id,sex
1320,440,HG03642,mother,onekgbs-HG03642,F
1321,441,HG03642,mother,onekgbs-HG03642,F


In [7]:
df_triads.query("triadID == 440")

Unnamed: 0,triadID,sampleID,role,pgx_id,sex
1320,440,HG03642,mother,onekgbs-HG03642,F
1322,440,HG03679,father,onekgbs-HG03679,M
1324,440,HG04204,child,onekgbs-HG04204,M


In [8]:
df_triads.query("triadID == 441")

Unnamed: 0,triadID,sampleID,role,pgx_id,sex
1321,441,HG03642,mother,onekgbs-HG03642,F
1323,441,HG03679,father,onekgbs-HG03679,M
1325,441,HG04215,child,onekgbs-HG04215,M


# Adding ftp links to alignment files

In [17]:
def load_ena_index_file(url):
    df = pd.read_csv(
        url,
        comment="#",
        header=None,
        sep="\t",
        usecols=[0, 9],
        names=["ENA_FILE_PATH", "sampleID"]
    )

    return df

In [40]:
df_related = load_ena_index_file("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/1000G_698_related_high_coverage.sequence.index")
df_unrelated = load_ena_index_file("https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/1000G_2504_high_coverage.sequence.index")

df_triads = df_triads.merge(pd.concat([df_related, df_unrelated], axis=0), on="sampleID", how="left")
df_triads

Unnamed: 0,triadID,sampleID,role,pgx_id,sex,ENA_FILE_PATH
0,0,HG00403,father,onekgbs-HG00403,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
1,0,HG00404,mother,onekgbs-HG00404,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
2,0,HG00405,child,onekgbs-HG00405,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR398/ERR398...
3,1,HG00406,father,onekgbs-HG00406,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
4,1,HG00407,mother,onekgbs-HG00407,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
...,...,...,...,...,...,...
1801,600,NA20279,child,onekgbs-NA20279,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR398/ERR398...
1802,600,NA20282,mother,onekgbs-NA20282,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR323...
1803,601,NA20356,father,onekgbs-NA20356,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR323...
1804,601,NA20357,mother,onekgbs-NA20357,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR323...


In [41]:
df_triads[df_triads["ENA_FILE_PATH"].isna()]

Unnamed: 0,triadID,sampleID,role,pgx_id,sex,ENA_FILE_PATH
796,265,HG02635,mother,onekgbs-HG02635,F,
988,329,HG03025,mother,onekgbs-HG03025,F,
1119,373,HG03366,mother,onekgbs-HG03366,F,


There are three samples in df_triads for which no information is included in the index files from ENA. Not sure why this is... If I look these samples up manually in the 1000 Genomes data portal ([https://www.internationalgenome.org/data-portal/sample](https://www.internationalgenome.org/data-portal/sample)), they do have alignments available. So I'll just add them manually

In [50]:
url_list = [
    "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3242420/HG02635.final.cram",
    "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3242524/HG03025.final.cram",
    "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3242599/HG03366.final.cram"
]
# seems usafe since assignment is position based but I cannot figure out how to do this nicely
df_triads.loc[df_triads["ENA_FILE_PATH"].isna(), "ENA_FILE_PATH"] = url_list
df_triads

Unnamed: 0,triadID,sampleID,role,pgx_id,sex,ENA_FILE_PATH
0,0,HG00403,father,onekgbs-HG00403,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
1,0,HG00404,mother,onekgbs-HG00404,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
2,0,HG00405,child,onekgbs-HG00405,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR398/ERR398...
3,1,HG00406,father,onekgbs-HG00406,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
4,1,HG00407,mother,onekgbs-HG00407,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR324...
...,...,...,...,...,...,...
1801,600,NA20279,child,onekgbs-NA20279,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR398/ERR398...
1802,600,NA20282,mother,onekgbs-NA20282,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR323...
1803,601,NA20356,father,onekgbs-NA20356,M,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR323...
1804,601,NA20357,mother,onekgbs-NA20357,F,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR323...


In [55]:
df_triads[df_triads["ENA_FILE_PATH"].isna()]

Unnamed: 0,triadID,sampleID,role,pgx_id,sex,ENA_FILE_PATH


In [60]:
df_triads.to_csv(
    "../../data/1000Genomes/1000_genomes_triad_pedigrees.csv", 
    index=True, # include index, could be used for sbatch-array later on?
    sep=",")