In [1]:
import bioregistry.pandas as brpd
import pandas as pd

# Columns to show when displaying the data. The file is read with
# header=None, so pandas labels the columns with integers.
columns = [0, 1, 4, 5, 7, 12]

# Only the first 100 data rows are used in this notebook, so ask the parser
# for exactly that many (nrows) instead of parsing the entire file and then
# discarding everything past the head.
df = pd.read_csv(
    "http://geneontology.org/gene-associations/goa_human.gaf.gz",
    sep="\t",
    comment="!",  # treat "!" as the comment marker
    header=None,
    dtype=str,  # keep every field as text
    nrows=100,
)


df[columns].head()

Unnamed: 0,0,1,4,5,7,12
0,UniProtKB,A0A024RBG1,GO:0003723,GO_REF:0000043,UniProtKB-KW:KW-0694,taxon:9606
1,UniProtKB,A0A024RBG1,GO:0046872,GO_REF:0000043,UniProtKB-KW:KW-0479,taxon:9606
2,UniProtKB,A0A024RBG1,GO:0005829,GO_REF:0000052,,taxon:9606
3,UniProtKB,A0A075B6H7,GO:0002250,GO_REF:0000043,UniProtKB-KW:KW-1064,taxon:9606
4,UniProtKB,A0A075B6H7,GO:0005886,GO_REF:0000044,UniProtKB-SubCell:SL-0039,taxon:9606


## Prefixes

In [2]:
# Validate the prefixes held in column 0, then print a summary of the rows
# whose prefixes need to be fixed.
prefix_check = brpd.validate_prefixes(df, column=0)

brpd.summarize_prefix_validation(df, prefix_check)

100 of 100 (100%) rows with the following prefixes need to be fixed: ['UniProtKB']
The following prefixes could be normalized using normalize_curies():

| raw       | standardized   |
|-----------|----------------|
| UniProtKB | uniprot        |


In [3]:
# Normalize the prefixes in column 0. The call's return value is not
# captured; the preview below shows `df` itself carrying the new values.
brpd.normalize_prefixes(df, column=0)

df.loc[:, columns].head()

Unnamed: 0,0,1,4,5,7,12
0,uniprot,A0A024RBG1,GO:0003723,GO_REF:0000043,UniProtKB-KW:KW-0694,taxon:9606
1,uniprot,A0A024RBG1,GO:0046872,GO_REF:0000043,UniProtKB-KW:KW-0479,taxon:9606
2,uniprot,A0A024RBG1,GO:0005829,GO_REF:0000052,,taxon:9606
3,uniprot,A0A075B6H7,GO:0002250,GO_REF:0000043,UniProtKB-KW:KW-1064,taxon:9606
4,uniprot,A0A075B6H7,GO:0005886,GO_REF:0000044,UniProtKB-SubCell:SL-0039,taxon:9606


In [4]:
# Run the prefix validation on column 0 again, now that it has been
# normalized, and print the summary.
prefix_recheck = brpd.validate_prefixes(df, column=0)

brpd.summarize_prefix_validation(df, prefix_recheck)

0 of 100 (0%) rows with the following prefixes need to be fixed: []


## CURIEs

In [5]:
# Validate the CURIEs held in column 4, then print a summary of the rows
# that need to be fixed.
curie_check = brpd.validate_curies(df, column=4)

brpd.summarize_curie_validation(df, curie_check)

100 of 100 (100%) rows with the following CURIEs need to be fixed: ['uniprot']


In [6]:
# Normalize the CURIEs in column 4. The call's return value is not
# captured; the preview below shows `df` itself carrying the new values.
brpd.normalize_curies(df, column=4)

df.loc[:, columns].head()

Unnamed: 0,0,1,4,5,7,12
0,uniprot,A0A024RBG1,go:0003723,GO_REF:0000043,UniProtKB-KW:KW-0694,taxon:9606
1,uniprot,A0A024RBG1,go:0046872,GO_REF:0000043,UniProtKB-KW:KW-0479,taxon:9606
2,uniprot,A0A024RBG1,go:0005829,GO_REF:0000052,,taxon:9606
3,uniprot,A0A075B6H7,go:0002250,GO_REF:0000043,UniProtKB-KW:KW-1064,taxon:9606
4,uniprot,A0A075B6H7,go:0005886,GO_REF:0000044,UniProtKB-SubCell:SL-0039,taxon:9606


In [7]:
# Run the CURIE validation on column 4 again, now that it has been
# normalized, and print the summary.
curie_recheck = brpd.validate_curies(df, column=4)

brpd.summarize_curie_validation(df, curie_recheck)

0 of 100 (0%) rows with the following CURIEs need to be fixed: []


## Identifiers

In [8]:
# Validate the identifiers in column 1, using column 0 as the source of each
# row's prefix. `idx` keeps its name because the following cell reads it.
# NOTE(review): use_tqdm=True presumably turns on a progress bar - confirm.
idx = brpd.validate_identifiers(
    df,
    column=1,
    prefix_column=0,
    use_tqdm=True,
)

# Rows where `idx` is False are the ones reported as invalid.
invalid_count = (~idx).sum()
print(f"{invalid_count:,} rows have invalid identifiers")

0 rows have invalid identifiers


In [9]:
# Count the rows where `idx` is False - the same figure the previous cell
# prints - shown here as the cell's output value.
(~idx).sum()

0

In [10]:
# Combine each identifier in column 1 with its prefix from column 0; the
# preview below shows column 1 holding values such as "uniprot:A0A024RBG1".
brpd.identifiers_to_curies(df, column=1, prefix_column=0)

# The prefix is now part of column 1, so column 0 is redundant in the
# preview and is dropped from the displayed columns.
columns = list(filter(lambda label: label != 0, columns))

df.loc[:, columns].head()

Unnamed: 0,1,4,5,7,12
0,uniprot:A0A024RBG1,go:0003723,GO_REF:0000043,UniProtKB-KW:KW-0694,taxon:9606
1,uniprot:A0A024RBG1,go:0046872,GO_REF:0000043,UniProtKB-KW:KW-0479,taxon:9606
2,uniprot:A0A024RBG1,go:0005829,GO_REF:0000052,,taxon:9606
3,uniprot:A0A075B6H7,go:0002250,GO_REF:0000043,UniProtKB-KW:KW-1064,taxon:9606
4,uniprot:A0A075B6H7,go:0005886,GO_REF:0000044,UniProtKB-SubCell:SL-0039,taxon:9606
