# Experimentally obtained structure analysis
* How many genes (or mutations) with detected clusters are covered by experimentally obtained structures, and how many are not?
* Quantify the added value of using AlphaFold predicted structures in the prediction of driver genes and mutations

In [46]:
import pandas as pd
import requests

In [None]:
!wget -O pdb_chain_uniprot.tsv.gz "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_uniprot.tsv.gz"
# !wget -O uniprot_pdb.tsv.gz "https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_pdb.tsv.gz"
# !wget -O UP000000226_3885.dat.gz "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000226/UP000000226_3885.dat.gz"
# !wget -O UP000000226_3885.idmapping.gz "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000000226/UP000000226_3885.idmapping.gz"

In [62]:
# Load the UniProt ID-mapping table. The file has no header row, so the three
# columns keep their positional names 0, 1 and 2.
# NOTE(review): the UniProtKB-ID values in this table end in `_PHAVU`; confirm that
# UP000000226 is the proteome intended for the driver-gene analysis.
uniprot_proteome_mapping = pd.read_csv(
    "UP000000226_3885.idmapping.gz",
    sep="\t",
    header=None,
)
display(uniprot_proteome_mapping)
# Flat-file (.dat) exploration, currently disabled:
# uniprot_proteome_dat = pd.read_table("UP000000226_3885.dat.gz", header=None, sep="\t")
# display(uniprot_proteome_dat)

Unnamed: 0,0
0,ID V7ACU5_PHAVU Unreviewed; ...
1,AC V7ACU5;
2,"DT 19-FEB-2014, integrated into UniProtKB/Tr..."
3,"DT 19-FEB-2014, sequence version 1."
4,"DT 05-FEB-2025, entry version 40."
...,...
2403537,MRALAARFSS YLYRRKIGVN TRSRNFSSYS GKEELSIE...
2403538,GYHFFPYMGE NLMQQSVSLL RVRDPLFKRV GASRLTRF...
2403539,LATAKDDRTR KAALQALDAL SHSDEALASM HNAGAISI...
2403540,RFQDLRYDVP S


Unnamed: 0,0,1,2
0,A0JJX6,CRC64,0B6C43B3D3D14A1F
1,A0JJX6,eggNOG,ENOG502S7BC
2,A0JJX6,EMBL,AM410092
3,A0JJX6,EMBL-CDS,CAL68581.1
4,A0JJX6,EMBL-CDS,ESW09810.1
...,...,...,...
628609,V7D3K8,UniParc,UPI0003CA55B1
628610,V7D3K8,UniProtKB-ID,V7D3K8_PHAVU
628611,V7D3K8,UniRef100,UniRef100_V7D3K8
628612,V7D3K8,UniRef50,UniRef50_I1JVU1


## Get a dataframe for PDB to Uniprot mapping
* Keep only A chain
* One row per continuous segment of a PDB structure: the same PDB structure may be split into more than one row if there is a gap relative to the specific UniProt isoform (it is not clear how that isoform is chosen)
* Reference canonical Uniprot ID and number of residues of its sequence that are covered by the structure
* Resolution: keep only structures with a resolution < 3 Å (threshold to be discussed)

__TODO:__ 
* Map gene name to Uniprot ID
* Start by looking at the genes detected by Oncodrive3D
* Among these genes, how many genes have at least a pdb structure?
    * Divide these into CGC and non-CGC
* Among these genes, how many detected clusters are in a pdb structure?
    * Divide these into CGC and non-CGC
* Among these genes, how many clank are in a pdb structure?
    * Either at least one cluster of the clank, or all of them

In [66]:
# Load the per-chain PDB <-> UniProt table. `skiprows=1` discards the first line of the
# file (presumably a non-tabular comment line; verify against the downloaded file).
pdb_uniprot = pd.read_csv(
    "pdb_chain_uniprot.tsv.gz",
    sep="\t",
    skiprows=1,
)
display(pdb_uniprot)
# Per-accession summary table, currently disabled:
# uniprot_pdb = pd.read_table("uniprot_pdb.tsv.gz", skiprows=1)
# display(uniprot_pdb)

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,101m,A,P02185,1,154,0,153,1,154
1,102l,A,P00720,1,40,1,40,1,40
2,102l,A,P00720,42,165,41,,41,164
3,102m,A,P02185,1,154,0,153,1,154
4,103l,A,P00720,1,40,1,,1,40
...,...,...,...,...,...,...,...,...,...
863074,9xia,A,P24300,1,388,1,,1,388
863075,9xim,A,P12851,1,393,,394,2,394
863076,9xim,B,P12851,1,393,,394,2,394
863077,9xim,C,P12851,1,393,,394,2,394


Unnamed: 0,SP_PRIMARY,PDB
0,A0A003,6kvc;6kv9
1,A0A009I821,7ryg;7m4w;7ryf;7uw1;7m4y;7m4x;7uvz;7uvw;7uvv;7...
2,A0A009IHW8,8g83;7uwg;7uxu
3,A0A009PZ93,9kbq
4,A0A009QSN8,6v3b;6v3a;6v39;6v3d
...,...,...
67671,X5K3J9,6az6
67672,X5KVH4,6mfk
67673,X5MEI1,7mqd;7mpl;7mpq;7mqf;7mqc;7mqb;7mqg;7mqi;7mpo;7...
67674,X7EBZ8,8dqm


In [71]:
# Keep only rows whose chain ID is "A" and reduce the table to the PDB ID plus the
# UniProt accession and its covered residue range. Rows under any other chain ID are
# dropped here, including those of proteins that never appear as chain A.
# This cell overwrites `pdb_uniprot`, so it needs the freshly loaded table as input
# (the "CHAIN" column no longer exists after it has run).
pdb_uniprot = (
    pdb_uniprot
    .loc[pdb_uniprot["CHAIN"].eq("A"), ["PDB", "SP_PRIMARY", "SP_BEG", "SP_END"]]
    .rename(columns={"SP_PRIMARY": "Uniprot_ID", "SP_BEG": "Start", "SP_END": "End"})
    .reset_index(drop=True)
)
pdb_uniprot

Unnamed: 0,PDB,Uniprot_ID,Start,End
0,101m,P02185,1,154
1,102l,P00720,1,40
2,102l,P00720,41,164
3,102m,P02185,1,154
4,103l,P00720,1,40
...,...,...,...,...
215496,9rsa,P61823,27,150
215497,9rub,P04718,1,466
215498,9wga,P02876,28,198
215499,9xia,P24300,1,388


In [78]:
# Inputs for the resolution lookup: the RCSB entry endpoint (the PDB ID is appended
# to it) and the distinct PDB IDs present in the mapping table.
api_url = "https://data.rcsb.org/rest/v1/core/entry/"
pdb_ids = pdb_uniprot.PDB.unique()

def get_resolution(pdb_id, timeout=30, retries=2):
    """Fetch the reported resolution of one PDB entry from the RCSB entry endpoint.

    Parameters
    ----------
    pdb_id : str
        PDB identifier; it is appended to the module-level ``api_url``.
    timeout : float, optional
        Seconds passed to ``requests.get`` as ``timeout`` so that an unresponsive
        connection cannot stall the whole lookup loop.
    retries : int, optional
        Number of extra attempts made after a connection error or timeout.
        Other request failures (e.g. HTTP error statuses) are not retried.

    Returns
    -------
    float or None
        First element of ``rcsb_entry_info.resolution_combined``. ``None`` when that
        field is missing, null or empty, or when the request ultimately failed
        (the failure is printed, not raised).
    """
    url = api_url + pdb_id
    # Failures worth retrying: the connection dropped or timed out.
    transient = (requests.exceptions.ConnectionError, requests.exceptions.Timeout)
    attempts = max(int(retries), 0) + 1

    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except transient as e:
            if attempt == attempts:
                print(f"Error fetching data for PDB ID {pdb_id}: {e}")
                return None
            # Otherwise fall through to the next attempt.
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print(f"Error fetching data for PDB ID {pdb_id}: {e}")
            return None
        else:
            # Both levels may be absent or explicitly null in the payload.
            entry_info = (data.get("rcsb_entry_info") if isinstance(data, dict) else None) or {}
            resolution = entry_info.get("resolution_combined")
            return resolution[0] if resolution else None
    return None

# Call get_resolution once per distinct PDB ID (sequentially), then broadcast the
# result to every row of that PDB entry as a new "Resolution" column.
resolutions = dict(zip(pdb_ids, map(get_resolution, pdb_ids)))
pdb_uniprot["Resolution"] = pdb_uniprot["PDB"].map(resolutions)

Error fetching data for PDB ID 1ba3: ('Connection aborted.', TimeoutError(110, 'Connection timed out'))
Error fetching data for PDB ID 1brw: ('Connection aborted.', TimeoutError(110, 'Connection timed out'))


In [None]:
# Write pdb_uniprot to a gzip-compressed, tab-separated file, omitting the row index.
pdb_uniprot.to_csv(
    "pdb_uniprot.tsv.gz",
    sep="\t",
    compression="gzip",
    index=False,
)

0         2.07
1         1.74
2         1.74
3         1.84
4         1.90
          ... 
215496     NaN
215497     NaN
215498     NaN
215499     NaN
215500     NaN
Name: PDB, Length: 215501, dtype: float64

In [None]:
# Spot-check: all rows mapped to accession P04637
# (presumably chosen as a well-known example protein; confirm which gene it is).
pdb_uniprot.loc[pdb_uniprot["Uniprot_ID"].eq("P04637")]

In [None]:
# Display the current state of the pdb_uniprot table.
pdb_uniprot