[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/broadinstitute/g2papi/blob/main/examples/ex1_chaperones/chaperone_stats.ipynb)


In [None]:
# Install the g2papi client package into the notebook environment.
!pip install g2papi

# Download Pharos Chaperone Query Results
# (saved locally as "query_results.csv"; the source file name contains a space, URL-encoded as %20)
!wget -O "query_results.csv" "https://raw.githubusercontent.com/broadinstitute/g2papi/main/examples/ex1_chaperones/query%20results.csv"


In [None]:
import g2papi

# G2P3D API example: request the gene -> transcript -> protein isoform -> structure
# map for BRCA1 (UniProt P38398); the result is handled as a pandas DataFrame.
isoform_columns = ['UniProt Isoform', 'Ensembl Transcript Id', 'RefSeq mRNA Id']
gene_transcript_protein_isoform_struct = g2papi.get_gene_transcript_protein_isoform_structure(
    'BRCA1',
    'P38398',
)

# Show only the isoform / transcript identifier columns for the first rows.
print(gene_transcript_protein_isoform_struct[isoform_columns].head())


In [None]:
import pandas as pd

# Load the Pharos chaperone query results from the local "query_results.csv" file.
# Later cells read the "UniProt" and "Symbol" columns of this table.
chaperones = pd.read_csv("query_results.csv")

# Preview the first rows as the cell output.
chaperones.head()

## Identify the number of chaperones whose canonical protein isoform maps to the MANE Select transcript

In [None]:
from tqdm.notebook import tqdm
import time

# Fetch the gene -> transcript -> protein isoform -> structure map for every
# chaperone and stack the per-gene tables into a single DataFrame.
print(chaperones.shape)

# Per-gene frames are collected in a list and concatenated once after the loop;
# calling pd.concat inside the loop re-copies every accumulated row each time.
map_frames = []

# iterrows() is a generator without a length, so the total is passed explicitly
# to let the progress bar show how far along the run is.
for i, chaperone in tqdm(chaperones.iterrows(), total=len(chaperones)):
    uniprot = chaperone["UniProt"]
    gene = chaperone["Symbol"]

    try:
        # One-second pause before each request to space out the API calls.
        time.sleep(1)
        map_df = g2papi.get_gene_transcript_protein_isoform_structure(gene, uniprot)
    except Exception as e:
        # Best effort: report the failing gene and continue with the next one.
        print(f"API failed for {gene}, {uniprot}", e)
        continue

    # Tag every row with the gene symbol it was requested for.
    map_df["gene"] = gene
    map_frames.append(map_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when every request failed.
transcript_results_df = pd.concat(map_frames) if map_frames else pd.DataFrame()


In [None]:
# Cache the combined isoform map on disk so it can be reloaded without re-querying the API.
# The DataFrame index is written as the first CSV column (to_csv default).
transcript_results_df.to_csv("chaperone_transcript_results.csv")

In [None]:
import pandas as pd
# Reload the isoform map from the cached "chaperone_transcript_results.csv" file.
transcript_results_df = pd.read_csv("chaperone_transcript_results.csv")
# Quick size check: (rows, columns).
print(transcript_results_df.shape)

# Preview the first rows as the cell output.
transcript_results_df.head()

In [None]:
from tqdm.notebook import tqdm
# Count the chaperones that have at least one row in transcript_results_df where
#   * "UniProt Isoform" contains a literal "*" (treated here as the canonical
#     isoform marker), and
#   * "RefSeq mRNA Id" contains the text "MANE".
#
# Both row filters are evaluated once over the whole table instead of once per
# chaperone, which avoids re-scanning the full frame for every query row.
is_canonical_isoform = transcript_results_df["UniProt Isoform"].str.contains(r"\*", na=False)
is_mane_select = transcript_results_df["RefSeq mRNA Id"].str.contains("MANE", na=False)

# UniProt accessions with at least one canonical + MANE row. Missing accessions
# are dropped so that a NaN in the query table can never count as a match.
matched_accessions = set(
    transcript_results_df.loc[is_canonical_isoform & is_mane_select, "UniProtKB"].dropna()
)

# Keep the query-table order (and any repeated accessions) so the list and the
# count are the same as with a row-by-row scan of `chaperones`.
uniprots = [uniprot for uniprot in chaperones["UniProt"] if uniprot in matched_accessions]
matching_refseq = len(uniprots)

print(matching_refseq)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Two-bar summary: chaperones with a MANE-select match vs. all remaining chaperones.
non_matching_refseq = len(chaperones) - matching_refseq

bar_labels = ["MANE-select match", "MANE-select mismatch"]
bar_counts = [matching_refseq, non_matching_refseq]

sns.barplot(x=bar_labels, y=bar_counts)

# Write each count just above its bar (bars sit at x = 0 and x = 1).
for bar_position, bar_count in enumerate(bar_counts):
    plt.text(bar_position, bar_count, str(bar_count), ha='center', va='bottom')

plt.title("Chaperones with MANE-selected RefSeq on Canonical Protein Isoform")
plt.ylabel("# Chaperones")

plt.show()


## Chaperone Structure Characterization

In [None]:
# Fetch the protein feature table for every chaperone and stack the per-gene
# tables into a single DataFrame.
#
# Per-gene frames are collected in a list and concatenated once after the loop;
# calling pd.concat inside the loop re-copies every accumulated row each time.
protein_feature_frames = []

# iterrows() is a generator without a length, so the total is passed explicitly
# to let the progress bar show how far along the run is.
for i, chaperone in tqdm(chaperones.iterrows(), total=len(chaperones)):
    uniprot = chaperone["UniProt"]
    gene = chaperone["Symbol"]

    try:
        protein_features_df = g2papi.get_protein_features(gene, uniprot)
        # Tag every row with the gene symbol and accession it was requested for.
        protein_features_df["gene"] = gene
        protein_features_df["uniprot"] = uniprot
        protein_feature_frames.append(protein_features_df)
    except Exception as e:
        # Best effort: report the failing gene and continue with the next one.
        print(f"API failed for {gene}, {uniprot}", e)
        continue

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when every request failed.
all_protein_features_df = (
    pd.concat(protein_feature_frames) if protein_feature_frames else pd.DataFrame()
)

# Cache the combined table on disk so it can be reloaded without re-querying the API.
all_protein_features_df.to_csv("chaperone_protein_features.csv")

In [None]:
# Reload the protein features from the cached CSV file.
all_protein_features_df = pd.read_csv("chaperone_protein_features.csv")

# Pie chart of how often each "Secondary structure (DSSP 3-state)*" value occurs.
dssp_column = "Secondary structure (DSSP 3-state)*"
dssp_counts = all_protein_features_df[dssp_column].value_counts()

plt.pie(dssp_counts, labels=dssp_counts.index, autopct='%1.1f%%')
plt.title("Secondary structure of Chaperone Proteins")
plt.show()
