# Unified Method using a UniProt ID and Species Names (P04637 in this work)

# 1. Importing libraries and functions

In [50]:
import os

import nbimporter
import pandas as pd
import libraries_functions as lf

# 2. Configuring GraphDB connection

In [51]:
# Credentials for the GraphDB repository that hosts the InParanoid data.
# They are taken from the GRAPHDB_USER / GRAPHDB_PASSWORD environment variables
# when those are set; the fallbacks are the values this notebook used before,
# so running it without any environment configuration behaves as it always did.
# NOTE(review): drop the fallback values once the credentials are provisioned
# through the environment, so that no secret remains in the notebook.
graphdb_user = os.environ.get("GRAPHDB_USER", "admin")
graphdb_password = os.environ.get("GRAPHDB_PASSWORD", "minegraph")

# One SPARQL client per orthology source: the InParanoid repository is called
# with credentials, the public OMA endpoint with the URL only.
sparql_inparanoid = lf.configure_sparql(
    "https://semantics.inf.um.es:7200/repositories/orthoxml",
    graphdb_user,
    graphdb_password,
)
sparql_oma = lf.configure_sparql("https://sparql.omabrowser.org/sparql")

# 3. Enter the species and UniProt ID to search for orthologous genes

In [58]:
# Read the three query parameters interactively. Leading/trailing whitespace is
# stripped so that a value pasted with a stray space or tab does not reach the
# species check or the SPARQL queries in a form that cannot match.
species1 = input("Enter the first species for which you want to search for ortholog genes: \n").strip()
uniprot_id = input("Enter the Uniprot ID of the first especies for the orthologs you want to search for in the second species: \n").strip()
species2 = input("Enter the second species for which you want to search for ortholog genes: \n").strip()

# Validate the values in the order they were entered. Species validation is
# delegated to lf.check_species (defined in libraries_functions).
lf.check_species(species1)
# str.isupper() is False when the text contains a lowercase letter and also
# when it contains no cased characters at all (e.g. an empty answer).
if not uniprot_id.isupper():
    print("Error. Enter a valid Uniprot ID in capital letters")
lf.check_species(species2)

Enter the first species for which you want to search for ortholog genes: 
Homo sapiens
Enter the Uniprot ID of the first especies for the orthologs you want to search for in the second species: 
P04637
Enter the second species for which you want to search for ortholog genes: 
Mus musculus


In [60]:
# Query each orthology source for the requested protein/species pair and tag
# every returned row with the name of the database it came from.
results_inparanoid = lf.query_InParanoid(
    sparql_inparanoid, uniprot_id, species1, species2
).assign(source='InParanoid')
results_oma = lf.query_OMA(
    sparql_oma, uniprot_id, species1, species2
).assign(source='OMA')

# Stack both result sets (OMA rows first) under a fresh 0..n-1 index.
combined_df = pd.concat((results_oma, results_inparanoid), ignore_index=True)
combined_df

Unnamed: 0,UniProt ID 1,Species 1,UniProt ID 2,Species 2,source
0,P04637,Homo sapiens,A0A158SIS7,Mus musculus,OMA
1,P04637,Homo sapiens,O70366,Mus musculus,OMA
2,P04637,Homo sapiens,P02340,Mus musculus,OMA
3,P04637,Homo sapiens,P53_MOUSE,Mus musculus,OMA
4,P04637,Homo sapiens,Q549C9,Mus musculus,OMA
5,P04637,Homo sapiens,Q91XH8,Mus musculus,OMA
6,P04637,Homo sapiens,P02340,Mus musculus,InParanoid


# 4. Counting orthologous gene occurrences across databases

In [37]:
# Count, for every candidate ortholog, how many databases reported it.
# Repeated (ortholog, source) rows are collapsed first: if a single database
# returns the same ortholog more than once, it must still count as one
# database, otherwise 'Count' could exceed the number of databases queried.
unique_hits = combined_df.drop_duplicates(subset=['UniProt ID 2', 'source'])
ortholog_counts = unique_hits['UniProt ID 2'].value_counts().reset_index()
# Name the columns explicitly; the labels produced by reset_index() after
# value_counts() differ between pandas versions.
ortholog_counts.columns = ['UniProt ID 2', 'Count']

# 5. Add column to check curated proteins

In [61]:
# Add a 'Curated' column holding the result of lf.is_curated for each
# ortholog identifier (presumably whether the entry is manually reviewed;
# confirm against libraries_functions).
ortholog_counts = ortholog_counts.assign(
    Curated=ortholog_counts['UniProt ID 2'].apply(lf.is_curated)
)
ortholog_counts

Unnamed: 0,UniProt ID 2,Count,Curated,Reliability score
0,P02340,2,True,100.0
3,P53_MOUSE,1,True,75.0
1,A0A158SIS7,1,False,25.0
2,Q91XH8,1,False,25.0
4,O70366,1,False,25.0
5,Q549C9,1,False,25.0


# 6. Counting number of databases and applying score

In [62]:
# Number of orthology databases queried in this notebook (OMA and InParanoid).
num_databases = 2


# Score each row with lf.assign_score, passing the number of databases as
# `total_databases`. NOTE(review): the score appears to combine 'Count' and
# 'Curated'; confirm the exact formula in libraries_functions.
ortholog_counts['Reliability score'] = ortholog_counts.apply(
    lambda row: lf.assign_score(row, total_databases=num_databases),
    axis=1,
)
ortholog_counts

Unnamed: 0,UniProt ID 2,Count,Curated,Reliability score
0,P02340,2,True,100.0
3,P53_MOUSE,1,True,75.0
1,A0A158SIS7,1,False,25.0
2,Q91XH8,1,False,25.0
4,O70366,1,False,25.0
5,Q549C9,1,False,25.0


# 7. Sorting results in descending order

In [63]:
# Rank the orthologs from the highest to the lowest reliability score.
ortholog_counts = ortholog_counts.sort_values('Reliability score', ascending=False)

# Display the final results.
ortholog_counts

Unnamed: 0,UniProt ID 2,Count,Curated,Reliability score
0,P02340,2,True,100.0
3,P53_MOUSE,1,True,75.0
1,A0A158SIS7,1,False,25.0
2,Q91XH8,1,False,25.0
4,O70366,1,False,25.0
5,Q549C9,1,False,25.0
