# Retrieval of Orthologous Proteins between Homo sapiens and Mus musculus from a Sample of UniProt IDs

# 1. Libraries and functions import

In [97]:
import os
from getpass import getpass

import nbimporter
import pandas as pd
import libraries_functions as lf

# 2. Configuring GraphDB connection

In [98]:
# GraphDB credentials are read from the environment (GRAPHDB_USER /
# GRAPHDB_PASSWORD) so they are never stored in the notebook; when a variable
# is unset the user is prompted instead (the password prompt does not echo).
graphdb_user = os.environ.get("GRAPHDB_USER") or input("GraphDB user: ")
graphdb_password = os.environ.get("GRAPHDB_PASSWORD") or getpass("GraphDB password: ")

# InParanoid data is served from the authenticated GraphDB repository.
sparql_inparanoid = lf.configure_sparql(
    "https://semantics.inf.um.es:7200/repositories/orthoxml",
    graphdb_user,
    graphdb_password,
)
# The OMA endpoint is configured with the URL only (no credentials passed).
sparql_oma = lf.configure_sparql("https://sparql.omabrowser.org/sparql")

# 3. Enter the species for the search of ortholog genes

In [104]:
# Ask for the two species to compare. Surrounding whitespace is stripped so an
# accidental leading/trailing space does not end up in the species name.
species1 = input("Enter the first species for which you want to search for ortholog genes: \n").strip()
species2 = input("Enter the second species for which you want to search for ortholog genes: \n").strip()

# Validate both names before any query is sent.
# NOTE(review): check_species is defined in libraries_functions; presumably it
# reports or raises on an unsupported species name -- confirm.
# The previous UniProt-ID check that sat between these calls referenced an
# undefined `uniprot_id` and raised NameError, so it has been removed.
lf.check_species(species1)
lf.check_species(species2)

Enter the first species for which you want to search for ortholog genes: 
Homo sapiens
Enter the second species for which you want to search for ortholog genes: 
Mus musculus


# 4. Obtain human IDs from UniProt

In [100]:
# Fetch the human UniProt identifiers through the project helper.
# NOTE(review): presumably a flat list of accession strings -- confirm against
# libraries_functions.get_human_uniprot_ids.
human_uniprot_ids = lf.get_human_uniprot_ids()
# Wrap the identifiers in a single-column frame ('uniprot_id') so they can be
# sampled with pandas in the next step.
human_genes_df = pd.DataFrame(human_uniprot_ids, columns=['uniprot_id'])

# 5. Combine results

In [101]:
# Reproducible draw of ten UniProt IDs from the human set (seed fixed at 42).
random_sample_df = human_genes_df.sample(n=10, random_state=42)

# Build one frame per sampled ID: OMA rows first, then InParanoid rows,
# each tagged with the database it came from.
combined_results = []
for unip_id in random_sample_df['uniprot_id']:
    inparanoid_hits = lf.query_InParanoid(sparql_inparanoid, unip_id, species1, species2)
    inparanoid_hits['source'] = 'InParanoid'

    oma_hits = lf.query_OMA(sparql_oma, unip_id, species1, species2)
    oma_hits['source'] = 'OMA'

    combined_results.append(
        pd.concat([oma_hits, inparanoid_hits], ignore_index=True)
    )

# Stack the per-ID frames into a single table with a fresh 0..n-1 index.
final_results_df = pd.concat(combined_results, ignore_index=True)

## Removing duplicate IDs from the same source

In [102]:
# Drop repeated ortholog IDs only within the same source database. An ortholog
# reported by both InParanoid and OMA keeps one row per database, so it can be
# counted once for each of them downstream. De-duplicating on 'UniProt ID 2'
# alone would discard the second database's hit and cap every count at 1.
df_unique = final_results_df.drop_duplicates(subset=['source', 'UniProt ID 2'], keep='first')
df_unique

Unnamed: 0,source,UniProt ID 1,Species 1,UniProt ID 2,Species 2
0,OMA,B9EF68,Homo sapiens,A0A075DCB1,Mus musculus
1,OMA,B9EF68,Homo sapiens,A0A141CM25,Mus musculus
2,OMA,B9EF68,Homo sapiens,A0A141CM38,Mus musculus
3,OMA,B9EF68,Homo sapiens,A0A141CM51,Mus musculus
4,OMA,B9EF68,Homo sapiens,A0A6M8U123,Mus musculus
5,OMA,B9EF68,Homo sapiens,N0DTI5,Mus musculus
6,OMA,B9EF68,Homo sapiens,NU5M_MOUSE,Mus musculus
7,OMA,B9EF68,Homo sapiens,P03921,Mus musculus
8,OMA,B9EF68,Homo sapiens,Q3TRR5,Mus musculus
9,OMA,B9EF68,Homo sapiens,Q4JFM6,Mus musculus


# 6. Counting orthologous gene occurrences in each database

In [91]:
# Tally how many rows mention each ortholog ID and expose the tally as a
# two-column frame: the ID itself and its 'Count'.
ortholog_counts = (
    df_unique['UniProt ID 2']
    .value_counts()
    .rename_axis('UniProt ID 2')
    .reset_index(name='Count')
)

# 7. Add column to check curated proteins

In [92]:
# Flag each ortholog ID with the result of the project's is_curated helper.
curated_flags = ortholog_counts['UniProt ID 2'].map(lf.is_curated)
ortholog_counts['Curated'] = curated_flags
ortholog_counts

Unnamed: 0,UniProt ID 2,Count,Curated
0,Q571F9,1,False
1,P03921,1,True
2,A0A141CM51,1,False
3,A0A075DCB1,1,False
4,N0DTI5,1,False
5,Q4JFM6,1,False
6,5NTC_MOUSE,1,True
7,A0A6M8U123,1,False
8,A0A494BBI8,1,False
9,G3BP1_MOUSE,1,True


# 8. Counting number of databases and applying score

In [93]:
# Number of ortholog databases queried in this notebook (InParanoid and OMA).
num_databases = 2

# Score every row; assign_score receives the row together with the database total.
ortholog_counts['Reliability score'] = ortholog_counts.apply(
    lambda row: lf.assign_score(row, total_databases=num_databases),
    axis=1,
)
ortholog_counts

Unnamed: 0,UniProt ID 2,Count,Curated,Reliability score
0,Q571F9,1,False,25.0
1,P03921,1,True,75.0
2,A0A141CM51,1,False,25.0
3,A0A075DCB1,1,False,25.0
4,N0DTI5,1,False,25.0
5,Q4JFM6,1,False,25.0
6,5NTC_MOUSE,1,True,75.0
7,A0A6M8U123,1,False,25.0
8,A0A494BBI8,1,False,25.0
9,G3BP1_MOUSE,1,True,75.0


In [103]:
# Rank the orthologs from the highest to the lowest reliability score.
ortholog_counts = ortholog_counts.sort_values('Reliability score', ascending=False)

# Display the final ranked table.
ortholog_counts

Unnamed: 0,UniProt ID 2,Count,Curated,Reliability score
9,G3BP1_MOUSE,1,True,75.0
6,5NTC_MOUSE,1,True,75.0
12,P97855,1,True,75.0
1,P03921,1,True,75.0
10,NU5M_MOUSE,1,True,75.0
14,Q3V1L4,1,True,75.0
8,A0A494BBI8,1,False,25.0
2,A0A141CM51,1,False,25.0
3,A0A075DCB1,1,False,25.0
4,N0DTI5,1,False,25.0
