## 1. Identification of orthologue proteins

In [None]:
# Importar bibliotecas
import pandas as pd

In [None]:
# Importando arquivo
ortogrupos = pd.read_csv("../orthofinder/Orthogroups/Orthogroups.tsv", sep="\t")
# Removendo NAs, extraindo pares de ortologs
ortologos_zh = ortogrupos.dropna()

________

## 2. Protein alignment and statistics

In [4]:
# Importando as bibliotecas necessárias
from Bio import SeqIO, Align
from Bio.Align import substitution_matrices
import glob
import pandas as pd
import os
from itertools import combinations

In [5]:
# Função para ler o arquivo Orthologues.tsv e criar a lista de nomes
def ler_orthologues_tsv(orthologues_path):
    orthologues_df = pd.read_csv(orthologues_path, sep='\t')
    orthologues_df.dropna(inplace=True)
    return list(orthologues_df.iloc[:, 0])

# Função para carregar sequências em um arquivo FASTA para SeqRecord
def carregar_seqrecord(fasta_path):
    return list(SeqIO.parse(fasta_path, 'fasta'))

In [6]:
# Definindo os diretórios
orthologues_dir = '../orthofinder/Orthogroups'
orthogroup_sequences_dir = '../orthofinder/Orthogroup_Sequences'

In [7]:
# Definindo parametros do alinhamento
matrix = substitution_matrices.load("BLASTP")
aligner = Align.PairwiseAligner()
aligner.substitution_matrix = matrix

In [6]:
# Ler os ortogrupos de interesse
orthogroups = ler_orthologues_tsv(os.path.join(orthologues_dir, 'Orthogroups.tsv'))

# Inicializar o dataframe vazio
df_results = []

# Carregar as sequências de cada ortogrupo e alinhar os pares de sequências
for orthogroup in orthogroups:
    orthogroupSeqRecord = carregar_seqrecord(os.path.join(orthogroup_sequences_dir, f'{orthogroup}.fa'))

    results = []
    combination_list = list(combinations(orthogroupSeqRecord,2))

    for i in range(len(combination_list)):
        alignments = aligner.align(combination_list[i][0], combination_list[i][1])
        alignment = alignments[0]
        
        results.append({
        "orthogroup" : f'{orthogroup}',
        "query": alignment.query.id,
        "target_id": alignment.target.id,
        "per_identity": round((alignment.counts()[1] * 100)/alignment.length, 2),
        "per_similarity": round((alignment.counts()[2] * 100)/alignment.length, 2),
        "length": alignment.length,
        "score": round(alignment.score, 2),
        "gaps": alignment.counts()[0],
        "identities": alignment.counts()[1],
        "mismatches": alignment.counts()[2]
    })
    
    df_results.append(pd.DataFrame(results))
    
dfFinal = pd.concat(df_results)
dfFinal.to_csv('../results/orthogroup_alignments.tsv', sep='\t', index=False)

## 3. Seleção de alvos em epilepsia

In [35]:
# Importar bibliotecas
import pandas as pd
from unipressed import IdMappingClient
import time

In [36]:
#Loading data
epilepsy_genes = pd.read_excel("../data/EpilepsyGene_1482_IDs_EGNF.xlsx")
epilepsy_genes.drop(columns=["UniProt_ID"], inplace=True)
epilepsy_genes["Entrez_ID"] = epilepsy_genes["Entrez_ID"].astype(str)

# Getting the Entrez IDs and converting them to UniProt IDsa
entrez_epi = epilepsy_genes["Entrez_ID"].tolist()

In [37]:
len(entrez_epi)

1482

In [38]:
# Getting matches IDs
request = IdMappingClient.submit(
    source="GeneID", dest="UniProtKB", ids={str(x) for x in entrez_epi}
)
time.sleep(1)
results_list = list(request.each_result())

In [39]:
entrez2uniprot = pd.DataFrame(results_list)
entrez2uniprot.rename(columns={"from": "Entrez_ID", "to": "UniProt_ID"}, inplace=True)
entrez2uniprot.to_csv("../results/entrez2uniprot.epilepsy.tsv", sep="\t", index=False)
# entrez2uniprot.groupby("Entrez_ID")['UniProt_ID'].apply(lambda x: ','.join(x)).reset_index()
# entrez2uniprot['UniProt_ID'] = entrez2uniprot['UniProt_ID'].apply(lambda x: [x for x in x.split(',')])
# epilepsy_ids = epilepsy_genes.merge(entrez2uniprot)
# epilepsy_ids = epilepsy_ids.explode("UniProt_ID")

In [14]:
uniprot_epi = epilepsy_ids['UniProt_ID'].to_list()

In [15]:
# Loading the orthologues file
df_orthologues = pd.read_csv('../orthofinder/Orthologues/Orthologues_Human.UniProt.renamed/Human.UniProt.renamed__v__Zebrafish.UniProt.renamed.tsv', sep='\t')

# Manipulating dataframe to filter only the epilepsy genes
df_orthologues['Human.UniProt.renamed'] = df_orthologues['Human.UniProt.renamed'].apply(lambda x: [x for x in x.split(',')])
df_orthologues['Zebrafish.UniProt.renamed'] = df_orthologues['Zebrafish.UniProt.renamed'].apply(lambda x: [x for x in x.split(',')])
df_orthologues = df_orthologues.explode('Human.UniProt.renamed').explode('Zebrafish.UniProt.renamed')

In [16]:
orthologues_epi = df_orthologues[df_orthologues['Human.UniProt.renamed'].isin(uniprot_epi)]

In [30]:
orthologues_epi.nunique()

Orthogroup                   1261
Human.UniProt.renamed        1300
Zebrafish.UniProt.renamed    1505
dtype: int64

In [34]:
orthologues_epi

NameError: name 'orthologues_epi' is not defined

## 5. Paralogue analysis

In [12]:
import pandas as pd
from itertools import combinations

In [None]:
#Loeading the orthologues file
df_orthologues = pd.read_csv('../orthofinder/Orthologues/Orthologues_Human.UniProt.renamed/Human.UniProt.renamed__v__Zebrafish.UniProt.renamed.tsv', sep='\t')

#Separating paralogous genes in human
hs_genes = df_orthologues["Human.UniProt.renamed"].str.strip().apply(lambda x: x.split(',')).explode().reset_index(drop=True).to_list()
hs_genes = [x.strip() for x in hs_genes]

#Separating paralogous genes in zebrafish
zb_genes = df_orthologues["Zebrafish.UniProt.renamed"].str.strip().apply(lambda x: x.split(',')).explode().reset_index(drop=True).to_list()
zb_genes = [x.strip() for x in zb_genes]

In [33]:
df = pd.read_table('../results/orthogroup_alignments.tsv')

#Separating paralogous genes in dataframes
hs_paralogues = df[(df['query'].isin(hs_genes)) & (df['target_id'].isin(hs_genes))]
zb_paralogues = df[(df['query'].isin(zb_genes)) & (df['target_id'].isin(zb_genes))]

#Separating orthologous genes in dataframes
zb_hs_orthologues = df[(df['query'].isin(zb_genes)) & (df['target_id'].isin(hs_genes))]

#Saving dataframes
hs_paralogues.to_csv('../results/hs_paralogues.all.tsv', sep='\t', index=False)
zb_paralogues.to_csv('../results/zb_paralogues.all.tsv', sep='\t', index=False)
zb_hs_orthologues.to_csv('../results/zb_hs_orthologues.all.tsv', sep='\t', index=False)