## 1. Identificação de proteínas ortólogas

In [None]:
# Importar bibliotecas
import pandas as pd

In [None]:
# Load the OrthoFinder orthogroup table (tab-separated).
ortogrupos = pd.read_csv(
    "../orthofinder/Orthogroups/Orthogroups.tsv",
    sep="\t",
)
# Keep only the rows in which no column is missing.
ortologos_zh = ortogrupos.dropna(axis=0, how="any")


________

## 2. Caracterização de proteínas

In [None]:
# Importando as bibliotecas necessárias
from Bio import SeqIO, Align
from Bio.Align import substitution_matrices
import glob
import pandas as pd
import os
from itertools import combinations

In [None]:
def ler_orthologues_tsv(orthologues_path):
    """Return the first-column values of a tab-separated table.

    Rows containing a missing value in any column are discarded before
    the first column is extracted.

    Parameters
    ----------
    orthologues_path : str or path-like
        Location of the tab-separated file to read.

    Returns
    -------
    list
        Values of the first column, in file order, for the complete rows.
    """
    complete_rows = pd.read_csv(orthologues_path, sep='\t').dropna()
    first_column = complete_rows.iloc[:, 0]
    return list(first_column)

def carregar_seqrecord(fasta_path):
    """Parse a FASTA file and return all of its records as a list.

    Parameters
    ----------
    fasta_path : str or path-like
        FASTA file handed to ``SeqIO.parse`` with the ``'fasta'`` format.

    Returns
    -------
    list
        The records yielded by ``SeqIO.parse``, in file order.
    """
    records = [record for record in SeqIO.parse(fasta_path, 'fasta')]
    return records

In [None]:
# Input directories, relative to the notebook location.
# orthologues_dir: folder joined with 'Orthogroups.tsv' when the orthogroup IDs are read.
orthologues_dir = '../orthofinder/Orthogroups'
# orthogroup_sequences_dir: folder joined with '<orthogroup>.fa' when sequences are loaded.
orthogroup_sequences_dir = '../orthofinder/Orthogroup_Sequences'

In [None]:
# Alignment parameters: BLASTP substitution matrix plus explicit gap penalties.
matrix = substitution_matrices.load("BLASTP")
aligner = Align.PairwiseAligner()
aligner.substitution_matrix = matrix
# A substitution matrix only scores aligned residue pairs; it carries no gap
# penalties. Leaving the aligner's gap scores at the library defaults makes
# the gap/identity statistics depend on the installed Biopython version
# instead of on a stated scoring scheme, so they are set explicitly here.
# NOTE(review): -12 / -1 are intended to mirror Biopython's built-in
# scoring="blastp" preset - confirm against the installed version. Result
# files produced with the previous (default) gap scores must be regenerated.
aligner.open_gap_score = -12.0
aligner.extend_gap_score = -1.0

In [None]:
# Read the orthogroup IDs to process (first column of Orthogroups.tsv,
# restricted to rows without missing values).
orthogroups = ler_orthologues_tsv(os.path.join(orthologues_dir, 'Orthogroups.tsv'))

# One DataFrame of pairwise statistics per orthogroup
df_results = []

# Column layout of the output table; used to build an empty table when there
# is nothing to align, so that the output file is still written with a header.
result_columns = [
    "orthogroup", "query", "target_id", "per_identity", "length",
    "score", "gaps", "identities", "mismatches",
]

# Load the sequences of each orthogroup and align every pair of sequences
for orthogroup in orthogroups:
    orthogroupSeqRecord = carregar_seqrecord(os.path.join(orthogroup_sequences_dir, f'{orthogroup}.fa'))

    results = []
    for target_record, query_record in combinations(orthogroupSeqRecord, 2):
        # Keep the first alignment returned for this pair
        alignment = aligner.align(target_record, query_record)[0]
        # counts() is evaluated once per alignment and its fields are read by
        # name (gaps, identities, mismatches) rather than by position.
        counts = alignment.counts()

        results.append({
            "orthogroup": f'{orthogroup}',
            "query": alignment.query.id,
            "target_id": alignment.target.id,
            # identities as a percentage of the alignment length
            "per_identity": round((counts.identities * 100) / alignment.length, 2),
            "length": alignment.length,
            "score": round(alignment.score, 2),
            "gaps": counts.gaps,
            "identities": counts.identities,
            "mismatches": counts.mismatches,
        })

    # Orthogroups with fewer than two sequences produce no pairs; skip them so
    # no empty frame is passed to pd.concat.
    if results:
        df_results.append(pd.DataFrame(results))

# pd.concat raises ValueError on an empty list, so fall back to an empty table.
if df_results:
    dfFinal = pd.concat(df_results)
else:
    dfFinal = pd.DataFrame(columns=result_columns)

# Make sure the output directory exists before writing.
os.makedirs('../results', exist_ok=True)
dfFinal.to_csv('../results/orthogroup_alignments.tsv', sep='\t', index=False)

## 3. Seleção de alvos em epilepsia

In [20]:
# Importar bibliotecas
import pandas as pd
from unipressed import IdMappingClient
import time

In [80]:
# Load the epilepsy gene table, discard its UniProt_ID column and
# store the Entrez IDs as strings.
epilepsy_genes = (
    pd.read_excel("../data/EpilepsyGene_1482_IDs_EGNF.xlsx")
    .drop(columns=["UniProt_ID"])
    .astype({"Entrez_ID": str})
)

# Entrez IDs that will be converted to UniProt IDs
entrez_epi = epilepsy_genes["Entrez_ID"].to_list()

In [78]:
# Map the Entrez gene IDs to UniProtKB entries with the UniProt ID-mapping
# client. This request was previously commented out, which left
# `results_list` undefined on a fresh kernel (NameError on Run All).
request = IdMappingClient.submit(
    source="GeneID", dest="UniProtKB", ids={str(x) for x in entrez_epi}
)
# Fixed pause between submitting the job and fetching its results.
# NOTE(review): one second may be too short for a large ID set - confirm, or
# increase the delay if fetching the results fails.
time.sleep(1)
results_list = list(request.each_result())

# One row per mapping result
entrez2uniprot = pd.DataFrame(results_list)

In [79]:
# Collapse the mapping to one row per Entrez ID, with the matching UniProt IDs
# gathered in a list. The grouped result used to be computed and then thrown
# away; it is now assigned, and the IDs are aggregated straight into lists
# instead of being joined with ',' and split again.
entrez2uniprot = (
    entrez2uniprot
    .rename(columns={"from": "Entrez_ID", "to": "UniProt_ID"})
    # No effect on the first run (plain string IDs); on a re-run of this cell
    # it flattens the lists built below, so the cell can be run repeatedly.
    .explode("UniProt_ID")
    # sort=False keeps the Entrez IDs in their order of first appearance.
    .groupby("Entrez_ID", sort=False)["UniProt_ID"]
    .agg(list)
    .reset_index()
)


In [81]:
# Attach the UniProt IDs to the epilepsy gene table (merge on the columns the
# two frames share) and expand list-valued UniProt_ID cells to one row per ID.
epilepsy_ids = (
    epilepsy_genes
    .merge(entrez2uniprot)
    .explode("UniProt_ID")
)

In [88]:
# UniProt IDs of the epilepsy genes, as a plain Python list
uniprot_epi = epilepsy_ids.loc[:, "UniProt_ID"].tolist()

In [89]:
# Load the Human-vs-Zebrafish orthologues table
df_orthologues = pd.read_csv('../orthofinder/Orthologues/Orthologues_Human.UniProt.renamed/Human.UniProt.renamed__v__Zebrafish.UniProt.renamed.tsv', sep='\t')


def _split_gene_list(cell):
    """Split a comma-separated gene cell into a list of clean IDs.

    Each ID is stripped of surrounding whitespace: with a ", " separator a
    plain split on ',' leaves a leading space on every ID after the first,
    and such IDs never match in an exact `isin` lookup.
    """
    return [gene.strip() for gene in cell.split(',')]


# Turn both species columns into lists of IDs, then expand to one row per
# (human ID, zebrafish ID) pair.
for species_column in ('Human.UniProt.renamed', 'Zebrafish.UniProt.renamed'):
    df_orthologues[species_column] = df_orthologues[species_column].apply(_split_gene_list)
df_orthologues = df_orthologues.explode('Human.UniProt.renamed').explode('Zebrafish.UniProt.renamed')

In [91]:
# Keep only the orthologue rows whose human ID is one of the epilepsy UniProt IDs
is_epilepsy_gene = df_orthologues['Human.UniProt.renamed'].isin(uniprot_epi)
orthologues_epi = df_orthologues[is_epilepsy_gene]

In [92]:
# Display the orthologue rows kept for the epilepsy genes
orthologues_epi

Unnamed: 0,Orthogroup,Human.UniProt.renamed,Zebrafish.UniProt.renamed
8,OG0000033,O43464,A0A8M9PLT5
8,OG0000033,O43464,A0A8M9QCM4
8,OG0000033,O43464,A0A8M9Q3F7
8,OG0000033,O43464,A0A8M9QCM8
8,OG0000033,O43464,A0A8M9QCV4
...,...,...,...
12952,OG0013280,Q9NPF4,Q5RHZ6
12969,OG0013297,Q14654,Q2HX26
12988,OG0013316,Q96QK1,Q6ZM34
12989,OG0013317,Q9BSJ2,Q7SYD6


In [None]:
# Orthogroups.tsv
# NOTE(review): the file name above was a bare line of code, which raises
# NameError when the cell runs; it is kept as a comment. Presumably it names
# the input file intended for this step - confirm.


# List of canonical epilepsy-related genes in zebrafish
#genes_epilepsia = [...]  # Replace with your list

# Filter orthologues and write the output file
# ... (implement the filtering logic and save the results)

## 5. Identificação e análise de duplicados

In [None]:
# Importar bibliotecas
import pandas as pd
from Bio import SeqIO
# ... (importar outras bibliotecas necessárias)

# Ler arquivos de saída do OrthoFinder
# ... (ler os arquivos necessários para identificar duplicatas)

# Identificar e comparar duplicatas
# ... (implementar a lógica de identificação e comparação)

# Salvar resultados
# ... (salvar as informações de comparação)