## 1. Identification of orthologue proteins

In [7]:
# Importar bibliotecas
import pandas as pd

In [8]:
# Importando arquivo
ortogrupos = pd.read_csv("../03-analysis/03.01-orthofinder/Orthogroups/Orthogroups.tsv", sep="\t")
# Removendo NAs, extraindo pares de ortologs
ortologos_zh = ortogrupos.dropna()

## Genomics statistics

In [None]:
df = pd.read_table('../03-analysis/03.01-orthofinder/Orthogroups/Orthogroups.tsv')
df.head()

Unnamed: 0,Orthogroup,Human.UniProt.renamed,Zebrafish.UniProt.renamed
0,OG0000000,,"A0A0G2KC95, A0A0G2KGX6, A0A0G2KLW9, A0A0G2KMQ4..."
1,OG0000001,"Q8N8Y5, Q9BWE0","A0A0G2KE12, A0A0G2KH00, A0A0G2KSD8, A0A0G2L2X0..."
2,OG0000002,P17041,"A0A0G2KI00, A0A0G2KLM8, A0A0G2KZN8, A0A0G2L199..."
3,OG0000003,"A0A2R8Y4L6, A0A2R8YED5, A6ND48, A6NDH6, A6NET4...",
4,OG0000004,"P17026, Q49A33, Q6ZT77, Q7Z3I7, Q8N3J9","A0A0R4IS41, A0A0R4IX00, A0A8M1PS56, A0A8M2BBI2..."


In [12]:
df = df.apply(lambda x: x.str.split(','))

In [17]:
# Função para categorizar com base nas condições especificadas
def categorize(row):

    if pd.notna(row['Human.UniProt.renamed']) and pd.notna(row['Zebrafish.UniProt.renamed']):
        if len(row['Human.UniProt.renamed']) == 1 and len(row['Zebra.UniProt.renamed']) == 1:
            return 'one-to-one'
        elif len(row['Zebra.UniProt.renamed']) > 1 and len(row['Human.UniProt.renamed']) > 1:
            return 'many-to-many'
        else:
            return 'one-to-many'

In [None]:
df['category'] = df.apply(categorize, axis=1)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

________

## 2. Protein alignment and statistics

In [3]:
# Importando as bibliotecas necessárias
from Bio import SeqIO, Align
from Bio.Align import substitution_matrices, AlignInfo
import glob
import pandas as pd
import os
from itertools import combinations

# Definindo os diretórios
orthologues_dir = '../orthofinder/Orthogroups'
orthogroup_sequences_dir = '../orthofinder/Orthogroup_Sequences'

In [4]:
# Função para ler o arquivo Orthologues.tsv e criar a lista de nomes
def ler_orthologues_tsv(orthologues_path):
    orthologues_df = pd.read_csv(orthologues_path, sep='\t')
    orthologues_df.dropna(inplace=True)
    return list(orthologues_df.iloc[:, 0])

# Função para carregar sequências em um arquivo FASTA para SeqRecord
def carregar_seqrecord(fasta_path):
    return list(SeqIO.parse(fasta_path, 'fasta'))

In [5]:
# Definindo parametros do alinhamento
matrix = substitution_matrices.load("BLASTP")
aligner = Align.PairwiseAligner()
aligner.substitution_matrix = matrix

In [6]:
# Ler os ortogrupos de interesse
orthogroups = ler_orthologues_tsv(os.path.join(orthologues_dir, 'Orthogroups.tsv'))
orthogroupSeqRecord = carregar_seqrecord(os.path.join(orthogroup_sequences_dir, f'{orthogroups[20]}.fa'))

In [7]:
combination_list = list(combinations(orthogroupSeqRecord,2))

alignments = aligner.align(combination_list[0][0], combination_list[0][1])
alignment = alignments[0]

In [8]:
# Ler os ortogrupos de interesse
orthogroups = ler_orthologues_tsv(os.path.join(orthologues_dir, 'Orthogroups.tsv'))

# Inicializar o dataframe vazio
df_results = []

# Carregar as sequências de cada ortogrupo e alinhar os pares de sequências
for orthogroup in orthogroups:
    orthogroupSeqRecord = carregar_seqrecord(os.path.join(orthogroup_sequences_dir, f'{orthogroup}.fa'))

    results = []
    combination_list = list(combinations(orthogroupSeqRecord,2))

    for i in range(len(combination_list)):
        alignments = aligner.align(combination_list[i][0], combination_list[i][1])
        alignment = alignments[0]
  
        results.append({
        "orthogroup" : f'{orthogroup}',
        "query_id": alignment.query.id,
        "target_id": alignment.target.id,
        "per_identity": round((alignment.counts()[1] * 100)/alignment.length, 2),
        "length": alignment.length,
        "query_start": alignment.coordinates[0][0],
        "query_end": alignment.coordinates[0][-1],
        "query_cover": ((alignment.coordinates[0][-1] - alignment.coordinates[0][0] + 1)/len(alignment.query)) * 100,
        "subject_start": alignment.coordinates[1][0],
        "subject_end": alignment.coordinates[1][-1],
        'subject_cover': ((alignment.coordinates[1][-1] - alignment.coordinates[1][0] + 1)/len(alignment.target)) * 100,
        "score": round(alignment.score, 2),
        "gaps": alignment.counts()[0],
        "identities": alignment.counts()[1],
        "mismatches": alignment.counts()[2]
    })
    
    df_results.append(pd.DataFrame(results))
    
dfFinal = pd.concat(df_results)
dfFinal.to_csv('../results/orthogroup_alignments.tsv', sep='\t', index=False)

----

## 3. Selection of epilepsy targets

In [21]:
# Importar bibliotecas
import pandas as pd
from unipressed import IdMappingClient
import time
from Bio import SeqIO

In [8]:
#Loading data
epilepsy_genes = pd.read_excel("../00-documentation/EpilepsyGene_1482_IDs_EGNF.xlsx")
epilepsy_genes.drop(columns=["UniProt_ID"], inplace=True)
epilepsy_genes["Entrez_ID"] = epilepsy_genes["Entrez_ID"].astype(str)

# Getting the Entrez IDs and converting them to UniProt IDsa
entrez_epi = epilepsy_genes["Entrez_ID"].tolist()

In [5]:
# Getting matches IDs
request = IdMappingClient.submit(
    source="GeneID", dest="UniProtKB", ids={str(x) for x in entrez_epi}
)
time.sleep(1)
results_list = list(request.each_result())

In [9]:
entrez2uniprot = pd.DataFrame(results_list)
entrez2uniprot.rename(columns={"from": "Entrez_ID", "to": "UniProt_ID"}, inplace=True)
#entrez2uniprot.to_csv("../results/entrez2uniprot.epilepsy.tsv", sep="\t", index=False)
entrez2uniprot.groupby("Entrez_ID")['UniProt_ID'].apply(lambda x: ','.join(x)).reset_index()
entrez2uniprot['UniProt_ID'] = entrez2uniprot['UniProt_ID'].apply(lambda x: [x for x in x.split(',')])
epilepsy_ids = epilepsy_genes.merge(entrez2uniprot)
epilepsy_ids = epilepsy_ids.explode("UniProt_ID")

In [9]:
entrez2uniprot = pd.read_csv("../00-documentation/entrez2uniprot.epilepsy.csv")
entrez2uniprot['Entrez_ID'] = entrez2uniprot['Entrez_ID'].astype('str')
entrez2uniprot.groupby("Entrez_ID")['UniProt_ID'].apply(lambda x: ','.join(x)).reset_index()
entrez2uniprot['UniProt_ID'] = entrez2uniprot['UniProt_ID'].apply(lambda x: [x for x in x.split(',')])
epilepsy_ids = epilepsy_genes.merge(entrez2uniprot)
epilepsy_ids = epilepsy_ids.explode("UniProt_ID")

In [10]:
uniprot_epi = epilepsy_ids['UniProt_ID'].to_list()

In [12]:
len(uniprot_epi)

2849

In [13]:
# Loading the orthologues file
df_orthologues = pd.read_csv('../03-analysis/03.01-orthofinder/Orthologues/Orthologues_Human.UniProt.renamed/Human.UniProt.renamed__v__Zebrafish.UniProt.renamed.tsv', sep='\t')

# Manipulating dataframe to filter only the epilepsy genes
df_orthologues['Human.UniProt.renamed'] = df_orthologues['Human.UniProt.renamed'].apply(lambda x: [x for x in x.split(',')])
df_orthologues['Zebrafish.UniProt.renamed'] = df_orthologues['Zebrafish.UniProt.renamed'].apply(lambda x: [x for x in x.split(',')])
df_orthologues = df_orthologues.explode('Human.UniProt.renamed').explode('Zebrafish.UniProt.renamed')

In [14]:
orthologues_epi = df_orthologues[df_orthologues['Human.UniProt.renamed'].isin(uniprot_epi)]

In [26]:
orthologues_epi.nunique()

Orthogroup                   1261
Human.UniProt.renamed        1300
Zebrafish.UniProt.renamed    1505
dtype: int64

In [34]:
filter = list(orthologues_epi['Human.UniProt.renamed'].unique())

In [36]:
selected_seqs=[]

with open("../01-data/Human.UniProt.renamed.fasta", 'r') as f:
    for s_record in SeqIO.parse(f, 'fasta'):
        name = s_record.id
        seq = s_record.seq
        if name in filter:
            selected_seqs.append(s_record)

selected_seqs1=[]
selected_seqs2=[]

for i in range(len(selected_seqs)):
    if i <= 650:
        selected_seqs1.append(selected_seqs[i])
    else:
        selected_seqs2.append(selected_seqs[i])

with open("../01-data/Human.UniProt.Epi.1.fasta", 'w') as output_file_handle:
    SeqIO.write(selected_seqs1, output_file_handle, 'fasta')
    
with open("../01-data/Human.UniProt.Epi.2.fasta", 'w') as output_file_handle:
    SeqIO.write(selected_seqs2, output_file_handle, 'fasta')

## 5. Paralogue analysis

In [12]:
import pandas as pd
from itertools import combinations

In [None]:
#Loeading the orthologues file
df_orthologues = pd.read_csv('../orthofinder/Orthologues/Orthologues_Human.UniProt.renamed/Human.UniProt.renamed__v__Zebrafish.UniProt.renamed.tsv', sep='\t')

#Separating paralogous genes in human
hs_genes = df_orthologues["Human.UniProt.renamed"].str.strip().apply(lambda x: x.split(',')).explode().reset_index(drop=True).to_list()
hs_genes = [x.strip() for x in hs_genes]

#Separating paralogous genes in zebrafish
zb_genes = df_orthologues["Zebrafish.UniProt.renamed"].str.strip().apply(lambda x: x.split(',')).explode().reset_index(drop=True).to_list()
zb_genes = [x.strip() for x in zb_genes]

In [33]:
df = pd.read_table('../results/orthogroup_alignments.tsv')

#Separating paralogous genes in dataframes
hs_paralogues = df[(df['query'].isin(hs_genes)) & (df['target_id'].isin(hs_genes))]
zb_paralogues = df[(df['query'].isin(zb_genes)) & (df['target_id'].isin(zb_genes))]

#Separating orthologous genes in dataframes
zb_hs_orthologues = df[(df['query'].isin(zb_genes)) & (df['target_id'].isin(hs_genes))]

#Saving dataframes
hs_paralogues.to_csv('../results/hs_paralogues.all.tsv', sep='\t', index=False)
zb_paralogues.to_csv('../results/zb_paralogues.all.tsv', sep='\t', index=False)
zb_hs_orthologues.to_csv('../results/zb_hs_orthologues.all.tsv', sep='\t', index=False)