In [1]:
import pandas as pd
import numpy as np
from Bio.Seq import Seq
from Bio.Align import PairwiseAligner
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import os
import pickle

In [2]:
# Load the variants table that every later cell works on.
# NOTE(review): unpickling can execute arbitrary code — only read pickles from a trusted source.
variants = pd.read_pickle('00_dataframes/cDNA_protSeq_first_vars')

In [3]:
# Write two FASTA-style records per variant: the alternative sequence under the
# 'shortcut' header, followed by the reference sequence under the gene symbol header.
with open('prot_seqs_folding.txt', 'w') as fasta_out:
    # Columns are zipped in the same order the format string consumes them.
    per_variant = zip(variants['shortcut'], variants['termin_prot'],
                      variants['HUGO_Symbol'], variants['ref_protein_seq'])
    fasta_out.writelines('>%s\n%s\n>%s\n%s\n' % fields for fields in per_variant)

In [4]:
# Import the InterProScan TSVs as a dictionary of DataFrames.
files = os.listdir('03_for_cDNA/00_ip_scans')

# Names for all columns of a TSV line (passing names= makes pandas read the first line as data) ...
tsv_columns = ['seq', 'ID', 'prot_len', 'where_data_from', 'domain_short', 'info_domain', 'start_pos',
               'end_pos', 'probab', 'sth', 'date', 'ipr_id', 'more_info', 'go_id', 'many']
# ... and the subset that is actually kept.
kept_columns = ['where_data_from', 'domain_short', 'info_domain', 'start_pos', 'end_pos',
                'ipr_id', 'more_info', 'go_id']

ip_dfs = {}
for file in files:
    scan = pd.read_csv('03_for_cDNA/00_ip_scans/' + file, sep='\t',
                       names=tsv_columns, usecols=kept_columns)
    # Dictionary key: second '_'-separated token of the file name, extension dropped, upper-cased.
    scan_key = file.split('_')[1].split('.')[0].upper()
    # Rows are ordered by start position, then end position, with a fresh 0..n-1 index.
    ordered = scan.sort_values(by=['start_pos', 'end_pos'])
    ip_dfs[scan_key] = ordered.reset_index(drop=True)

In [41]:
#pickle.dump(ip_dfs, open('00_dataframes/ips_dict', 'wb'))

In [5]:
# check which domains are affected
def check_domains(ind, variants_df=None, ip_scans=None):
    """Return the domains whose range contains the variant's protein change position.

    Parameters
    ----------
    ind : int
        Positional row number of the variant (passed to ``.iloc``).
    variants_df : pandas.DataFrame, optional
        Table with 'HUGO_Symbol' and 'prot_change_pos' columns.
        Defaults to the notebook-level ``variants``.
    ip_scans : dict, optional
        Maps a gene symbol to a DataFrame with 'start_pos', 'end_pos' and
        'domain_short' columns. Defaults to the notebook-level ``ip_dfs``.

    Returns
    -------
    list
        The 'domain_short' value of every row with
        start_pos <= prot_change_pos <= end_pos, in table order. Empty when the
        gene has no entry in ``ip_scans`` or the position is not an integer.
    """
    if variants_df is None:
        variants_df = variants
    if ip_scans is None:
        ip_scans = ip_dfs

    row = variants_df.iloc[ind]  # build the row once instead of once per field
    gene = row['HUGO_Symbol']
    pos_prot_change = row['prot_change_pos']

    # NumPy integers are accepted as well: values read from an integer-dtype column
    # may arrive as np.int64, which an exact `type(...) == int` test rejects.
    # bool is still excluded, as it was before.
    is_int_pos = (isinstance(pos_prot_change, (int, np.integer))
                  and not isinstance(pos_prot_change, bool))
    if gene not in ip_scans or not is_int_pos:
        return []

    ip_scan = ip_scans[gene]
    return [dom
            for s, e, dom in zip(ip_scan['start_pos'], ip_scan['end_pos'], ip_scan['domain_short'])
            if s <= pos_prot_change <= e]

# check which domains are deleted
def deleted_domains(ind, variants_df=None, ip_scans=None):
    """Return the domains that start at or after the end of the terminated protein.

    Parameters
    ----------
    ind : int
        Positional row number of the variant (passed to ``.iloc``).
    variants_df : pandas.DataFrame, optional
        Table with 'HUGO_Symbol', 'prot_change_pos' and 'termin_prot_len' columns.
        Defaults to the notebook-level ``variants``.
    ip_scans : dict, optional
        Maps a gene symbol to a DataFrame with 'start_pos' and 'domain_short'
        columns. Defaults to the notebook-level ``ip_dfs``.

    Returns
    -------
    list
        The 'domain_short' value of every row with termin_prot_len <= start_pos,
        in table order. Empty when the gene has no entry in ``ip_scans`` or
        'prot_change_pos' is not an integer.
    """
    if variants_df is None:
        variants_df = variants
    if ip_scans is None:
        ip_scans = ip_dfs

    row = variants_df.iloc[ind]  # build the row once instead of once per field
    gene = row['HUGO_Symbol']
    pos_prot_change = row['prot_change_pos']
    termin_prot_len = row['termin_prot_len']

    # NOTE(review): the gate is on prot_change_pos although only termin_prot_len is
    # compared below — kept as it was; confirm that this is intended.
    # NumPy integers are accepted as well, since values read from an integer-dtype
    # column may arrive as np.int64; bool is still excluded.
    is_int_pos = (isinstance(pos_prot_change, (int, np.integer))
                  and not isinstance(pos_prot_change, bool))
    if gene not in ip_scans or not is_int_pos:
        return []

    ip_scan = ip_scans[gene]
    # NOTE(review): `<=` also counts a domain starting exactly at termin_prot_len —
    # confirm this matches the position convention of the scan table.
    return [dom
            for s, dom in zip(ip_scan['start_pos'], ip_scan['domain_short'])
            if termin_prot_len <= s]

# new columns with affected/deleted domains
# Both helpers look a row up with .iloc, so they are given row positions rather than
# index labels; the two only coincide when the index is the default 0..n-1 range.
row_positions = range(len(variants))
variants['domains_effected'] = [check_domains(pos) for pos in row_positions]
variants['domains_deleted'] = [deleted_domains(pos) for pos in row_positions]

In [7]:
#variants.to_pickle('03_for_cDNA/vars_prot_ips')

### FASTA file with protein sequences/IDs

In [None]:
# Index labels of the variants whose alternative and reference protein sequences
# differ in length by fewer than 5 characters.
[
    label
    for label, alt_seq, ref_seq in zip(variants.index,
                                       variants['termin_prot'],
                                       variants['ref_protein_seq'])
    if abs(len(ref_seq) - len(alt_seq)) < 5
]

In [None]:
# Create a FASTA file holding, for each variant, an 'alt' and a 'ref' record
# that share the variant ID as header.
with open('03_for_cDNA/fasta_form_first_seq.fa', 'w') as fasta_out:
    records = zip(variants['ID_variant'], variants['termin_prot'], variants['ref_protein_seq'])
    for variant_id, alt_seq, ref_seq in records:
        fasta_out.write('>%s alt\n%s\n' % (variant_id, alt_seq))
        fasta_out.write('>%s ref\n%s\n' % (variant_id, ref_seq))

### Alignments

In [None]:
# Example pair for the alignment experiments, taken from the variant at row position 9:
# the first 50 characters of the reference sequence and the full alternative sequence.
example_variant = variants.iloc[9]
seq_ref1 = Seq(example_variant['ref_protein_seq'][:50])
seq_alt1 = Seq(example_variant['termin_prot'])

In [None]:
# Aligner in local mode; the cell output is the number of alignments it reports
# for the example pair.
aligner = PairwiseAligner(mode='local')
alignments = aligner.align(seq_ref1, seq_alt1)
len(alignments)

In [None]:
# Alignment score for the example pair under the current aligner settings.
# Note the arguments are in the opposite order to the align() call above.
aligner.score(seq_alt1, seq_ref1)

In [None]:
# Print the aligner's text representation (presumably its current parameter settings — confirm).
print(aligner)

In [None]:
# Show only the first few alignments: a local alignment can have a very large number
# of equally scoring solutions, and printing every one would flood the cell output.
MAX_ALIGNMENTS_SHOWN = 5
for shown, al in enumerate(alignments):
    if shown >= MAX_ALIGNMENTS_SHOWN:
        break
    print(al)

In [None]:
# Global alignment of the same example pair through the pairwise2 interface.
# NOTE(review): Bio.pairwise2 is deprecated in recent Biopython releases in favour of
# PairwiseAligner — confirm against the installed version.
align1 = pairwise2.align.globalxx(seq_ref1, seq_alt1)

In [None]:
# Length of the alternative ('termin_prot') sequence of the variant at row position 9,
# the same row the example sequences are taken from.
len(variants.iloc[9]['termin_prot'])

In [None]:
# alignment_function has to be given the name of the alignment routine it wraps;
# called without one it raises TypeError. The routine used for align1 is passed here.
pairwise2.align.alignment_function('globalxx')