In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import re

### Build a lookup of SNP IDs (FABIAN format) from the gnomAD annotation table.
# Key   = first tab-separated column (the SNP ID).
# Value = second tab-separated column (the annotation that is compared
#         against "synonymous" further down in this script).
file = "gnomad/SNP_annot_allExons_fabian_gnomadV3_all.tsv"
dic_fabian_id = {}

with open(file) as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            # A blank line (e.g. an empty last line) holds no record; without
            # this guard the column access below raises IndexError.
            continue
        # Split once and reuse the fields instead of re-parsing the line.
        fields = stripped.split('\t')
        dic_fabian_id[fields[0]] = fields[1]

In [None]:
file = Path("K562_REF_sequences_raw.tsv")

# Directory prefix of the per-sequence FABIAN summary tables. The trailing
# slash matters: the sequence name is appended directly to this string.
FABIAN_RESULT_DIR = "/mnt/project/exonhancer/ZENODO_REPO/gnomAD_v3/1_compute_fabian/pipeline_exons/fabian_result/"

# A variant is counted as "strong" when at least one value in its column is
# above +threshold (gain) or below -threshold (loss).
STRONG_SCORE_THRESHOLD = 0.66

# Compiled once; they are applied to every column of every table.
_DIGITS_RE = re.compile(r'\d+')
_LETTERS_RE = re.compile(r'[a-zA-Z]+')


def _parse_variant_column(column_name):
    """Split a FABIAN column name into ``(position, ref, alt)``.

    The name is expected to look like ``<chrom>:<pos><ref>><alt>``, optionally
    followed by a ``.N`` suffix (presumably the one pandas appends to
    duplicated column names -- TODO confirm).

    Returns ``None`` when the name does not follow that layout (no ``:`` or
    ``>``, no digits, or no reference letters), so that stray columns are
    skipped instead of aborting the whole run with an IndexError.
    """
    try:
        pos_and_ref = column_name.split(":")[1].split(">")[0]
        alt = column_name.split(">")[1].split(".")[0]
    except IndexError:
        return None
    pos_match = _DIGITS_RE.search(pos_and_ref)
    ref_match = _LETTERS_RE.search(pos_and_ref)
    if pos_match is None or ref_match is None:
        return None
    return int(pos_match.group()), ref_match.group(), alt


def _count_strong_synonymous_snps(df, start, end):
    """Count strong synonymous SNPs of one table that fall in ``(start, end]``.

    Returns ``(gain, loss)``: the number of columns with at least one value
    above ``STRONG_SCORE_THRESHOLD`` and the number with at least one value
    below ``-STRONG_SCORE_THRESHOLD``. A single column can count for both.
    """
    gain = 0
    loss = 0
    for column_name in df.columns:
        parsed = _parse_variant_column(column_name)
        if parsed is None:
            continue
        position, ref, alt = parsed

        if len(ref) > 1 or len(alt) > 1:  # remove indel
            continue

        # > start because start is not included
        if not (start < position <= end):
            continue

        # An ID missing from the dictionary did not pass the gnomAD filter
        # (per the original author's note); only synonymous SNPs are counted.
        if dic_fabian_id.get(column_name.split('.')[0]) != "synonymous":
            continue

        # Work on a local float copy; the table itself does not need updating.
        scores = df[column_name].astype(float)
        if (scores > STRONG_SCORE_THRESHOLD).any():
            gain += 1
        if (scores < -STRONG_SCORE_THRESHOLD).any():
            loss += 1
    return gain, loss


# Per-sequence counts of strong synonymous SNPs. Only sequences with at least
# one strong gain or loss are kept in the two dictionaries.
dic_gain = {}
dic_loss = {}
with open(file) as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line: nothing to parse.
            continue
        start = int(fields[2])
        end = int(fields[3])
        name = fields[0]

        # Only the file read can raise FileNotFoundError, so only it is guarded.
        try:
            df = pd.read_csv(f"{FABIAN_RESULT_DIR}{name}.vcf_fabian_bilan.tsv", index_col=0, sep='\t')
        except FileNotFoundError:
            # No FABIAN table for this sequence: same as having no strong SNP.
            gain, loss = 0, 0
        else:
            gain, loss = _count_strong_synonymous_snps(df, start, end)

        if gain == 0 and loss == 0:
            # Drop the entry (also one left by an earlier line with the same name).
            dic_gain.pop(name, None)
            dic_loss.pop(name, None)
        else:
            dic_gain[name] = gain
            dic_loss[name] = loss

# Label every key of dic_loss: "LOSS" when its loss count is at least its
# gain count (ties therefore go to "LOSS"), otherwise "GAIN".
content = [
    [seq_name, "LOSS" if n_loss >= dic_gain[seq_name] else "GAIN"]
    for seq_name, n_loss in dic_loss.items()
]

# Write the labels as a two-column TSV (name, label) without header or index.
df1 = pd.DataFrame(content)
df_unique = df1.drop_duplicates()
df_unique.to_csv(
    "K562_Dic_SnpStrongType.tsv",
    sep="\t",
    header=False,
    index=False,
)