In [None]:
import os
import sys
import pandas as pd
import requests, re, subprocess
from bs4 import BeautifulSoup
import datetime
import csv

import gzip
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from scipy.stats import fisher_exact

In [None]:
# Download the ClinVar ACMG page and isolate the element with id="maincontent".
acmg_url = 'https://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/'

# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(acmg_url, timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
results = soup.find(id="maincontent")
if results is None:
    # soup.find returns None when nothing matches; later cells call results.find_all
    raise RuntimeError(f'No element with id="maincontent" found at {acmg_url}')

In [None]:
# 0-based index of the table column to read (presumably the gene symbol column — confirm
# against the page layout).
column_index = 2

# Take that column from every table row that has enough <td> cells and drop any
# parenthesised text from the cell content.
acmg_genes = [
    re.sub(r'\([^)]*\)', '', cells[column_index].text.strip()).strip()
    for cells in (table_row.find_all('td') for table_row in results.find_all('tr'))
    if len(cells) > column_index
]

In [None]:
# Deduplicate the scraped values; note that a set has no defined iteration order.
acmg_set = set(acmg_genes)

In [None]:
def fetch_endpoint(server, request, content_type, timeout=60):
    """Send a GET request to ``server + request`` and return the decoded response.

    Parameters
    ----------
    server : str
        Base URL; it is concatenated verbatim with ``request``.
    request : str
        Path (and optional query string) appended to ``server``.
    content_type : str
        Value sent in the ``Accept`` header. ``'application/json'`` makes the
        function return the parsed JSON body; any other value returns the raw text.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    dict | list | str
        Parsed JSON for ``'application/json'``, otherwise the response text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(server + request, headers={"Accept": content_type}, timeout=timeout)

    # raise_for_status() raises for every non-OK response, so the former
    # sys.exit() after it could never run and has been removed.
    r.raise_for_status()

    if content_type == 'application/json':
        return r.json()
    return r.text

In [None]:
def get_gene(gene: str, grch: str='grch37'):
    """Look up a human gene symbol via the Ensembl REST ``lookup/symbol`` endpoint.

    Parameters
    ----------
    gene : str
        Gene symbol, inserted into ``lookup/symbol/homo_sapiens/{gene}``.
    grch : str, optional
        Assembly selecting the REST server: ``'grch37'`` or ``'grch38'``.

    Returns
    -------
    The JSON payload returned by ``fetch_endpoint`` for the lookup request.

    Raises
    ------
    ValueError
        If ``grch`` is neither ``'grch37'`` nor ``'grch38'``.
    """
    # One REST server per supported assembly
    servers = {
        'grch37': 'http://grch37.rest.ensembl.org/',
        'grch38': 'http://rest.ensembl.org/',
    }
    if grch not in servers:
        # ValueError is still an Exception, so existing broad handlers keep working
        raise ValueError('Please provide a correct grch assembly! (grch37/grch38 are supported)')
    ens_serv: str = servers[grch]

    # Build the request and ask for a JSON response
    ens_ext: str = f'lookup/symbol/homo_sapiens/{gene}'
    con: str = 'application/json'

    return fetch_endpoint(ens_serv, ens_ext, con)

In [None]:
# Write one tab-separated line (chromosome, start, end) per gene, padded on both sides.
# NOTE(review): `wgs_mount` is not defined in the cells shown here — it must be set earlier.
output_file = f'{wgs_mount}/regions.bed'
padding = 10000  # bases added before the gene start and after the gene end

bed_lines = []
# sorted(): a set has no stable order, so sorting makes the file reproducible
for gene in sorted(acmg_set):
    gene_json = get_gene(gene)
    chromosome = gene_json['seq_region_name']
    # Clamp at 0 so a gene close to the chromosome start never yields a negative coordinate
    start = max(0, gene_json['start'] - padding)
    end = gene_json['end'] + padding
    bed_lines.append(f'{chromosome}\t{start}\t{end}')

# Mode 'w' instead of 'a': re-running the cell must replace the file. Appending glued the
# new first line onto the previous last line (no trailing newline) and duplicated regions.
with open(output_file, 'w') as f:
    f.write('\n'.join(bed_lines))

In [None]:
# Subset the gnomAD sites file to the regions in regions.bed, then index the result.
regions_bed = f'{wgs_mount}/regions.bed'
gnomad_sites = f'{wgs_mount}/gnomad.genomes.r2.1.1.sites.vcf.bgz'
gnomad_regions = f'{wgs_mount}/gnomad.genomes.regions.r2.1.1.sites.vcf.gz'

os.system(f'bcftools view --regions-file {regions_bed} -Oz {gnomad_sites} > {gnomad_regions}')
os.system(f'bcftools index {gnomad_regions}')

In [None]:
# Dump CHROM, POS, ID, REF, ALT, QUAL, FILTER and INFO/AF (space-separated) with bcftools query.
# The "\n" is turned into a real newline by Python before the shell sees the command.
# NOTE(review): the output file is named *.vcf.gz although `bcftools query` writes a
# text table — confirm that downstream readers expect plain text.
os.system(f'bcftools query -f "%CHROM %POS %ID %REF %ALT %QUAL %FILTER %INFO/AF\n" -o {wgs_mount}/extracted_gnomad.genomes.regions.r2.1.1.sites.vcf.gz {wgs_mount}/gnomad.genomes.regions.r2.1.1.sites.vcf.gz')


In [None]:
# Comma-separated annotation field names; interpolated into the `--fields` option of the
# VEP command in the following cell.
fields = "AF,MAX_AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,Existing_variation,Allele,Consequence,Gene,NMD,SYMBOL,SYMBOL_SOURCE,IMPACT,ClinVar,ClinVar_CLNREVSTAT,ClinVar_AF_EXAC,ClinVar_CLNDISDB,ClinVar_CLNDISDBINCL,ClinVar_CLNSIG,ClinVar_CLNSIGCONF,ClinVar_CLNSIGINCL,ClinVar_CLNVC,ClinVar_GENEINFO,ClinVar_RS,gnomADg,gnomADg_AN,gnomADg_AC,gnomADg_AF,PUBMED,SIFT,PolyPhen,BIOTYPE,Feature_type,Feature,CLIN_SIG,MANE_SELECT"

**TODO:** Remove `MANE_SELECT` from the `fields` list above, and also from every other place in the notebook where it is used.

In [None]:
# Run Ensembl VEP inside Docker with the host directory `wgs_mount` mounted at /data.
# Each backslash at the end of a line sits inside the string literal, so Python drops the
# line break and the shell receives a single-line command.
# Input is /data/gnomad_annotation/xai.vcf, output is /data/out_xai.vcf; two --custom
# sources (ClinVar and the region-restricted gnomAD file) are added to the annotation.
os.system(f'docker run -i -v {wgs_mount}:/data ensemblorg/ensembl-vep vep \
--offline -i /data/gnomad_annotation/xai.vcf --assembly GRCh37 --vcf --af --max_af --af_1kg \
--fields {fields} --per_gene -o /data/out_xai.vcf --force_overwrite --mane --fork 5 --plugin NMD \
--custom /data/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNREVSTAT,AF_EXAC,CLNDISDB,CLNDISDBINCL,CLNSIG,CLNSIGCONF,CLNSIGINCL,CLNVC,GENEINFO,RS \
--custom file=/data/gnomad.genomes.regions.r2.1.1.sites.vcf.gz,short_name=gnomADg,format=vcf,type=exact,coords=0,fields=AN%AC%AF%AF_afr%AF_amr%AF_asj%AF_eas%AF_fin%AF_nfe%AF_oth')

In [None]:
# Column names applied to the headerless variant table loaded in the next cell.
column_names = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'] # TODO: confirm whether the last column is really INFO or an extracted INFO/AF value

In [None]:
# Load the tab-separated variant table without a header row.
df1 = pd.read_csv('gnomad_annotation/annotated_mane/out_combined_below5.vcf', sep='\t', names=column_names)
# Values that are not numeric become NaN.
# TODO: double check INFO — it may first need splitting on ';' or similar.
df1['INFO'] = pd.to_numeric(df1['INFO'], errors='coerce')
print('df loaded')

# Keep rows whose numeric INFO value is at most 0.05, plus rows where it is missing.
at_most_five_percent = df1['INFO'].le(0.05)
value_missing = df1['INFO'].isna()
df_5 = df1[at_most_five_percent | value_missing]
print('below 5 done')
df_5.to_csv('gnomad_annotation/annotated_mane/below5_11.csv', index=False, header=False)

In [None]:
# Load the ClinVar variant summary and drop the GRCh38 records.
df_clinvar = pd.read_csv('clinvar/variant_summary.txt', sep='\t')

# .copy() makes the filtered frame independent, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
df_clinvar = df_clinvar[df_clinvar['Assembly'] != 'GRCh38'].copy()

# Build the "chrom-pos-ref-alt" key column-wise. The formatting is the same f-string as
# before, but zipping the columns avoids a slow row-by-row DataFrame.apply.
df_clinvar['POS_ID'] = [
    f"{chrom}-{pos}-{ref}-{alt}"
    for chrom, pos, ref, alt in zip(
        df_clinvar['Chromosome'],
        df_clinvar['PositionVCF'],
        df_clinvar['ReferenceAlleleVCF'],
        df_clinvar['AlternateAlleleVCF'],
    )
]


In [None]:
# Load the annotated gnomAD table (no header row) and attach the ClinVar records.
df_gnom = pd.read_csv('gnomad_annotation/annotated_mane/out_gnomad_mane_add_annot_done.csv', sep=',', names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'])

# "chrom-pos-ref-alt" key, built column-wise instead of with a slow row-wise apply.
df_gnom['POS_ID'] = [
    f"{chrom}-{pos}-{ref}-{alt}"
    for chrom, pos, ref, alt in zip(df_gnom['CHROM'], df_gnom['POS'], df_gnom['REF'], df_gnom['ALT'])
]

# Left merge: every gnomAD row is kept; rows without a ClinVar match get NaN in the
# ClinVar columns.
merged_df = pd.merge(df_gnom, df_clinvar, on='POS_ID', how='left')
merged_df.to_csv('gnomad_annotation/annotated_mane/clinvar_add_annot.csv', index=False, header=False)

In [None]:
# Column names used when re-reading the headerless merged CSV files: the eight variant
# columns, the POS_ID key, then the ClinVar summary columns.
columns = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','POS_ID','AlleleID','Type','Name','GeneID','GeneSymbol','HGNC_ID','ClinicalSignificance','ClinSigSimple','LastEvaluated','RS# (dbSNP)','nsv/esv (dbVar)','RCVaccession','PhenotypeIDS','PhenotypeList','Origin','OriginSimple','Assembly','ChromosomeAccession','Chromosome','Start','Stop','ReferenceAllele','AlternateAllele','Cytogenetic','ReviewStatus','NumberSubmitters','Guidelines','TestedInGTR','OtherIDs','SubmitterCategories','VariationID','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']

In [None]:
# Keep variants annotated as HIGH impact or classified (likely) pathogenic.
df_clin = pd.read_csv('gnomad_annotation/annotated_mane/clinvar_add_annot.csv', sep=',', names=columns)

# na=False: str.contains returns NaN for missing values, and a mask containing NaN is not
# a valid boolean indexer. Missing values are treated as "no match".
high_impact = df_clin['INFO'].str.contains('HIGH', na=False)

significance = df_clin['ClinicalSignificance']
# 'Pathogenic' already matches 'Pathogenic/Likely pathogenic'. 'Likely pathogenic' needs
# its own test because the match is case-sensitive (lower-case "p").
pathogenic = (
    significance.str.contains('Pathogenic', na=False)
    | significance.str.contains('Likely pathogenic', na=False)
)

final_clin_df = df_clin[high_impact | pathogenic]
final_clin_df.to_csv('gnomad_annotation/annotated_mane/fin_clin_add_annot.csv', index=False, header=False)

In [None]:
# Write the ID column of the combined table to a text file, one identifier per line.
df = pd.read_csv('gnomad_annotation/combined_clin.csv', names=columns, sep=',')
ids = df.loc[:, 'ID']
ids.to_csv('gnomad_annotation/rs_ids_combined_clin.txt', header=False, index=False)

In [None]:
# Read all exported identifiers.
with open('gnomad_annotation/rs_ids_combined_clin.txt', 'r') as file:
    lines = file.readlines()

# Drop every entry that contains a '.' (the placeholder for a missing ID) and strip
# surrounding whitespace from the rest.
filtered_lines = []
for raw_line in lines:
    if '.' in raw_line:
        continue
    filtered_lines.append(raw_line.strip())

# One identifier per line.
with open('gnomad_annotation/rs_ids.txt', 'w') as file:
    file.writelines(f'{identifier}\n' for identifier in filtered_lines)

In [None]:
# For rows whose ID is the '.' placeholder, export CHROM and POS as a tab-separated file.
df = pd.read_csv('gnomad_annotation/combined_clin.csv', names=columns, sep=',')
filtered_df = df.loc[df['ID'].eq('.')]
filtered_values = filtered_df.loc[:, ['CHROM', 'POS']]
filtered_values.to_csv('gnomad_annotation/test_rs_dots.txt', sep='\t', header=False, index=False)

In [None]:
# Select the records of the region-restricted gnomAD file whose ID is listed in rs_ids.txt.
# The output goes through shell redirection, so the result is an uncompressed VCF.
os.system(f"bcftools view -i ID=@gnomad_annotation/rs_ids.txt gnomad_annotation/gnomad.genomes.regions.r2.1.1.sites.vcf.gz > gnomad_annotation/gnomad_zip_rows_tomatch.vcf")

In [None]:
# Select the records at the CHROM/POS pairs listed in test_rs_dots.txt (variants without an ID).
# NOTE(review): a position-based selection can return every record at that position,
# not only the allele of interest — confirm this is acceptable downstream.
os.system(f"bcftools view -R gnomad_annotation/test_rs_dots.txt gnomad_annotation/gnomad.genomes.regions.r2.1.1.sites.vcf.gz > gnomad_annotation/gnomad_zip_rows_tomatch_dots.vcf")

In [None]:
def read_genotyped_file(path_to_file: str, separators=["\t", ","], possible_number_of_cols = [4,5],
                        possible_header_starts = ["RSID", "rsid", "#CHROM", "rsID"]) -> pd.DataFrame:
    """Read a genotype / variant table whose exact layout is not known in advance.

    The file is probed line by line to find a header:

    * a path ending in ``xlsx`` is read directly with ``pd.read_excel``;
    * if a line starting with ``[`` is met, the file is read as tab-separated with the
      first 10 rows skipped;
    * otherwise the last leading ``#`` line (or, when there is none, the first line) is
      taken as the header candidate.

    Parameters
    ----------
    path_to_file : str
        Path of the file to read.
    separators : list of str
        Field separators to try, in order.
    possible_number_of_cols : list of int
        Column counts that end the separator search early when matched.
    possible_header_starts : list of str
        Substrings that mark the header candidate as a real header line.

    Returns
    -------
    pd.DataFrame
        The frame from the first separator yielding an expected column count, otherwise
        the frame produced by the last separator that was tried.

    Raises
    ------
    ValueError
        If no separator could be used to parse the file.
    """
    # Excel workbooks are handed straight to pandas
    if path_to_file.endswith('xlsx'):
        return pd.read_excel(path_to_file, engine='openpyxl')

    header = None
    skip_pattern = False
    with open(path_to_file, 'r') as f:
        for line in f:
            if line.startswith("["):
                skip_pattern = True
                break

            if line.startswith('#'):
                # keep overwriting: the last '#' line before the data wins
                header = line
            else:
                if header is None:
                    header = line
                break  # first non-comment line reached

    if skip_pattern:
        return pd.read_csv(path_to_file, sep="\t", skiprows=10)

    if header is None:
        # Empty file: nothing to inspect, let pandas report the problem below
        header = ''

    is_header = any(header_start in header for header_start in possible_header_starts)

    data = None
    if is_header:
        header_is_comment = header.startswith('#')
        # Only strip the leading '#' when it is there. Slicing unconditionally used to cut
        # the first letter off an uncommented header ("rsid" -> "sid").
        header_text = header[1:] if header_is_comment else header

        for separator in separators:
            header_names = header_text.strip().split(separator)
            if len(header_names) == 1:
                continue  # this separator does not split the header

            data = pd.read_csv(
                path_to_file, sep=separator, comment='#', names=header_names,
                # A commented header is skipped by comment='#'. An uncommented one is the
                # first parsed row and must be replaced, not read as a data row.
                header=None if header_is_comment else 0,
            )

            if data.shape[1] in possible_number_of_cols:
                break
    else:
        for separator in separators:
            data = pd.read_csv(path_to_file, sep=separator, comment='#')

            if data.shape[1] in possible_number_of_cols:
                break

    if data is None:
        # Previously this fell through to an UnboundLocalError
        raise ValueError(f'Could not parse {path_to_file!r} with any of the separators {separators!r}')

    return data

In [None]:
# Load the two bcftools extracts: records matched by ID and records matched by position.
df_gnomad = read_genotyped_file(f'gnomad_annotation/gnomad_zip_rows_tomatch.vcf')
df_gnomad_2 = read_genotyped_file(f'gnomad_annotation/gnomad_zip_rows_tomatch_dots.vcf')

In [None]:
# Stack both extracts into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([df_gnomad, df_gnomad_2], ignore_index=True)

In [None]:
# Re-load the combined ClinVar-annotated table (headerless CSV) with the shared column names.
# NOTE(review): this rebinds `df_clin` from an earlier cell — confirm that is intended.
df_clin = pd.read_csv('gnomad_annotation/annotated_mane/combined_mane_clin.csv', sep=',', names=columns)

In [None]:
def extract_info(row, chosen_fields):
    """Pick selected '|'-separated values out of the CSQ annotation of one row.

    ``row['CSQ']`` is split on the literal ``'CSQ='`` and the text after its first
    occurrence is split on ``'|'``. The values at the positions listed in
    ``chosen_fields`` are returned as a ``pd.Series``, with empty strings replaced by ``'-'``.
    """
    annotation = row['CSQ'].split('CSQ=')[1]
    parts = annotation.split('|')
    picked = [parts[position] or '-' for position in chosen_fields]
    return pd.Series(picked)

def process_dataframe(df, chosen_indices, new_column_names):
    """Insert the extracted CSQ values as new columns right after the 'CSQ' column.

    ``extract_info`` is applied to every row of ``df``; the resulting columns are named
    ``new_column_names``. Columns whose name already occurred further left are dropped
    from the result, which also removes the second copy of 'CSQ'.
    """
    extracted = df.apply(extract_info, axis=1, args=(chosen_indices,))
    extracted.columns = new_column_names

    # Everything up to and including 'CSQ', the new columns, then 'CSQ' and all that follows
    up_to_csq = df.loc[:, :'CSQ']
    from_csq = df.loc[:, 'CSQ':]
    widened = pd.concat([up_to_csq, extracted, from_csq], axis=1)

    # Keep only the first occurrence of every column name
    first_occurrence = ~widened.columns.duplicated()
    return widened.loc[:, first_occurrence]

In [None]:
# Load the combined table and drop the GRCh38 records.
df = pd.read_excel('gnomad_annotation/annotated_mane/combined_mane_clin_test.xlsx')
df = df[df['Assembly'] != 'GRCh38']

# Names for the extracted CSQ values, in CSQ field order.
new_column_names = [
    'AF', 'MAX_AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
    'Existing_variation', 'Allele', 'Consequence', 'Gene', 'NMD', 'SYMBOL',
    'SYMBOL_SOURCE', 'IMPACT',
    'ClinVar', 'ClinVar_CLNREVSTAT', 'ClinVar_AF_EXAC', 'ClinVar_CLNDISDB',
    'ClinVar_CLNDISDBINCL', 'ClinVar_CLNSIG', 'ClinVar_CLNSIGCONF',
    'ClinVar_CLNSIGINCL', 'ClinVar_CLNVC', 'ClinVar_GENEINFO', 'ClinVar_RS',
    'gnomADg', 'gnomADg_AN', 'gnomADg_AC', 'gnomADg_AF',
    'PUBMED', 'SIFT', 'PolyPhen', 'BIOTYPE', 'Feature_type', 'Feature',
    'CLIN_SIG', 'MANE_SELECT',
]
# Every CSQ position 0..37 is extracted, one per name above.
chosen_indices = list(range(len(new_column_names)))

# Add the extracted values as columns next to 'CSQ'.
df_all = process_dataframe(df, chosen_indices, new_column_names)

# Drop rows whose NMD value is 'NMD_escaping_variant'.
filtered_df = df_all.loc[df_all['NMD'].ne('NMD_escaping_variant')]

# Save to Excel.
filtered_df.to_excel('gnomad_annotation/annotated_mane/gnomad_mane_test_final_1.xlsx', index=False)

In [None]:
# NOTE(review): the preceding cell writes 'gnomad_mane_test_final_1.xlsx', while this one
# reads 'gnomad_mane_test_final.xlsx' — confirm which file is the intended input.
df = pd.read_excel('gnomad_annotation/annotated_mane/gnomad_mane_test_final.xlsx')

def extract_population_freq(info_string, population):
    """Return the allele count and allele number of one population from an INFO string.

    Looks for ``AC_<population>=<int>;AN_<population>=<int>`` (the two keys directly
    after each other) and returns both numbers as floats.

    Parameters
    ----------
    info_string : str
        INFO text to search. Anything that is not a string (for example NaN from an
        empty cell) yields ``(None, None)``.
    population : str
        Population suffix, for example ``'afr'`` or ``'nfe_nwe'``.

    Returns
    -------
    tuple
        ``(ac, an)`` as floats, or ``(None, None)`` when the pair is not found.
    """
    if not isinstance(info_string, str):
        return None, None

    key = re.escape(population)
    # The look-behind stops the pattern from matching inside a longer key such as
    # "controls_AC_afr=", which an unanchored search would pick up if it came first.
    pattern = rf"(?<![A-Za-z0-9_])AC_{key}=([0-9]+);AN_{key}=([0-9]+)"
    match = re.search(pattern, info_string)
    if match:
        ac, an = match.groups()
        return float(ac), float(an)
    return None, None
    
# Populations whose AC/AN pairs are pulled out of INFO ('nfe' as a whole is left out).
populations = ['amr', 'asj', 'afr', 'eas', 'fin', 'nfe_nwe', 'nfe_seu', 'nfe_onf', 'nfe_est']   # 'nfe'

for population in populations:
    # TODO: check later whether the source column should be INFO_x instead of INFO
    ac_an_pairs = [extract_population_freq(info, population) for _, info in df['INFO'].items()]
    df[f'AC_{population}'] = [pair[0] for pair in ac_an_pairs]
    df[f'AN_{population}'] = [pair[1] for pair in ac_an_pairs]

# Frequencies are added in a second pass so that all AC/AN columns come first.
for population in populations:
    allele_count = df[f'AC_{population}']
    allele_number = df[f'AN_{population}']
    df[f'Frequency_{population}'] = allele_count / allele_number

df.to_excel('gnomad_annotation/annotated_mane/gnomad_mane_populations_final.xlsx', index=False)

### Statistical analysis

In [None]:
# NOTE(review): the file is read from the working directory here, whereas it was written
# to 'gnomad_annotation/annotated_mane/' above — confirm the path.
df = pd.read_excel('gnomad_mane_populations_final.xlsx')

In [None]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

# Save the de-duplicated frame; it is re-loaded from this file in the next cell.
output_path = 'cleaned_file.xlsx'  # output file, relative to the working directory
df.to_excel(output_path, index=False)

In [None]:
# Re-load the frame written by the previous cell.
df = pd.read_excel('cleaned_file.xlsx')

# Keep only the first row for every POS_ID value.
df_cleaned = df.drop_duplicates(subset=['POS_ID'])

# Save the result; it is the input of the submitter filter below.
output_path = 'cleaned_file_posid_dups.xlsx'  # output file, relative to the working directory
df_cleaned.to_excel(output_path, index=False)

**TODO:** Keep only one of the two `drop_duplicates` steps above in the final, cleaned-up version of the notebook.

In [None]:
# Load the POS_ID-de-duplicated table (the next cell reads the same file again).
df = pd.read_excel('cleaned_file_posid_dups.xlsx')

In [None]:
# Load the POS_ID-de-duplicated table.
file_path = 'cleaned_file_posid_dups.xlsx'
df = pd.read_excel(file_path)

# Keep rows with at least two submitters (NumberSubmitters), or with no count at all.
at_least_two_submitters = df['NumberSubmitters'].ge(2)
submitter_count_missing = df['NumberSubmitters'].isna()
df_filtered = df[at_least_two_submitters | submitter_count_missing]

# Drop the rows flagged as conflicting interpretations.
not_conflicting = df_filtered['ClinicalSignificance'].ne('Conflicting interpretations of pathogenicity')
df_filtered_ag = df_filtered[not_conflicting]

# Save the filtered frame.
output_path = 'filtered_more_submits.xlsx'
df_filtered_ag.to_excel(output_path, index=False)

In [None]:
# Load the submitter-filtered table (the next cell reads the same file again).
df = pd.read_excel('filtered_more_submits.xlsx')

In [None]:
# Load the submitter-filtered table.
file_path = 'filtered_more_submits.xlsx'
df = pd.read_excel(file_path)

# ClinVar classifications that count as (likely) pathogenic.
lst = ['Pathogenic','Pathogenic/Likely pathogenic','Likely pathogenic','Pathogenic; drug response','Likely pathogenic; drug response']

# Filter the frame that was just loaded. The previous version filtered `df_filtered`, a
# variable left over from an earlier cell, so the loaded file was ignored and the cell
# failed on a fresh kernel.
# Rows without any classification are kept as well.
is_pathogenic = df['ClinicalSignificance'].isin(lst)
is_unclassified = df['ClinicalSignificance'].isna()
df_filtered_ag = df.loc[is_pathogenic | is_unclassified]

# Save the filtered frame.
output_path = 'filtered_more_significance.xlsx'
df_filtered_ag.to_excel(output_path, index=False)

In [None]:
# Load the significance-filtered table (variant that keeps unclassified rows).
df = pd.read_excel('filtered_more_significance.xlsx')

In [None]:
# Load the submitter-filtered table.
file_path = 'filtered_more_submits.xlsx'
df = pd.read_excel(file_path)

# ClinVar classifications that count as (likely) pathogenic.
lst = ['Pathogenic','Pathogenic/Likely pathogenic','Likely pathogenic','Pathogenic; drug response','Likely pathogenic; drug response']

# Filter the frame that was just loaded. The previous version filtered `df_filtered`, a
# variable left over from an earlier cell, so the loaded file was ignored and the cell
# failed on a fresh kernel.
# Strict variant: rows without a classification are dropped.
df_filtered_ag = df.loc[df['ClinicalSignificance'].isin(lst)]

# Save the filtered frame.
output_path = 'filtered_more_significance_test.xlsx'
df_filtered_ag.to_excel(output_path, index=False)

In [None]:
# Load the strictly filtered table; `df` is the input of the per-gene statistics below.
df = pd.read_excel('filtered_more_significance_test.xlsx')

In [None]:
# Group the variants by the SYMBOL column.
# NOTE(review): the cells shown here call df.groupby('SYMBOL') again rather than
# using `grouped_df` — this may be dead code.
grouped_df = df.groupby('SYMBOL')

In [None]:
# Population-specific allele-number (AN) and allele-count (AC) columns ('nfe' as a whole is left out).
an_columns = ['AN_amr', 'AN_asj', 'AN_afr', 'AN_eas', 'AN_fin', 'AN_nfe_nwe', 'AN_nfe_seu', 'AN_nfe_est', 'AN_nfe_onf']  # 'AN_nfe'
ac_columns = ['AC_amr', 'AC_asj', 'AC_afr', 'AC_eas', 'AC_fin', 'AC_nfe_nwe', 'AC_nfe_seu', 'AC_nfe_est', 'AC_nfe_onf']  # 'AC_nfe'

# Per-column results, each a frame with columns SYMBOL and the statistic.
median_results_an = {}
mode_results_an = {}
sums_ac = {}
# Kept for compatibility; nothing in this cell fills them.
ratios_mode = {}
ratios_median = {}

grouped_by_symbol = df.groupby('SYMBOL')

for column in an_columns:
    # Median AN per gene
    median_results_an[column] = grouped_by_symbol[column].median().reset_index()

    # Largest modal AN per gene, kept numeric. The earlier version joined the modes into
    # a string and parsed it back with int(), which fails for float columns ('123.0') and
    # for genes without any value (''), and made the cell impossible to re-run because
    # the frames were converted in place.
    mode_results_an[column] = grouped_by_symbol[column].agg(lambda x: x.mode().max()).reset_index()

for column in ac_columns:
    # Total AC per gene
    sums_ac[column] = grouped_by_symbol[column].sum().reset_index()

print('good')


def _ac_an_ratio_frames(an_stats):
    """Merge every summed-AC frame with its AN statistic and add the AC/AN ratio column."""
    frames = []
    for ac_col, ac_df in sums_ac.items():
        an_col = ac_col.replace("AC", "AN")
        merged = pd.merge(ac_df, an_stats[an_col], on='SYMBOL')
        merged[f'AC/AN_ratio_{ac_col}'] = merged[ac_col] / merged[an_col]
        frames.append(merged)
    return frames


# One frame per population: summed AC divided by the median AN ...
results_median = _ac_an_ratio_frames(median_results_an)
print('good')

# ... and summed AC divided by the (largest) modal AN.
results_mode = _ac_an_ratio_frames(mode_results_an)
print('good')

In [None]:
def _index_by_population(frames):
    """Map the population suffix of each frame's first 'AN_' column to that frame."""
    indexed = {}
    for frame in frames:
        for column in frame.columns:
            if column.startswith('AN_'):
                # Slice the prefix off. str.lstrip('AN_') strips any leading 'A', 'N' or
                # '_' characters rather than the prefix, and only worked here because
                # the population codes are lower-case.
                indexed[column[len('AN_'):]] = frame
                break
    return indexed


# Population code -> frame with summed AC, AN statistic and AC/AN ratio.
# The loop variable is local to the helper, so the global `df` is no longer overwritten.
df_dict_median = _index_by_population(results_median)
df_dict_mode = _index_by_population(results_mode)

In [None]:
# Display names for the population codes, used as x-axis tick labels in the plots below.
full_names_pop = {'afr':'African','amr':'Latino','asj':'Ashkenazi Jews', 'eas':'East Asian','fin':'Finnish','nfe_nwe':'North-Western European','nfe_seu':'Southern European','nfe_onf':'Other Non-Finnish European','nfe_est':'Estonian'} # 'nfe':'Non-Finnish European'

In [None]:
# Population code -> list of per-gene records with SYMBOL, AC, AN and AN-AC.
population_info = {}

for population, population_frame in df_dict_median.items():
    ac_col = f'AC_{population}'
    an_col = f'AN_{population}'

    # Restrict to the scraped ACMG genes
    acmg_rows = population_frame[population_frame['SYMBOL'].isin(acmg_genes)]

    # assign() returns a new frame, so the extra column is not written into a slice of
    # another frame (which raised pandas' SettingWithCopyWarning).
    population_df = acmg_rows[['SYMBOL', ac_col, an_col]].assign(
        **{f'AN-AC_{population}': acmg_rows[an_col] - acmg_rows[ac_col]}
    )

    population_info[population] = population_df.to_dict(orient='records')

In [None]:
# Gene symbol -> {(population_a, population_b): {'odds_ratio': ..., 'p_value': ...}}
fisher_results = {}

# Index every population's records by gene symbol. Two populations are then compared by
# symbol lookup instead of by list position, which silently depended on all populations
# listing the same genes in the same order.
records_by_population = {
    population: {record['SYMBOL']: record for record in records}
    for population, records in population_info.items()
}
# Built once, rather than rebuilding the full symbol list for every gene
symbols_with_data = set().union(*(records.keys() for records in records_by_population.values()))

for symbol in acmg_genes:
    if symbol not in symbols_with_data:
        continue  # no population has data for this gene

    symbol_results = {}

    for population_a, population_b in combinations(records_by_population, 2):
        record_a = records_by_population[population_a].get(symbol)
        record_b = records_by_population[population_b].get(symbol)
        if record_a is None or record_b is None:
            continue  # the gene is missing in one population: no 2x2 table can be built

        # Rows: populations; columns: AC and AN-AC.
        # NOTE(review): AN is a per-gene median and may be fractional, and AN-AC can become
        # negative when the summed AC exceeds it — confirm the inputs suit fisher_exact.
        contingency_table = [
            [record_a[f'AC_{population_a}'], record_a[f'AN-AC_{population_a}']],
            [record_b[f'AC_{population_b}'], record_b[f'AN-AC_{population_b}']],
        ]

        odds_ratio, p_value = fisher_exact(contingency_table)
        symbol_results[(population_a, population_b)] = {'odds_ratio': odds_ratio, 'p_value': p_value}

    fisher_results[symbol] = symbol_results

In [None]:
# Group name -> list of entries. In the text file, groups are separated by blank lines;
# the first non-blank line of a group is its name and the following lines are its members.
groups_dict = {}

with open('genes phenotypes groups.txt', 'r') as file:
    current_group = None
    for raw_line in file:
        entry = raw_line.strip()

        # A blank line closes the current group
        if not entry:
            current_group = None
            continue

        # First line after a blank line (or at the top of the file) names a new group
        if current_group is None:
            current_group = entry
            groups_dict[current_group] = []
            continue

        # Any other line belongs to the group that is currently open
        groups_dict[current_group].append(entry)

# Display the dictionary
print(groups_dict)

In [None]:
# Population -> {'AC_sums': {group: summed AC}, 'AN-AC_<population>': {group: median AN - summed AC}}
results_grouped_dict = {}

for population, df in df_dict_median.items():
    ac_col = f'AC_{population}'
    an_col = f'AN_{population}'

    # Rows of this population's frame that belong to each group of symbols
    rows_per_group = {
        group: df[df['SYMBOL'].isin(symbols)]
        for group, symbols in groups_dict.items()
    }

    # Summed AC per group
    ac_sums = {group: rows[ac_col].sum() for group, rows in rows_per_group.items()}

    # Median AN minus summed AC per group
    ac_minus_an_median = {
        group: rows[an_col].median() - rows[ac_col].sum()
        for group, rows in rows_per_group.items()
    }

    results_grouped_dict[population] = {'AC_sums': ac_sums, f'AN-AC_{population}': ac_minus_an_median}

# Display the results
print(results_grouped_dict)

In [None]:
# Group name -> {(population_a, population_b): {'odds_ratio': ..., 'p_value': ...}}
fisher_results_grouped = {}

# Every population holds the same group names, so take them from the first one
first_population = next(iter(results_grouped_dict))
group_names = list(results_grouped_dict[first_population]['AC_sums'])

for group_name in group_names:
    pair_results = {}
    fisher_results_grouped[group_name] = pair_results

    for population_pair in combinations(results_grouped_dict.keys(), 2):
        # One table row per population: [summed AC, AN-AC]
        contingency_table = [
            [
                results_grouped_dict[population]['AC_sums'][group_name],
                results_grouped_dict[population]['AN-AC_' + population][group_name],
            ]
            for population in population_pair
        ]

        odds_ratio, p_value = fisher_exact(contingency_table)
        pair_results[population_pair] = {'odds_ratio': odds_ratio, 'p_value': p_value}

# Display the Fisher's exact test results
print(fisher_results_grouped)

In [None]:
# One bar chart per gene: AC/AN ratio per population, with a horizontal marker for every
# population pair whose Fisher p-value is below 0.05. Figures are written as JPG files.
for symbol in acmg_genes:
    # Only genes for which Fisher results were computed are plotted
    if symbol in fisher_results.keys():
        # Sum the ratios over all populations to detect genes with nothing to show
        total_ac_an_ratio = 0
        for column, median_df in df_dict_median.items():
            pop = column
            symbol_median_df = median_df[median_df['SYMBOL'] == symbol]
            if not symbol_median_df.empty:
                total_ac_an_ratio += symbol_median_df[f'AC/AN_ratio_AC_{pop}'].iloc[0]

        # Skip plotting if total AC/AN ratio is zero
        if total_ac_an_ratio == 0:
            continue

        plt.figure(figsize=(15, 17))
        plt.title(f'Frequency of individuals with a variant present in {symbol} gene', fontsize=20)
        plt.xlabel('Populations', fontsize=16)
        plt.ylabel('Frequency', fontsize=16)

        # One bar per population, positioned by the population's order in df_dict_median
        for i, (column, median_df) in enumerate(df_dict_median.items()):
            pop = column
            symbol_median_df = median_df[median_df['SYMBOL'] == symbol]
            plt.bar(i, symbol_median_df['AC/AN_ratio_AC_' + pop], alpha=0.8, label=f'{column}')

        # Adding horizontal lines representing p-values between different population pairs
        for i, ((pop1, pop2), p_value_data) in enumerate(fisher_results[symbol].items()):
            if p_value_data['p_value'] < 0.05:
                p_value = p_value_data['p_value']
                # x positions of the two populations being compared
                x1 = list(df_dict_median.keys()).index(pop1)
                x2 = list(df_dict_median.keys()).index(pop2)
                # Current upper y-limit, read after everything drawn so far
                y = max(plt.ylim())
                plt.plot([x1, x2], [y, y], linewidth=2, color='black')  # Horizontal line
                # Values below 1e-4 are shown as '<0.0001'
                p_value_label = f'{p_value:.4f}' if p_value >= 1e-4 else f'<{0.0001}'
                plt.text((x1 + x2) / 2, y, f'p-value: {p_value_label}', ha='center', va='bottom', fontsize=12)

        plt.xticks(range(len(df_dict_median)), [full_names_pop[pop] for pop in df_dict_median.keys()], rotation=-45, fontsize=12)

        plt.tight_layout()  # Adjust layout to fit all elements
        # NOTE(review): the output directory must already exist — savefig does not create it
        plt.savefig(f'mane_plots_significance/freq_fisher/{symbol}.jpg')
        # Close the figure so figures do not pile up in memory across the loop
        plt.close()

In [None]:
# Group name -> {population: summed AC / median AN over the genes of that group}
disease_ratios = {}

for disease, symbols in groups_dict.items():
    disease_ratios[disease] = {}
    for population, data in df_dict_median.items():
        # Each population's frame holds only that population's AC/AN columns, so the rows
        # can be selected from it directly. This replaces concatenating all frames
        # symbol by symbol (quadratic, and it counted a symbol listed twice in a group
        # twice) and matches the isin()-based selection used for the grouped Fisher tests.
        population_subset = data[data['SYMBOL'].isin(symbols)]
        ac_sum = population_subset[f'AC_{population}'].sum()
        an_median = population_subset[f'AN_{population}'].median()

        # Guard against a zero denominator; a missing (NaN) median yields a NaN ratio
        if an_median != 0:
            ratio = ac_sum / an_median
        else:
            ratio = None
        disease_ratios[disease][population] = ratio

print(disease_ratios)

In [None]:
# One bar chart per gene group: AC/AN ratio per population, with a horizontal marker for
# every population pair whose grouped Fisher p-value is below 0.05.
for disease, ratios_per_population in disease_ratios.items():
    plt.figure(figsize=(15, 20))

    # Title and labels with font size adjustments
    plt.title(f'Frequency of individuals with a variant present in genes\nrelated to {disease}', fontsize=20, pad=20)
    plt.xlabel('Populations', fontsize=16)
    plt.ylabel('Frequency', fontsize=16)

    # One bar per population, positioned by the population's order in the ratio dict
    for i, (population, ratio) in enumerate(ratios_per_population.items()):
        plt.bar(i, ratio, alpha=0.8, label=f'{population}')

    # Adding horizontal lines representing p-values between different population pairs
    if disease in fisher_results_grouped:
        for i, ((pop1, pop2), p_value_data) in enumerate(fisher_results_grouped[disease].items()):
            if p_value_data['p_value'] < 0.05:
                p_value = p_value_data['p_value']
                # x positions of the two populations being compared
                x1 = list(ratios_per_population.keys()).index(pop1)
                x2 = list(ratios_per_population.keys()).index(pop2)
                # Current upper y-limit, read after everything drawn so far
                y = max(plt.ylim())
                plt.plot([x1, x2], [y, y], linewidth=2, color='black')  # Horizontal line
                # Values below 1e-4 are shown as '<0.0001'
                p_value_label = f'{p_value:.4f}' if p_value >= 1e-4 else f'<{0.0001}'
                plt.text((x1 + x2) / 2, y, f'p-value: {p_value_label}', ha='center', va='bottom', fontsize=12)

    plt.xticks(range(len(ratios_per_population)), [full_names_pop[pop] for pop in ratios_per_population.keys()], rotation=-45, fontsize=12)

    # Adjust layout to fit all elements
    plt.tight_layout()

    # NOTE(review): the output directory must already exist — savefig does not create it
    plt.savefig(f'mane_plots_significance/freq_fisher_grouped/{disease}.jpg')
    # Close the figure so figures do not pile up in memory across the loop
    plt.close()