In [None]:
'''
This remakes the clustered heatmap for the Mau genes. The categories are defined by the presence of MauA, MauE and MauG, regardless of
overlap, i.e. if a species has both MauA and MauE then it is counted as 1 for both.
'''

In [None]:
import pandas as pd
from IPython.display import display
import os
import re
import Bio
from Bio import Entrez
import sys, errno, re, json, ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import chi2_contingency

In [None]:
# Local NCBI taxonomy lookup table (modified from my data fetcher); it is used to
# add scientific names to my csv columns.
# The path has to be a raw string: in a plain literal the "\n" at the start of
# "\ncbi_2025..." is read as a newline escape, so the file would never be found.
# NOTE(review): read_pickle can execute code embedded in the file - only load
# pickles that were produced locally.
tax_LOT = pd.read_pickle(r"C:\PATH\ncbi_2025_taxonomy_table.pkl")
print(tax_LOT)
# Contact address handed to Entrez by the fetch helpers.
email = 'example@gmail.com'




def fetch_TAXID_data(tax_id, email, retry_limit=3, retry_delay=10):
    """Fetch the scientific name for one taxonomy ID through ``Entrez.efetch``.

    Parameters
    ----------
    tax_id : int or str
        Taxonomy ID; it is converted with ``str()`` before the request.
    email : str
        Assigned to ``Entrez.email`` before any request is made.
    retry_limit : int, optional
        Maximum number of attempts (default 3).
    retry_delay : float, optional
        Seconds to sleep after a failed attempt (default 10).

    Returns
    -------
    The ``ScientificName`` of the last record in the response, or ``None``
    when every attempt failed or the response contained no records.
    """
    Entrez.email = email
    # Bound up front so that a total failure (or an empty response) returns
    # None instead of raising UnboundLocalError at the return statement.
    name = None

    for _ in range(retry_limit):
        handle = None  # reset per attempt so a stale handle is never closed twice
        try:
            handle = Entrez.efetch(db="taxonomy", id=str(tax_id), retmode="xml")
            records = Entrez.read(handle)
            for rec in records:
                name = rec.get("ScientificName")
            return name
        except Exception as e:
            print(f"HTTP error: {str(e)} - Sleeping {retry_delay}s and retrying for ID {tax_id}")
            sleep(retry_delay)
        finally:
            if handle is not None:
                handle.close()

    print(f"Failed after {retry_limit} retries for ID {tax_id}")
    return name



def fetch_LINEAGE_NAME_data(lineage_list, email):
    """Build a ``{name: tax_id}`` dict for every ID in ``lineage_list``.

    Each ID is first looked up in the module-level ``tax_LOT`` table
    (``tax_id`` column -> ``name_txt`` column). IDs without a row there are
    requested through ``fetch_TAXID_data``. Any exception raised while
    handling a single ID is printed and that ID is skipped.
    """
    Entrez.email = email
    name_to_id = {}

    for tax_id in lineage_list:
        try:
            local_hits = tax_LOT.loc[tax_LOT['tax_id'] == tax_id, 'name_txt']

            if local_hits.empty:
                # Not in the local table: fall back to the remote lookup.
                print(f'No entry found for Tax ID: {tax_id}, fetching from NCBI...')
                remote_name = fetch_TAXID_data(tax_id, email)
                if remote_name and remote_name != "No NAME available":
                    name_to_id[remote_name] = tax_id
                else:
                    print(f'Failed to fetch data for Tax ID: {tax_id} from NCBI.')
                continue

            # Only the first matching row is used.
            local_name = local_hits.iloc[0]
            if local_name:
                name_to_id[local_name] = tax_id
                print(f'NAME: {local_name}, Tax ID: {tax_id}')
            else:
                print(f'No rank available for Tax ID: {tax_id}')
        except Exception as err:
            print(f'Error processing Tax ID {tax_id}: {str(err)}')

    return name_to_id

# Smoke test: run the lookup on a handful of tax IDs and print the resulting
# {name: tax_id} dict (expected: 0 fails).
# NOTE(review): these IDs are strings, while the phylum loop further down passes
# ints - confirm which dtype tax_LOT['tax_id'] holds.
list_tax = ['1', '0', '2624677', '1644055', '2157', '1236']
# This rebinds the module-level `email` assigned earlier in this cell; later
# cells that pass `email` will use this value.
email = 'your_email@example.com'  # Replace with your actual email
print(fetch_LINEAGE_NAME_data(list_tax, email))

In [None]:
# Load the false-positive-filtered per-species table.
# Later cells read its 'species', 'phylum', 'IPR009908', 'IPR026259' and
# 'IPR036560' columns.
species_summary_df = pd.read_excel(r"C:\PATH\5_Filtered_FBDS_IPRcounts_byspecies_MAUA_names.xlsx")

In [None]:
# Missing values become 0 in every column; a phylum of 0 is later labelled
# 'no_phylum' when the scientific names are looked up.
species_summary_df2 = species_summary_df.fillna(0)
print(species_summary_df2)

In [None]:

# Row count of the species column, plus one boolean mask per marker domain
# flagging the rows whose count for that domain is above zero.
total_species = species_summary_df2['species'].count()

IPR009908_containing = species_summary_df2['IPR009908'].gt(0)
IPR026259_containing = species_summary_df2['IPR026259'].gt(0)
IPR036560_containing = species_summary_df2['IPR036560'].gt(0)



In [None]:
print(species_summary_df2)

# Number of distinct species per phylum: overall first, then restricted to
# the rows selected by each marker mask.
phylum_total = species_summary_df2.groupby('phylum')['species'].nunique()
phylum_ipr009908 = species_summary_df2[IPR009908_containing].groupby('phylum')['species'].nunique()
phylum_ipr026259 = species_summary_df2[IPR026259_containing].groupby('phylum')['species'].nunique()
phylum_ipr036560 = species_summary_df2[IPR036560_containing].groupby('phylum')['species'].nunique()

# One row per phylum, one column per count; the Series are aligned on phylum.
phylum_summary_df = pd.DataFrame(
    {
        'total_count': phylum_total,
        'IPR009908': phylum_ipr009908,
        'IPR026259': phylum_ipr026259,
        'IPR036560': phylum_ipr036560,
    }
)

# Move phylum out of the index into a regular column.
phylum_summary_df = phylum_summary_df.reset_index()
# A phylum absent from a marker subset has no count there; record it as 0.
phylum_summary_df2 = phylum_summary_df.fillna(0)

# Show the per-phylum summary.
display(phylum_summary_df2)

In [None]:
# Percentage of each phylum's species that carry each marker domain:
# (marker count / total_count) * 100, computed on whole columns instead of a
# row-by-row loop with chained indexing.
phylum_summary_df3 = phylum_summary_df2.copy()

# The *_list names are kept as plain lists, as before.
IPR009908_list = ((phylum_summary_df3['IPR009908'] / phylum_summary_df3['total_count']) * 100).tolist()
IPR036560_list = ((phylum_summary_df3['IPR036560'] / phylum_summary_df3['total_count']) * 100).tolist()
IPR026259_list = ((phylum_summary_df3['IPR026259'] / phylum_summary_df3['total_count']) * 100).tolist()

phylum_summary_df3['perc_IPR009908'] = IPR009908_list
phylum_summary_df3['perc_IPR026259'] = IPR026259_list
phylum_summary_df3['perc_IPR036560'] = IPR036560_list

display(phylum_summary_df3)

In [None]:
# Replace each phylum tax ID with its scientific name.
# This section is prone to failing because of NCBI HTTP errors, so every row
# gets two attempts (for my data set it pulled all 167 rows, but it took a while).
# A row that still has no name is recorded as None instead of being skipped:
# `names` must keep one entry per row of phylum_summary_df4 so it can be attached
# as a column afterwards. Manually fill any None gaps left over.
phylum_summary_df4 = phylum_summary_df3.copy()  # independent copy, so later edits do not leak into phylum_summary_df3
names = []
for ind in phylum_summary_df4.index:
    tax_id = phylum_summary_df4.loc[ind, 'phylum']
    if tax_id == 0:
        # 0 is the fill value that stands in for a missing phylum.
        names.append('no_phylum')
        continue

    name = None
    for attempt in (1, 2):
        try:
            list_ind = [int(tax_id)]  # fetch_LINEAGE_NAME_data takes a list of IDs
            print(list_ind)
            name_df = fetch_LINEAGE_NAME_data(list_ind, email)
            temp_name_ls = list(name_df.keys())
            name = temp_name_ls[0]  # IndexError when nothing was found -> treated as a failed attempt
            break
        except Exception as e:
            print(f'Attempt {attempt} failed for phylum {tax_id!r}: {e!r}')

    print(name)
    names.append(name)

print(names)

In [None]:
# Attach the looked-up names as a new 'scientific_names' column.
# `names` has to hold exactly one entry per row of phylum_summary_df4.
phylum_summary_df4['scientific_names']=names
display(phylum_summary_df4)

In [None]:
# Checkpoint: the commented-out line below writes the named table to disk.
#phylum_summary_df4.to_excel("C:\PATH\MAUgenes_3_cat_3markers_percents_names.xlsx")
# Reload the saved table; this replaces the in-memory phylum_summary_df4.
phylum_summary_df4 = pd.read_excel(r"C:\PATH\MAUgenes_3_cat_3markers_percents_names.xlsx")
# 'Unnamed: 0' is presumably the index column written by to_excel - TODO confirm.
phylum_summary_df4 = phylum_summary_df4.drop('Unnamed: 0', axis=1) # do this if you're just loading the saved dataframe again

In [None]:
# Keep only the three percentage columns plus the scientific names (all phyla).
phylum_percentages=phylum_summary_df4[['perc_IPR009908', 'perc_IPR026259', 'perc_IPR036560', 'scientific_names']].copy()

In [None]:
# Drop every phylum whose scientific name contains "candidatus"
# (case-insensitive; rows without a name are kept).
phylum_no_candidatus = phylum_summary_df4.loc[
    ~phylum_summary_df4['scientific_names'].str.contains('candidatus', case=False, na=False)
].copy()
display(phylum_no_candidatus)

In [None]:
# Same column selection as phylum_percentages, taken from the table without "candidatus" phyla.
phylum_percentages2=phylum_no_candidatus[['perc_IPR009908', 'perc_IPR026259', 'perc_IPR036560', 'scientific_names']].copy()

In [None]:
# Keep only the concrete phyla: remove names containing "candidatus",
# "no_phylum" or "candidate" (case-insensitive), index the result by
# scientific name, then keep phyla with a total_count of at least 10.
phylum_no_candidatus2 = phylum_no_candidatus.loc[
    ~phylum_no_candidatus['scientific_names'].str.contains('candidatus', case=False, na=False)
]
phylum_no_candidatus3 = phylum_no_candidatus2.loc[
    ~phylum_no_candidatus2['scientific_names'].str.contains('no_phylum', case=False, na=False)
]
phylum_no_candidatus4 = phylum_no_candidatus3.loc[
    ~phylum_no_candidatus3['scientific_names'].str.contains('candidate', case=False, na=False)
].set_index('scientific_names')
phylum_no_candidatus5 = phylum_no_candidatus4.copy()
phylum_no_candidatus6 = phylum_no_candidatus5.loc[phylum_no_candidatus5['total_count'].ge(10)]

In [None]:
# Checkpoint: the commented-out line below writes the filtered table to disk.
#phylum_no_candidatus6.to_excel("C:\PATH\MAUgenes_3cat_3mark_perc_no_can_10more_names.xlsx")
# Reload the saved table; this replaces the in-memory phylum_no_candidatus6.
phylum_no_candidatus6 = pd.read_excel(r"C:\PATH\MAUgenes_3cat_3mark_perc_no_can_10more_names.xlsx")
# Restore scientific_names as the index.
phylum_no_candidatus6 = phylum_no_candidatus6.set_index('scientific_names') # do this if you're loading the file in

In [None]:
# Inspect the table (indexed by scientific name) that feeds the clustermap.
display(phylum_no_candidatus6)


In [None]:
# Clustered heatmap of the three percentage columns. Rows are clustered;
# the column order given here is kept (col_cluster=False).
phylum_no_candidatus7 = phylum_no_candidatus6.loc[
    :, ['perc_IPR036560', 'perc_IPR009908', 'perc_IPR026259']
].copy()
g = sns.clustermap(
    phylum_no_candidatus7,
    col_cluster=False,
    cmap='rocket_r',
    figsize=(7, 15),
)


In [None]:
# Write the clustermap to disk at the figure's own dpi, with a tight bounding box.
g.savefig(r'C:\PATH\MauGenes_3.png', dpi='figure',  bbox_inches='tight' )
