# Prepare a Report for an individual patient

> Environment setup
>> Install anaconda and follow the steps in Anaconda Prompt:
>> - `conda create --name indi_report --file requirements.txt`
>> - `conda activate indi_report`
>> - `python -m ipykernel install --user --name=indi_report`

> Reference DB
>> Execute IMNGS2 / NGSToolkit on the reference samples: KORA-Healthy, KORA-Obese, and Student cohort -> Generate zOTU table
>>
>> Execute Rhea:
>> - Normalization, Alpha diverisity, and Taxonomy binning on KORA-Healthy, KORA-Obese, and Students cohort
>> - Beta diversity on KORA-Healthy and Students cohort

> Steps to follow,
> 1) Run IMNGS2 / NGSToolkit on the samples  -> Generate zOTU table
> 2) Run Rhea -- Alpha diversity, Taxonomic binning
> 3)   Get following files and move them into the input directory:
>> - alpha-diversity.tab
>> - 1.Phyla.all.tab
>> - 5.Genera.all.tab
> 4) In addtion, also store the metafile in the input directory
> 5) Execute the following codes

# Importing

In [2]:
# %matplotlib inline      # show inline plot
import os
import pandas as pd
# import random
import numpy as np 
import matplotlib
matplotlib.use('Agg')   # restrict showing inline plots 
import matplotlib.pyplot as plt 
from scipy.stats import norm
# import statistics 
import seaborn as sns
import os
import pickle
from fpdf import FPDF
from datetime import date
from datetime import datetime
from fpdf.fonts import FontFace
from PIL import Image

# Set inputs [Modify]

In [3]:
# set input directory
input_dir = 'test_data1'

# set input filenames
alpha_div_filename  = 'alpha-diversity.tab'
phylum_filename = '1.Phyla.all.tab'
genera_filename = '5.Genera.all.tab'
# this is a tab-separated file with six columns -- "OrderNumber", "SampleName", "Age", "SampleType", "SamplingDate", "Screening-ID"
# OrderNumber: 
# SampleName: Name of the samples
# Age: Number
# SampleType: Usually "Stool"
# SamplingDate: dd.mm.yy format
# Screening-ID: 
info_filename = 'info_HH.tab'

## Check ref database

In [4]:
# input files for the samples
alpha_div = os.path.join(input_dir,alpha_div_filename)
tax_phyla = os.path.join(input_dir,phylum_filename)
tax_phyla = os.path.join(input_dir,genera_filename)
info_file = os.path.join(input_dir,info_filename)

# this is a tab-separated file with two columns -- "former_name" and "Genus"
# this file contains mapping between unknown GOTUs with a name
# Don't change the name ('gotu_names.tab')
gotu_names = os.path.join(input_dir,'gotu_names.tab')

# Set reference directory
ref_db = 'ref_db'
ref_alpha = os.path.join(ref_db, 'alpha-diversity.tab')
ref_phyla = os.path.join(ref_db, '1.Phyla.all.tab')
ref_genera = os.path.join(ref_db, '5.Genera.all.tab')
ref_func = os.path.join(ref_db, 'genera_func_2_languages_2024-07-11.txt')      # Save the tab file in UTF-16 unicode
ref_clade = os.path.join(ref_db,'clade.tab')
df_alpha = pd.read_csv(ref_alpha,sep='\t').set_index('Unnamed: 0')
df_phyla = pd.read_csv(ref_phyla,sep='\t').set_index('Unnamed: 0')
df_genera = pd.read_csv(ref_genera,sep='\t').set_index('Unnamed: 0')
assert set(df_alpha.index.to_list()) == set(df_phyla.columns.to_list()), 'Sample mismatch found in alpha diversity and phyla'
assert set(df_alpha.index.to_list()) == set(df_genera.columns.to_list()), 'Sample mismatch found in alpha diversity and genera'
assert set(df_phyla.columns.to_list()) == set(df_genera.columns.to_list()), 'Sample mismatch found in phyla and genera'
kora_sample = [c for c in df_alpha.index.to_list() if c.startswith('KORA')]
std_sample = [c for c in df_alpha.index.to_list() if c.startswith('std')]
print('No. of KORA samples: {}\nNo. of Student samples: {}'.format(len(kora_sample), len(std_sample)))


No. of KORA samples: 1562
No. of Student samples: 295


## Consider the samples present in reference dataset


In [6]:
# KORA metadata: list of samples from each category
kora_metadata =os.path.join('KORA_metadata','gesamt_2032020.xlsx')
df_metadata = pd.read_excel(kora_metadata,sheet_name='Analysiert',engine='openpyxl').set_index('Aritra_sampleID')
diabet = df_metadata[df_metadata['u3tglukfasta'] >= 125].index.to_list()
obese = df_metadata[df_metadata['u3tbmi'] >= 25].index.to_list()
cancer = df_metadata[df_metadata['lca_n_sf14'] != 0].index.to_list()
healthy = set(df_metadata.index.to_list()).difference(set(diabet+obese+cancer))

# KORA diabet 
kora_diabet = list(set(kora_sample).intersection(set(diabet)))
print('No. of diabetic: {}'.format(len(kora_diabet)))
# KORA obese
kora_obese = list(set(kora_sample).intersection(set(obese)))
print('No. of obese: {}'.format(len(kora_obese)))
# KORA obese
kora_cancer = list(set(kora_sample).intersection(set(cancer)))
print('No. of cancer: {}'.format(len(kora_cancer)))
# KORA healthy
kora_healthy = list(set(kora_sample).intersection(set(healthy)))
print('No. of healthy: {}'.format(len(kora_healthy)))

# check if all the current samples are present
df_current_sample = pd.read_csv(alpha_div,sep='\t')
df_current_sample['Unnamed: 0'] = df_current_sample['Unnamed: 0'].str.replace('-withoutSpikes','')
df_current_sample = df_current_sample.set_index('Unnamed: 0')
current_sample = [c for c in df_current_sample.index if not c.startswith('KORA') and not c.startswith('std')]
df_sampleinfo = pd.read_csv(info_file,sep='\t',keep_default_na=False)
# assert set(current_sample) == set(df_sampleinfo.SampleName.to_list()), 'Mismatch found in the samples provided with the metadata!'
assert set(df_sampleinfo.SampleName.to_list()).issubset(set(current_sample)), 'Mismatch found in the samples provided with the metadata!'
current_sample = df_sampleinfo.SampleName.to_list()
print('No. of samples to analyze: {}'.format(len(current_sample)))
print('Name of the samples: {}'.format(current_sample))
print('healthyKORA + Student: {}'.format(len(kora_healthy+std_sample)))

No. of diabetic: 123
No. of obese: 1126
No. of cancer: 128
No. of healthy: 436
No. of samples to analyze: 5
Name of the samples: ['305-5-16641', '304-4-78826', '302-2-89334', '306-6-83608', '301-1-59020']
healthyKORA + Student: 731


# Generate outputs

## Merge Alpha diversity and Taxonomy binning of Current samples with the reference

In [8]:
import functions
alpha_div, tax_phyla, tax_genera = functions.merge_with_ref(ref_alpha,ref_phyla,ref_genera,input_dir)
print('Merged alpha diversity: {}'.format(alpha_div))
print('Merged phyla: {}'.format(tax_phyla))
print('Merged genera: {}'.format(tax_genera))

FileNotFoundError: [Errno 2] No such file or directory: 'test_data1\\Phyla.sample.tab'

## Plot Alpha diversity

In [6]:
import functions

for sample in current_sample:            
    quartile = functions.alpha_diversity(input_dir, sample, alpha_div, [('std',std_sample),('healthy',kora_healthy),('obese',kora_obese)],'Richness')
    with open(os.path.join(input_dir,sample,'report_EN.txt'),'w') as fp_en, open(os.path.join(input_dir,sample,'report_DE.txt'),'w') as fp_de:
        for c, p in quartile:
            fp_en.write(c+':'+p+'\n')
            p_de = 'unteres Quartil' if p=='lower quartile' else 'höheres Quartil' if p=='upper quartile' else 'mittleres Quartil'
            fp_de.write(c+':'+p_de+'\n')
    print(sample)
    # break

305-5-16641
304-4-78826
302-2-89334
306-6-83608
301-1-59020


## Prepare iTOL annotation file 
> Next follow the steps below,
> 1. Upload `individual_patient_report.nwk`
> 2. Upload iTOL annotation files
> 3. Change settings:
>> - Basic: Labels --> Hide
>> - Advanced: Tree scale box --> Hide
> 4. Export: 
>> - Format: png
>> - Resolution: 200 dpi
>> - Export area: Full screen
>> - File name: `sampletree`

In [7]:
import functions

# df_sampleinfo = pd.read_csv(info_file,sep='\t')
df_itol = df_sampleinfo.set_index('SampleName')
abun_table = tax_phyla
for sname in df_itol.index:
    print(sname)
    samplelist = kora_healthy + std_sample + [sname]
    output_file = os.path.join(input_dir,sname,'itol1_branch.txt')
    with open(output_file, 'w') as fp:
        fp.write('TREE_COLORS\nSEPARATOR COMMA\nDATA\nOROOT,clade,#000000,normal,3\n')
    output_file = os.path.join(input_dir,sname,'itol2_clade.txt')
    functions.clade_strip(sname, ref_clade, output_file)
    output_file = os.path.join(input_dir,sname,'itol3_phylum.txt')
    functions.stacked_bar_itol(abun_table, sname, samplelist, output_file)
    output_file = os.path.join(input_dir,sname,'itol4_marksample')
    ref_sample = kora_healthy + std_sample
    similar_sample = functions.mark_sample(sname,abun_table,ref_sample,output_file)
    clade_data = pd.read_table(ref_clade, sep='\t')
    clade = clade_data[clade_data['sample']==similar_sample]['clade'].to_list()[0]
    with open(os.path.join(input_dir,sname,'report_EN.txt'),'a') as fp_en, open(os.path.join(input_dir,sname,'report_DE.txt'),'a') as fp_de:
        fp_en.write('cluster:'+str(clade)+'\n')
        fp_de.write('cluster:'+str(clade)+'\n')
        percentage = 11 if clade==1 else 39 if clade==2 else 12 if clade==3 else 38
        fp_en.write('percentage:'+str(percentage)+'\n')
        fp_de.write('percentage:'+str(percentage)+'\n')
        distinct_genera = 'Prevotella-9' if clade==1 else 'Bacteroides' if clade==2 else 'Ruminococcaceae_UGC-002' if clade==3 else 'Ruminococcus'
        fp_en.write('distinguish:'+distinct_genera+'\n')
        fp_de.write('distinguish:'+distinct_genera+'\n')

    # break


305-5-16641
304-4-78826
302-2-89334
306-6-83608
301-1-59020


## Phylum barplot

In [8]:
import functions

abun_table = tax_phyla
abun_data = pd.read_table(abun_table, sep='\t').set_index('Unnamed: 0')
df_itol = df_sampleinfo.set_index('SampleName')
colors = {'p__Actinobacteriota':'#f5e4ea','p__Bacteroidota':'#94ac7c','p__Firmicutes':'#847cb6','p__Fusobacteriota':'#a28145',
          'p__Proteobacteria':'#e9c334','p__Verrucomicrobiota':'#7b98d1'}
for sname in df_itol.index:
    print(sname)
    hlthy2 = list(set(abun_data.columns.to_list()).intersection(set(healthy)))
    abun_data['avg_old'] = abun_data[hlthy2].mean(axis=1)
    abun_data['avg_young'] = abun_data[std_sample].mean(axis=1)
    df = abun_data[[sname,'avg_young','avg_old']]
    df_barplot = df.loc[list(colors.keys()),:]
    df_barplot = df_barplot.T
    # Color
    clr = [colors[i] if i in colors.keys() else '#bcbcbc' for i in df_barplot.columns]
    # remove phylum tag from the names
    df_barplot.columns = df_barplot.columns.str.replace("p__", "")
    # rename phylum with preferred name
    df_barplot = df_barplot.rename(columns={'Firmicutes':'Bacillota'})
    df_barplot = df_barplot.reindex(sorted(df_barplot.columns), axis=1)
    df_barplot['Others'] = df.loc[list(set(df.index.to_list()).difference(set(list(colors.keys())))),:].sum(axis='index').to_list()
    clr = clr+['#bcbcbc']
    xticks = ['Sample','Healthy young','Healthy old']
    ylabel = 'Relative abundance (%)'
    figfile = os.path.join(input_dir, sname,'phylum_EN.jpg')
    functions.stacked_bar_phylum(df_barplot,clr,xticks,ylabel,figfile)
    df_barplot = df_barplot.rename(columns={'Others':'Andere'})
    xticks = ['Probe','Gesund, jung','Gesund, alt']
    ylabel = 'Relative Abundanz (%)'
    figfile = os.path.join(input_dir, sname,'phylum_DE.jpg')
    functions.stacked_bar_phylum(df_barplot,clr,xticks,ylabel,figfile)

    # break

305-5-16641
304-4-78826
302-2-89334
306-6-83608
301-1-59020


## Boxplot: Bacillota/Bacteroidota

In [9]:
import functions

abun_table = tax_phyla
abun_data = pd.read_table(abun_table, sep='\t').set_index('Unnamed: 0')
df_itol = df_sampleinfo.set_index('SampleName')
for sname in df_itol.index:
    print(sname)
    hlthy2 = list(set(abun_data.columns.to_list()).intersection(set(healthy)))
    abun_data['avg_old'] = abun_data[hlthy2].mean(axis=1)
    abun_data['avg_young'] = abun_data[std_sample].mean(axis=1)
    
    fb_list, cat_class, quartile = [], [], []
    for cat, cat_sample in [('healthy_young',std_sample),('healthy_old',hlthy2)]:
        cat_fb = abun_data[cat_sample].T['p__Firmicutes']/abun_data[cat_sample].T['p__Bacteroidota'].to_list()
        fb_list.extend(cat_fb)
        cat_class.extend([cat]*len(cat_fb))
    sample_f_by_b = abun_data[sname].T['p__Firmicutes']/abun_data[sname].T['p__Bacteroidota']
    df_fb = pd.DataFrame(zip(fb_list,cat_class), columns=['f_by_b','class'])
    figfile = os.path.join(input_dir, sname,'f_b_ratio_EN.jpg')
    functions.box_plot(df_fb,sample_f_by_b,['Healthy young','Healthy old'],'Sample',figfile)
    figfile = os.path.join(input_dir, sname,'f_b_ratio_DE.jpg')
    functions.box_plot(df_fb,sample_f_by_b,['Gesund, jung','Gesund, alt'],'Probe',figfile)
    # break



305-5-16641
304-4-78826
302-2-89334
306-6-83608
301-1-59020


## Rename genus

In [10]:
rename_genus = {'LachnospiraceaeNK4A136group':'NK4A136 in Lachnospiraceae',
                'ChristensenellaceaeR-7group':'R-7 in Christensenellaceae',
                'NK4A214group':'NK4A214 in Ruminococcaceae',
                'UCG-002':'UGC-002 in Ruminococcaceae',
                'Eubacteriumeligensgroup':'Eubacterium-eligens-group',
                'Eubacteriumsiraeumgroup':'Eubacterium-siraeum-group',
                'PrevotellaceaeNK3B31group':'NK3B31 in Prevotellaceae',
                'UCG-005':'UCG-005 in Ruminococcaceae',
                'Eubacteriumhalliigroup':'Eubacterium-hallii-group',
                'LachnospiraceaeND3007group':'ND3007 in Lachnospiraceae',
                'Ruminococcusgnavusgroup':'Ruminococcus-gnavus-group',
                'Bacteroides pectinophilus group':'Bacteroides-pectinophilus-group',
                'Christensenellaceae R-7 group':'R-7 in Christensenellaceae ',
                'Lachnospiraceae NK4A136 group':'NK4A214 in Ruminococcaceae',
                'Rikenellaceae RC9 gut group':'Rikenellaceae-RC9-gut-group',
                'unknown_ Clostridia UCG-014':'Clostridia UCG-014',
                'unknown_ Clostridia vadinBB60 group':'Clostridia-vadinBB60-group',
                'unknown_ Desulfovibrionaceae':'Taxon in Desulfovibrionaceae',
                'unknown_ Eubacterium coprostanoligenes group':'Eubacterium-coprostanoligenes-group',
                'unknown_ Gastranaerophilales':'Taxon in Gastranaerophilales',
                'unknown_ Lachnospiraceae':'Taxon in  Lachnospiraceae',
                'unknown_ Muribaculaceae':'Taxon in  Muribaculaceae',
                'unknown_ Rhodospirillales':'Taxon in  Rhodospirillales',
                'unknown_ Ruminococcaceae':'Taxon in Ruminococcaceae',
                'Lachnospiraceae UCG-001':'UCG-001 in Lachnospiraceae',
                'LachnospiraceaeUCG-001':'UCG-001 in Lachnospiraceae',
                'UCG-003':'UCG-003 in Lachnospiraceae',
                'Ruminococcustorquesgroup':'Ruminococcus torques group',
                }

## Genera barplot (Top 10)

In [11]:
import functions

abun_table = tax_genera
abun_data = pd.read_table(abun_table, sep='\t').set_index('Unnamed: 0')
df_itol = df_sampleinfo.set_index('SampleName')
colors17= ['#3eb489','#ff6ec7','#ffd12b','#03324a','#16621c','#5f2e4c','#f9584b','#596fff','#81dd4d','#e6b710','#ff8980','#00FFFD','#b5db52','#FF6900','#c90076','#2ffd51','#ff8933']

# Unknown GOTUs
if os.path.isfile(gotu_names):
    df_gotu = pd.read_csv(gotu_names,sep='\t',index_col='former_name')
    gotu_dict = df_gotu.to_dict()['Genus']
else:
    gotu_dict = {}
    
for sname in df_itol.index:
    # KORA-Healty
    print(sname)
    hlthy2 = list(set(abun_data.columns.to_list()).intersection(set(healthy)))
    abun_data['avg_old'] = abun_data[hlthy2].mean(axis=1)
    # Student
    abun_data['avg_young'] = abun_data[std_sample].mean(axis=1)
    
    df_avg = abun_data[[sname,'avg_young','avg_old']]
    # print(df_avg)
    sample_top_genus = df_avg.nlargest(1,sname).index.to_list()[0].replace('g__','')
    with open(os.path.join(input_dir,sname,'report_EN.txt'),'a') as fp_en, open(os.path.join(input_dir,sname,'report_DE.txt'),'a') as fp_de:
        fp_en.write('topgenus:'+sample_top_genus+'\n')
        fp_de.write('topgenus:'+sample_top_genus+'\n')
    top_old = df_avg.nlargest(10,'avg_old').index.to_list()
    top_young = df_avg.nlargest(10,'avg_young').index.to_list()
    top_sample = df_avg.nlargest(10,sname).index.to_list()
    top = set(top_old).union(set(top_young)).union(set(top_sample))
    df_barplot = df_avg.loc[list(top)].sort_values('avg_young', ascending=False)
    # df_barplot.loc['others'] = df_avg.loc[list(set(df_avg.index.to_list()).difference(top)),:].sum(axis='index').to_list()

    df_barplot = df_barplot.T
    # remove phylum tag from the names
    df_barplot.columns = df_barplot.columns.str.replace("g__", "")
    # rename genus with preferred name
    df_barplot = df_barplot.rename(columns=rename_genus)
    df_barplot = df_barplot.reindex(sorted(df_barplot.columns), axis=1)
    # print(df_barplot.T.sort_values(sname,axis=0))
    df_barplot = df_barplot.T.sort_values(sname,axis=0,ascending=False).T
    index_genera = df_barplot.columns.to_list() 
    df_barplot['Others'] = df_avg.loc[list(set(df_avg.index.to_list()).difference(top)),:].sum(axis='index').to_list()
    # rename unknown genus with the names present in "gotu_name.tab"
    df_barplot = df_barplot.rename(columns=gotu_dict)
    # plt.subplots(figsize=(2, 10))
    with open(os.path.join(input_dir, sname,'topGenera.pkl'), 'wb') as f:
        pickle.dump(df_barplot.columns.to_list(), f)
    figfile = os.path.join(input_dir, sname,'genera_EN.jpg')
    functions.stacked_bar_genus(df_barplot,top,colors17,['Sample','Healthy young', 'Healthy old'],'Relative abundance (%)',figfile)
    df_barplot = df_barplot.rename(columns={'Others':'Andere'})
    figfile = os.path.join(input_dir, sname,'genera_DE.jpg')
    functions.stacked_bar_genus(df_barplot,top,colors17,['Probe','Gesund, jung','Gesund, alt'],'Relative abundanz (%)',figfile)

    # break



305-5-16641
304-4-78826
302-2-89334
306-6-83608
301-1-59020


# Write into PDF file

In [12]:
class PDF(FPDF):
    def __init__(self, ordernum,samplename):
        super(PDF, self).__init__()
        self.ordernum = ordernum
        self.samplename = samplename
    def footer(self):
        # Go to 1.5 cm from bottom
        self.set_y(-20)
        # Select Arial italic 8
        self.set_font('Arial', 'I', 8)
        # Text color in gray
        self.set_text_color(128)
        # Print right order-number
        self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')


today = date.today()
analysis_date = today.strftime("%d.%m.%Y")
missingGenera = []
sample_dir = input_dir
for idx in df_sampleinfo.index:
    #---------------------#
    #-- English Version --#
    #---------------------#
    pdf = PDF(df_sampleinfo.loc[idx,'OrderNumber'],df_sampleinfo.loc[idx,'SampleName'])
    pdf.alias_nb_pages()
    pdf.add_page()
    pdf.set_margins(left=20,right=20,top=10)
    # Add title
    pdf.set_font('Helvetica', 'B', 16)
    # Title
    pdf.cell(170, 10, 'Microbiome Analysis for Sample '+str(df_sampleinfo.loc[idx,'Screening-ID']), border =0, align='C')
    pdf.ln(9)
    # Sample description
    pdf.set_font('Helvetica', '', 11)
    pdf.rect(19,20,172,14)
    pdf.cell(70, 10, 'Age (on collection date): '+ str(df_sampleinfo.loc[idx,'Age']), align='L', border=0)
    try:
        collect_date = str(datetime.strptime(df_sampleinfo.loc[idx,'SamplingDate'], '%d.%m.%y').date().strftime('%d.%m.%Y'))
    except ValueError:
        collect_date = df_sampleinfo.loc[idx,'SamplingDate']
    pdf.cell(100, 10, 'Collection date: '+ collect_date, align='R', border=0)
    pdf.ln(6)
    pdf.cell(70, 10, 'Sample type: '+df_sampleinfo.loc[idx,'SampleType'].capitalize(), align='L', border=0)
    pdf.cell(100, 10, 'Analysis date: '+ str(analysis_date), align='R', border=0)
    pdf.ln(11)

    # Alpha diversity: Boxplot
    pdf.set_font('Helvetica', 'B', 11)
    pdf.rect(19,37,101,40)
    pdf.cell(99, 10, 'Richness analysis', align='L', border=0)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'alpha-diversity_EN.jpg'), x=22, y=44, w=96)
    # Conclusion
    conclu_dict = {}
    sname = df_sampleinfo.loc[idx,'SampleName']
    with open(os.path.join(input_dir,sname,'report_EN.txt'),'r') as fp:
        lines = fp.readlines()
        for line in lines:
            conclu_dict.update({line.strip().split(':')[0]:line.strip().split(':')[1]})
    pdf.rect(123,37,68,40)
    pdf.cell(5)
    pdf.cell(66, 10, 'Conclusion', align='L', border=0)
    pdf.set_font('Helvetica', '', 9)
    pdf.ln(10)
    pdf.cell(105)
    if isinstance(df_sampleinfo.loc[idx,'Age'],str):
        pdf.multi_cell(w=64,text='Number of taxa: '+conclu_dict['std']+' of the "Healthy young" cohort (age < 40)',align='J') 
    else:
        if df_sampleinfo.loc[idx,'Age'] < 40 or isinstance(df_sampleinfo.loc[idx,'Age'],str):
            pdf.multi_cell(w=64,text='Number of taxa: '+conclu_dict['std']+' of the "Healthy young" cohort (age < 40)',align='J') 
        else:
            pdf.multi_cell(w=64,text='Number of taxa: '+conclu_dict['healthy']+' of the "Healthy old" cohort (age > 40)',align='J')
    pdf.ln(3)
    pdf.cell(105)
    x, y = pdf.x, pdf.y
    pdf.multi_cell(w=64,
                   text='The sample is in Cluster '+conclu_dict['cluster']+' (typical for ~'+conclu_dict['percentage']+'% of persons), distinguished by __'+conclu_dict['distinguish']+'__',
                   align='J', markdown=True)
    
    pdf.ln(3)
    pdf.cell(105)
    x, y = pdf.x, pdf.y
    pdf.multi_cell(w=45,text='Most abundant genus is __'+conclu_dict['topgenus']+'__',align='J',markdown=True)
    pdf.set_font('Helvetica', 'B', 11)
    # pdf.ln(5)
    
    # Tree: Phylum
    pdf.rect(19,80,101,100)
    pdf.set_xy(20,79)
    pdf.cell(99, 10, 'Reference: Healthy young and old population', align='L', border=0)
    im = Image.open(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'sampletree.png'))
    if im.size[0]/im.size[1] < 1:
        pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'sampletree.png'), x=29, y= 90, h=87)
    else:
        pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'sampletree.png'), x=29, y= 90, w=87)
    pdf.image(os.path.join(ref_db,'clade_legend_EN.png'), x=20, y= 161, w=15)
    # Phylum barplot
    pdf.rect(123,80,68,100)
    pdf.cell(5)
    pdf.cell(66, 10, 'Phylum analysis', align='L', border=0)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'phylum_EN.jpg'), x=128, y= 90, h=89)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'f_b_ratio_EN.jpg'), x=159, y= 124, h=55)
    pdf.ln(103)

    # Top 10 genera
    pdf.rect(19,183,172,94)
    pdf.cell(169, 10, 'Genus analysis (Top 10 genera)', align='L', border=0)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'genera_EN.jpg'), x=22, y= 191, h=85)

    # Table with genus function
    df_func = pd.read_csv(ref_func,sep='\t',encoding = "utf16").set_index('Genus')
    df_func = df_func[['Function']]
    with open(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'topGenera.pkl'),'rb') as f: topGenera = pickle.load(f)        
    commonGenera = [g for g in topGenera if g in df_func.index.to_list()]
    commonGenera.reverse()
    missingGenera.extend([g for g in topGenera if not g in df_func.index.to_list()])
    df_func = df_func.loc[commonGenera,:].reset_index().map(str)
    data = [list(df_func)] + df_func.values.tolist()  # Combine columns and rows in one list
    pdf.ln(6)
    headings_style = FontFace(emphasis="BOLD",fill_color=(200, 200, 200))
    pdf.set_font('Helvetica', '', 7)
    with pdf.table(width=101, col_widths=(25,76), align='R', first_row_as_headings=True,headings_style=headings_style,line_height=1.1 * pdf.font_size) as table:
        INDEX_OF_COLUMN_IN_ITALICS = 0
        ITALICS = FontFace(emphasis="ITALICS")
        header = True
        for data_row in data:
            row = table.row()
            for i, datum in enumerate(data_row):
                row.cell(datum, style=ITALICS if not header and i == INDEX_OF_COLUMN_IN_ITALICS else None)
                header = False

    #----------------------#
    #--  German Version  --#
    #----------------------#
    pdf.add_page()
    pdf.set_margins(left=20,right=20,top=10)
    # Add title
    pdf.set_font('Helvetica', 'B', 16)
    # Title
    pdf.cell(170, 10, 'Mikrobiom Analyse für Probe '+str(df_sampleinfo.loc[idx,'Screening-ID']), border =0, align='C')
    pdf.ln(9)
    # Sample description
    pdf.set_font('Helvetica', '', 11)
    pdf.rect(19,20,172,14)
    pdf.cell(70, 10, 'Alter (am Sammeltag): '+ str(df_sampleinfo.loc[idx,'Age']), align='L', border=0)
    pdf.cell(100, 10, 'Sammeldatum: '+ collect_date, align='R', border=0)
    pdf.ln(6)
    sample_type = 'Stuhl' if df_sampleinfo.loc[idx,'SampleType'].lower()=='stool' else 'Unknown'
    pdf.cell(70, 10, 'Probentyp: '+sample_type, align='L', border=0)
    pdf.cell(100, 10, 'Analysendatum: '+ str(analysis_date), align='R', border=0)
    pdf.ln(11)

    # Alpha diversity: Boxplot
    pdf.set_font('Helvetica', 'B', 11)
    pdf.rect(19,37,101,40)
    pdf.cell(99, 10, 'Analyse des Artenreichtums', align='L', border=0)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'alpha-diversity_DE.jpg'), x=22, y=44, w=96)
    # Conclusion
    conclu_dict = {}
    sname = df_sampleinfo.loc[idx,'SampleName']
    with open(os.path.join(input_dir,sname,'report_DE.txt'),'r') as fp:
        lines = fp.readlines()
        for line in lines:
            conclu_dict.update({line.strip().split(':')[0]:line.strip().split(':')[1]})
    pdf.rect(123,37,68,40)
    pdf.cell(5)
    pdf.cell(66, 10, 'Zusammenfassung', align='L', border=0)
    pdf.set_font('Helvetica', '', 9)
    pdf.ln(10)
    pdf.cell(105)
    if isinstance(df_sampleinfo.loc[idx,'Age'],str):
        pdf.multi_cell(w=64,text='Anzahl der Taxa: '+conclu_dict['std']+' der Kohorte "Gesund, jung" (Alter < 40)',align='J') 
    else:
        if df_sampleinfo.loc[idx,'Age'] < 40 :
            pdf.multi_cell(w=64,text='Anzahl der Taxa: '+conclu_dict['std']+' der Kohorte "Gesund, jung" (Alter < 40)',align='J') 
        else:
            pdf.multi_cell(w=64,text='Anzahl der Taxa: '+conclu_dict['healthy']+' der Kohorte "Gesund, alt" (Alter > 40)',align='J')
    pdf.ln(3)
    pdf.cell(105)
    x, y = pdf.x, pdf.y
    pdf.multi_cell(w=64,
                   text='Die Probe fällt in Cluster '+conclu_dict['cluster']+' (typisch für ~'+conclu_dict['percentage']+'% der Personen), gekennzeichnet durch __'+conclu_dict['distinguish']+'__',
                   align='J', markdown=True)
    pdf.ln(3)
    pdf.cell(105)
    pdf.multi_cell(w=45,text='Die häufigste Gattung ist __'+conclu_dict['topgenus']+'__',align='J',markdown=True)
    pdf.set_font('Helvetica', 'B', 11)
    
    # Tree: Phylum
    pdf.rect(19,80,101,100)
    pdf.set_xy(20,79)
    pdf.cell(99, 10, 'Referenz: Gesunde, junge und alte Menschen', align='L', border=0)
    im = Image.open(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'sampletree.png'))
    if im.size[0]/im.size[1] < 1:
        pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'sampletree.png'), x=29, y= 90, h=87)
    else:
        pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'sampletree.png'), x=29, y= 90, w=87)
    pdf.image(os.path.join(ref_db,'clade_legend_DE.png'), x=20, y= 161, w=15)
    # Phylum barplot
    pdf.rect(123,80,68,100)
    pdf.cell(5)
    pdf.cell(66, 10, 'Analyse der Stämme', align='L', border=0)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'phylum_DE.jpg'), x=128, y= 90, h=89)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'f_b_ratio_DE.jpg'), x=159, y= 124, h=55)
    pdf.ln(103)

    # Top 10 genera
    pdf.rect(19,183,172,94)
    pdf.cell(169, 10, 'Analyse der Gattungen (Obere 10)', align='L', border=0)
    pdf.image(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'genera_DE.jpg'), x=22, y= 191, h=85)

    # Table with genus function
    df_func = pd.read_csv(ref_func,sep='\t',encoding = "utf16").rename(columns={'Genus':'Gattung'}).set_index('Gattung')
    df_func = df_func[['Funktion']]
    with open(os.path.join(input_dir,df_sampleinfo.loc[idx,'SampleName'],'topGenera.pkl'),'rb') as f: topGenera = pickle.load(f)        
    commonGenera = [g for g in topGenera if g in df_func.index.to_list()]
    commonGenera.reverse()
    missingGenera.extend([g for g in topGenera if not g in df_func.index.to_list()])
    df_func = df_func.loc[commonGenera,:].reset_index().map(str)
    data = [list(df_func)] + df_func.values.tolist()  # Combine columns and rows in one list
    pdf.ln(6)
    headings_style = FontFace(emphasis="BOLD",fill_color=(200, 200, 200))
    pdf.set_font('Helvetica', '', 7)
    with pdf.table(width=101, col_widths=(25,76), align='R', first_row_as_headings=True,headings_style=headings_style,line_height=1.1 * pdf.font_size) as table:
        INDEX_OF_COLUMN_IN_ITALICS = 0
        ITALICS = FontFace(emphasis="ITALICS")
        header = True
        for data_row in data:
            row = table.row()
            for i, datum in enumerate(data_row):
                row.cell(datum, style=ITALICS if not header and i == INDEX_OF_COLUMN_IN_ITALICS else None)
                header = False
    
    if not os.path.exists(os.path.join(input_dir,'pdf')): os.mkdir(os.path.join(input_dir,'pdf'))
    pdf.output(os.path.join(input_dir,'pdf',str(df_sampleinfo.loc[idx,'Screening-ID'])+'.pdf'))

  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')
  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')
  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')
  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')
  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')
  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')
  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.samplename), 0, 0, 'R')
  self.set_font('Arial', 'I', 8)
  self.cell(0, 10, 'Order: '+str(self.ordernum)+' / Sample: '+str(self.

# Show missing Genera

In [13]:
print(set(missingGenera))

{'Others'}
