In [1]:
import sys
import os
import numpy as np
import matplotlib
# matplotlib.use('pdf') # do this because environment does not have GUI backend
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D
from scipy.special import factorial
import seaborn as sns
import pandas as pd
import statistics
from scipy import stats
import math
import mpl_axes_aligner
import regex as re
# !{sys.executable} -m pip install regex

# Default text settings applied to every matplotlib figure in this notebook.
font = dict(family='Arial', size=8)
matplotlib.rc('font', **font)

# Marker that the setup cell ran to the end.
print('Complete')

Complete


### Herpesvirus Domains

#### Prepare files for pulling info

In [2]:
# get domains and essential regions to report
# [VIRF2, VIRF4, EBNA2, VIRF1, RTA, UL29, BZLF1, BZLF1]
# tileIDs = ['Q2HR71_028', 'Q2HR71_031', 'Q2HR73_032', 'Q2HR73_052', 'Q2HR73_062', 'P12978_010', 'P12978_040', 'F5HF68_016', 'F5HCV3_058', 'C0H677_035', 'P03206_001', 'P03206_003', 'P03206_015']
# tileIDs = ['P04296_108', 'P04296_109', 'P52339_106', 'P52339_107', 'P03206_015', 'Q2HRD3_107']
# (the tileIDs lists above are only read by the commented-out test_set toggle further down in this cell)

# domain table: supplies the extended/max coordinates, sequences, effect and
# per-replicate estimated %ON/%OFF columns that are selected below
doms = pd.read_csv('../Supplemental Tables - CSV/HHV-tiling/HHV_domains_estON-OFF.csv')
# max tiles from the perturbation tables; used below to add tiles that are missing from doms
maxtiles = pd.read_csv('../Supplemental Tables - CSV/HHV-perturbation/HHV-perturb_max-tiles_UPDATED.csv')
# motif and "breaking" region tables (one for activation, one for repression);
# the breaking regions are labelled "Essential" in the PDF table
actess = pd.read_csv('../Supplemental Tables - CSV/HHV-perturbation/HHVpert_Activation_domain_breaking_motifs_simplified.csv')
repess = pd.read_csv('../Supplemental Tables - CSV/HHV-perturbation/HHVpert_Repression_domain_breaking_motifs_simplified.csv')

def sort_ess(df):
    """Return a copy of a motif/breaking-region table sorted by first position.

    Three helper columns are added to the returned frame:

    * ``MStart`` - ``Motif Start``, or ``Breaking Start`` where ``Motif Start`` is 0
    * ``BStart`` - ``Breaking Start``, or ``Motif Start`` where ``Breaking Start`` is 0
    * ``Start``  - the smaller of ``MStart`` and ``BStart``; rows are sorted on it

    A value of 0 in either start column is therefore treated as "no position
    given" and replaced by the other column's value before comparing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Motif Start`` and ``Breaking Start``.

    Returns
    -------
    pandas.DataFrame
        New frame, ascending by ``Start``. The input frame is not modified
        (the helper columns used to be written into the caller's frame too).
    """
    # work on a copy so the caller's frame does not silently gain columns
    out = df.copy()
    motif_start = out['Motif Start']
    breaking_start = out['Breaking Start']

    out['MStart'] = np.where(motif_start == 0, breaking_start, motif_start)
    out['BStart'] = np.where(breaking_start == 0, motif_start, breaking_start)
    out['Start'] = np.where(out['MStart'] < out['BStart'], out['MStart'], out['BStart'])
    return out.sort_values(by=['Start'], ascending=True)

# order the motif / breaking-region rows of each table by their first position
actess = sort_ess(actess)
repess = sort_ess(repess)

# tiles that appear in the max-tile table but not in the domain table
maxIDs = list(set(list(maxtiles['Tile ID'])))
domIDs = list(set(list(doms['Tile ID'])))
maxonly = [i for i in maxIDs if i not in domIDs]

# keep only the columns that the PDF script reads
doms = doms[['Protein ID', 'Tile ID', 'Extended Start', 'Extended End', 'Extended Sequence',
             'Max Start', 'Max End', 'Max Sequence', 'Effect', 'Virus', 'Protein', 'UniProt Protein Family',
             'Gene', 'R1 Est. %ON', 'R2 Est. %ON', 'R1 Est. %OFF', 'R2 Est. %OFF']]
# restrict maxtiles to the tiles missing from doms; it has no 'Extended ...' or
# 'Est. %ON/%OFF' columns here, so those are NaN for the appended rows until filled in below
maxtiles = maxtiles[maxtiles['Tile ID'].isin(maxonly)]
maxtiles = maxtiles[['Protein ID', 'Tile ID', 'Max Start', 'Max End', 'Max Sequence', 'Effect', 'Virus', 'Protein', 'UniProt Protein Family', 'Gene']]
# append the extra tiles, keep one row per (tile, effect) and order by tile ID
doms = pd.concat([doms, maxtiles]).drop_duplicates(subset=['Tile ID', 'Effect']).sort_values(by='Tile ID').reset_index(drop=True)
# missing gene names are shown as '-'
doms['Gene'] = doms['Gene'].replace([np.nan], ['-'])

# fill in info for the two tiles not in the domains file but that were mutated
# (each tile `a` takes the extended-domain fields of tile `b` from the same protein)
for a, b in zip(['F5HIC6_027', 'P52351_041'], ['F5HIC6_025', 'P52351_038']):
    doms.loc[doms['Tile ID']==a, 'Extended Start'] = list(doms.loc[doms['Tile ID']==b, 'Extended Start'])[0]
    doms.loc[doms['Tile ID']==a, 'Extended End'] = list(doms.loc[doms['Tile ID']==b, 'Extended End'])[0]
    doms.loc[doms['Tile ID']==a, 'Extended Sequence'] = list(doms.loc[doms['Tile ID']==b, 'Extended Sequence'])[0]

# hard-coded replicate estimates for those two tiles
# NOTE(review): the source of these numbers is not visible in this notebook - confirm
doms.loc[doms['Tile ID']=='F5HIC6_027', 'R1 Est. %ON'] = 88.53384966
doms.loc[doms['Tile ID']=='F5HIC6_027', 'R2 Est. %ON'] = 83.89800185
doms.loc[doms['Tile ID']=='P52351_041', 'R1 Est. %OFF'] = 54.25129388
doms.loc[doms['Tile ID']=='P52351_041', 'R2 Est. %OFF'] = 54.35137983

# update effect for VIRF3 tile from dual to activation
doms.loc[doms['Tile ID']=='F5HIC6_027', 'Effect'] = 'Activation'
# doms = doms[~((doms['Tile ID'].isin(['Q9J3N2_006', 'F5HIC6_025'])) & (doms['Effect']=='Activation'))]
# print(doms[doms['Protein ID']=='P03227'])

# # toggle on/off for testing
# def test_set(dflist):
#     return [df[df['Tile ID'].isin(tileIDs)] for df in dflist]
# doms, actess, repess = test_set([doms, actess, repess])

# directories holding the pre-rendered plots that are embedded in the PDF (150-dpi versions)
tiling_plot_dir = '../Individual Figure Panels/HHV_indivProteins_allTiles_150-dpi'
act_plot_dir = '../Individual Figure Panels/HHV_Activator_perturbations_150-dpi'
rep_plot_dir = '../Individual Figure Panels/HHV_Repressor_perturbations_150-dpi'

# 300-dpi alternatives; swap with the three lines above to use them
# tiling_plot_dir = '../Individual Figure Panels/HHV_indivProteins_allTiles_300-dpi'
# act_plot_dir = '../Individual Figure Panels/HHV_Activator_perturbations_300-dpi'
# rep_plot_dir = '../Individual Figure Panels/HHV_Repressor_perturbations_300-dpi'

print('Files and paths reported')

Files and paths reported


#### Script to produce a PDF summary of the data, with auxiliary functions

In [3]:
# Author: Connor Ludwig
# Organization: Bintu Lab
# Date: 2/4/2023

# !{sys.executable} -m pip install fpdf
from fpdf import FPDF
import math

##### AUXILIARY FUNCTIONS #####
# function to calculate the x value needed to center an image
def center(width, page_width=210):
    """Return the x position that horizontally centers an item of ``width``.

    Parameters
    ----------
    width : float
        Width of the item to place, in the same units as ``page_width``.
    page_width : float, optional
        Total page width. Defaults to 210, the value that used to be
        hard-coded here (presumably A4 width in mm - confirm against the
        FPDF page format in use).

    Returns
    -------
    float
        ``(page_width - width) / 2``
    """
    return (page_width - width) / 2

# function to make a table that accounts for single- and multi-line entries (accommodates text wrapping)
def table(df):
    """Draw the motif / essential-region table for one tile on the global ``pdf``.

    Every row of ``df`` becomes one table row with eight columns (motif name,
    start, end and sequence, the overlap flag, and essential start, end and
    sequence). Rows whose 'Breaking Sequence' is longer than 20 characters are
    drawn with ``multi_cell`` so the sequence wraps; the other cells of such a
    row are padded with newlines so that all cells of the row share one height.

    Uses the module-level names ``pdf`` (document being built) and
    ``text_fontsize``. Writes to the PDF and returns nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns listed in ``colDict`` below. The caller's
        frame is not modified: ``fillna`` returns a new frame before any
        column is edited.
    """
    line_h = 4    # height of one text line
    wrap_at = 20  # characters of essential sequence per line (width of its cell)

    # edit input table as needed
    df = df.fillna('')
    df['Motif overlaps Breaking Region?'] = df['Motif overlaps Breaking Region?'].replace([0, 1], ['No', 'Yes'])
    # a position of 0 is displayed as '-'
    for pos_col in ['Motif Start', 'Motif End', 'Breaking Start', 'Breaking End']:
        df[pos_col] = df[pos_col].replace([0], ['-'])

    # define dictionary with information regarding renaming column, column width, and whether to start a new row
    colDict = {'Motif Name':['Motif Name', 24, 0], 'Motif Start':['Motif Start', 16, 0], 'Motif End':['Motif End', 16, 0],
               'Motif Sequence':['Motif Sequence', 24, 0], 'Motif overlaps Breaking Region?':['Overlaps Essential?', 28, 0],
               'Breaking Start':['Essential Start', 20, 0], 'Breaking End':['Essential End', 20, 0],
               'Breaking Sequence':['Essential Sequence', 42, 1]}

    # number of text lines each row occupies (1 unless the essential sequence wraps)
    seq_loc = df.columns.get_loc('Breaking Sequence')
    numlines_by_row = [max(1, int(math.ceil(len(df.iloc[p, seq_loc]) / wrap_at))) for p in range(len(df))]

    # pre-calculate height of table (header + rows) to see if it will fit in one piece
    total_height = line_h * (1 + sum(numlines_by_row))

    # if not enough space is left, start the table on a new page
    if (pdf.y + total_height) > 264:
        pdf.add_page()

    # first create table header
    pdf.set_fill_color(r=230, g=230, b=230)
    pdf.set_font('Arial', '', size=text_fontsize)
    for c in colDict:
        pdf.cell(w=colDict[c][1], h=line_h, txt='\n'.join(colDict[c][0].split('_')), border=1, fill=True, align='C', ln=colDict[c][2])

    # populate table one cell at a time with motif and essential region information
    for p, numlines in enumerate(numlines_by_row):
        if numlines > 1:
            # wrapped row: every cell is positioned by hand, left to right, at the same y
            row_y = pdf.y
            xoffset = 10
            # Non-sequence cells print their text followed by (numlines - 1) newlines.
            # The previous lookup {2: 8, 3: 6} was found by trial and error and raised
            # KeyError for sequences needing four or more lines; 4 * n / (n - 1)
            # reproduces 8 and 6 exactly and continues the same pattern.
            # NOTE(review): the 4+ line case is extrapolated - presumably multi_cell
            # ignores one trailing newline so that n - 1 lines are drawn; confirm on a rendered page.
            pad_h = line_h * numlines / (numlines - 1)
            for c in colDict:
                # set offset values for x and y
                pdf.x = xoffset
                pdf.y = row_y
                entry = str(df.iloc[p, df.columns.get_loc(c)])
                if c == 'Breaking Sequence':
                    # the essential sequence cell wraps its own text
                    pdf.multi_cell(w=colDict[c][1], h=line_h, txt=entry, border=1, align='L')
                else:
                    # pad with new lines so the cell height matches the essential sequence cell
                    pdf.multi_cell(w=colDict[c][1], h=pad_h, txt=entry + '\n' * (numlines - 1), border=1, align='L')
                # the next cell starts where this one ends
                xoffset += colDict[c][1]
        else:
            # essential sequence fits on one line: construct normal cells
            for c in colDict:
                pdf.cell(w=colDict[c][1], h=line_h, txt=str(df.iloc[p, df.columns.get_loc(c)]), border=1, align='L', ln=colDict[c][2])
    pdf.ln(5)
    
    
def percent_effect(df, row, effect, which_val):
    """Return the lower or upper replicate estimate for one domain, as text.

    Reads the two replicate columns ``R1 Est. %ON`` / ``R2 Est. %ON`` (for
    ``effect == 'Activation'``) or ``R1 Est. %OFF`` / ``R2 Est. %OFF`` (for
    ``effect == 'Repression'``) at positional row ``row``, rounds each to one
    decimal place and returns the smaller or larger of the two.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four ``Est.`` columns.
    row : int
        Positional (``iloc``) row index.
    effect : str
        ``'Activation'`` or ``'Repression'``.
    which_val : str
        ``'lower'`` for the minimum, ``'upper'`` for the maximum.

    Returns
    -------
    str
        The selected rounded value converted with ``str``.

    Raises
    ------
    ValueError
        If ``effect`` or ``which_val`` is not one of the values above. An
        unknown effect used to fail with an UnboundLocalError and an unknown
        ``which_val`` silently returned None.
    """
    suffix_by_effect = {'Activation': '%ON', 'Repression': '%OFF'}
    if effect not in suffix_by_effect:
        raise ValueError("effect must be 'Activation' or 'Repression', got %r" % (effect,))
    if which_val not in ('lower', 'upper'):
        raise ValueError("which_val must be 'lower' or 'upper', got %r" % (which_val,))

    text = suffix_by_effect[effect]
    R1 = round(df.iloc[row, df.columns.get_loc('R1 Est. ' + text)], 1)
    R2 = round(df.iloc[row, df.columns.get_loc('R2 Est. ' + text)], 1)

    pick = min if which_val == 'lower' else max
    return str(pick([R1, R2]))


# verb used in the figure captions for each effect
effect_text = {'Activation':'activated', 'Repression':'repressed'}
    
    
class PDF(FPDF):
    """FPDF document with a running title on every page and a page number in the footer."""

    def __init__(self):
        super().__init__()
        # counter initialised to 1; not read anywhere in the visible part of this notebook
        self.figcount = 1

    def header(self):
        """Print the bold, centered running title, then move down by 6."""
        running_title = 'Herpesvirus Tiling and Perturbation Screen Data from Ludwig, C.H. et al. 2023'
        self.set_font('Arial', 'B', size=8)
        self.cell(w=0, h=4, txt=running_title, border=0, ln=0, align='C')
        self.ln(6)

    def footer(self):
        """Print the centered page number, 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font('Arial', '', size=8)
        page_label = str(self.page_no())
        self.cell(w=0, h=10, txt=page_label, border=0, ln=0, align='C')
        # alternative label with the total page count:
        # self.cell(0, 10, 'Page ' + str(self.page_no()) + ' of {nb}', 0, 0, 'C')


# for EBNA proteins: UniProt ID -> [EBV strain, EBV type]
EBNAstrain = {
    uniprot_id: [strain, ebv_type]
    for uniprot_id, strain, ebv_type in [
        ('P03203', 'B95-8', 'Type 1'),
        ('P03204', 'B95-8', 'Type 1'),
        ('P03211', 'B95-8', 'Type 1'),
        ('P12977', 'B95-8', 'Type 1'),
        ('P12978', 'B95-8', 'Type 1'),
        ('Q1HVF7', 'AG876', 'Type 2'),
        ('Q1HVG4', 'AG876', 'Type 2'),
        ('Q3KSS4', 'GD1', 'Type 1'),
        ('Q3KST0', 'GD1', 'Type 1'),
        ('Q3KSV2', 'GD1', 'Type 1'),
        ('Q69022', 'AG876', 'Type 2'),
        ('Q69138', 'AG876', 'Type 2'),
        ('Q69140', 'AG876', 'Type 2'),
    ]
}

##### CONSTRUCT PDF #####
# One page per protein that has a reported domain: title, tiling plot, then for each
# domain the extended / max-tile captions and, when a perturbation plot exists for the
# max tile, that plot plus its motif / essential-region table.
w1 = 100 # tiling plot width
w2 = 160 # perturbation plot width
title_fontsize = 10 # page title font size
text_fontsize = 8 # all other text font size

pdf = PDF()
pdf.alias_nb_pages()

# UniProt IDs with at least one reported domain (set: constant-time lookup per plot)
dom_protein_ids = set(doms['Protein ID'])

# list the perturbation plot files once instead of once per domain; sorted so that the
# first matching file picked for a tile does not depend on the order os.listdir returns
act_plot_files = sorted(os.listdir(act_plot_dir))
rep_plot_files = sorted(os.listdir(rep_plot_dir))

# for each tiling plot, in sorted file-name order so that the page order is reproducible
# (os.listdir returns entries in arbitrary order)
for plot in sorted(os.listdir(tiling_plot_dir)):
    # define path to the plot
    plot_path = os.path.join(tiling_plot_dir, plot)
    # extract information from plot file title, which starts with <protein>_<virus>_<UniProt ID>
    name_parts = plot.split('_')
    if len(name_parts) < 3:
        # stray file that does not follow the naming scheme; unpacking it used to abort the whole run
        print('Skipping %s: unexpected file name' % plot)
        continue
    protein, virus, uID = name_parts[0:3]

    # if the protein (identified by UniProt ID) has a domain
    if uID in dom_protein_ids:
        print('Adding information for %s %s (%s)' % (virus, protein, uID))
        gene = list(doms.loc[doms['Protein ID']==uID, 'Gene'])[0]
        family = list(doms.loc[doms['Protein ID']==uID, 'UniProt Protein Family'])[0]
        # make new page of PDF and create title cell
        pdf.add_page()
        pdf.set_font('Arial', '', size=title_fontsize)
        pdf.cell(w=0, h=6, txt=(virus + ' ' + protein + ' (' + uID + ')'), ln=2, align='C')
        pdf.set_font('Arial', '', size=text_fontsize)
        subtitle = 'Gene: ' + gene + ' ; Protein Family: ' + family
        if uID in EBNAstrain:
            # EBNA proteins additionally report the EBV strain and type
            subtitle += ' ; EBV Strain: ' + EBNAstrain[uID][0] + ' (' + EBNAstrain[uID][1] + ')'
        pdf.cell(w=0, h=4, txt=subtitle, ln=2, align='C')
        pdf.ln(4)

        # plot tiling plot
        print(' - adding tiling plot')
        pdf.image(plot_path, x=center(w1), w=w1)
        pdf.ln(2)
        pdf.line(10, pdf.y, 200, pdf.y)
        pdf.ln(4)

        # create temporary data frame with only protein info
        tempdf = doms[doms['Protein ID']==uID]

        # iterate over rows in dataframe and write domain information (extended and max)
        for i in range(len(tempdf)):
            tile = tempdf.iloc[i, tempdf.columns.get_loc('Tile ID')]
            effect = tempdf.iloc[i, tempdf.columns.get_loc('Effect')]

            # extended sequence info
            caption_vals_ext = (effect.lower(),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Extended Start')])),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Extended End')])),
                                tempdf.iloc[i, tempdf.columns.get_loc('Extended Sequence')])
            pdf.write(4, 'Extended %s domain from residues %s to %s: %s' % caption_vals_ext)
            pdf.ln(6)

            # max sequence info
            caption_vals_max = (effect.lower(),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Max Start')])),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Max End')])),
                                percent_effect(tempdf, i, effect, 'lower'),
                                percent_effect(tempdf, i, effect, 'upper'),
                                effect_text[effect],
                                tempdf.iloc[i, tempdf.columns.get_loc('Max Sequence')])
            pdf.write(4, 'Max tile of %s domain from residues %s to %s (estimated %s%% to %s%% of cells %s): %s' % caption_vals_max)
            pdf.ln(4)

            if effect == 'Activation':
                # compile list of all activation perturbation plots for this tile
                actfile = [os.path.join(act_plot_dir, a) for a in act_plot_files if tile in a]
                # if plot exists, add both plot and motif/essential sequence info
                if len(actfile) > 0:
                    print(' - adding activation perturbation plot')
                    pdf.ln(2)
                    # check how much space is left
                    if pdf.y + 41 > 264:
                        pdf.add_page()
                    pdf.image(actfile[0], x=center(w2), w=w2)
                    tempA = actess[actess['Tile ID']==tile]
                    # make table
                    table(tempA)
                # if no plot, add some space before next domain
                else:
                    pdf.ln(4)

            elif effect == 'Repression':
                # special cases for HSV1 DBP and HHV7 DBP: report the neighboring tile instead
                if (tile == 'P04296_108'):
                    tile = 'P04296_109'
                    pdf.write(4, 'Neighboring tile of repression domain from 1081 to 1160:')
                    pdf.ln(4)
                elif (tile == 'P52339_107'):
                    tile = 'P52339_106'
                    pdf.write(4, 'Neighboring tile of repression domain from 1052 to 1131:')
                    pdf.ln(4)
                # compile list of all repression perturbation plots for this tile
                repfile = [os.path.join(rep_plot_dir, r) for r in rep_plot_files if tile in r]
                # if plot exists, add both plot and motif/essential sequence info
                if len(repfile) > 0:
                    print(' - adding repression perturbation plot')
                    pdf.ln(2)
                    # check how much space is left
                    if pdf.y + 41 > 264:
                        pdf.add_page()
                    pdf.image(repfile[0], x=center(w2), w=w2)
                    tempR = repess[repess['Tile ID']==tile]
                    # make table
                    table(tempR)
                # if no plot, add some space before next domain
                else:
                    pdf.ln(4)

            # rule between domains
            pdf.line(10, pdf.y, 200, pdf.y)
            pdf.ln(4)

# save PDF
# pdf.output('../test_fpdf.pdf', 'F')
pdf.output('../Ludwig_2023_Supplement_HHV_Domain_Details.pdf', 'F')

print('Completed')

Adding information for HHV6B AN (P52448)
 - adding tiling plot
Adding information for EBV BDLF3 (P03224)
 - adding tiling plot
Adding information for EBV BILF2 (P03218)
 - adding tiling plot
Adding information for EBV BLRF2 (P0C717)
 - adding tiling plot
Adding information for EBV BZLF1 (P03206)
 - adding tiling plot
 - adding repression perturbation plot
 - adding repression perturbation plot
Adding information for HCMV CEP2 (P16800)
 - adding tiling plot
Adding information for HSV2 CEP2 (P89439)
 - adding tiling plot
 - adding repression perturbation plot
Adding information for HCMV CTNP (P16793)
 - adding tiling plot
Adding information for HHV7 CTNP (P52464)
 - adding tiling plot
Adding information for VZV CTNP (P09282)
 - adding tiling plot
Adding information for EBV CVC1 (P03222)
 - adding tiling plot
 - adding repression perturbation plot
Adding information for HCMV CVC1 (P16799)
 - adding tiling plot
Adding information for HSV1 CVC1 (P10201)
 - adding tiling plot
 - adding repre

Adding information for VZV ITP (P09277)
 - adding tiling plot
 - adding repression perturbation plot
Adding information for KSHV KbZIP (Q2HR82)
 - adding tiling plot
 - adding repression perturbation plot
Adding information for KSHV LANA1 (Q9QR71)
 - adding tiling plot
 - adding repression perturbation plot
Adding information for EBV LF1 (Q8AZJ5)
 - adding tiling plot
Adding information for HHV6A LJ1 (Q69545)
 - adding tiling plot
Adding information for EBV LTP (P03186)
 - adding tiling plot
 - adding activation perturbation plot
 - adding activation perturbation plot
Adding information for HSV1 LTP (P10220)
 - adding tiling plot
 - adding repression perturbation plot
 - adding activation perturbation plot
Adding information for HSV2 LTP (P89459)
 - adding tiling plot
Adding information for KSHV LTP (Q2HR64)
 - adding tiling plot
Adding information for VZV LTP (Q4JQX9)
 - adding tiling plot
Adding information for HCMV MCP (P16729)
 - adding tiling plot
Adding information for HHV7 MCP (

### Other Domains

#### Prepare files for pulling info

In [4]:
# get domains to report for the other (vTR and CoV) tiling screens
vTR = pd.read_csv('../Supplemental Tables - CSV/vTR-CoV-tiling/vTR_domains_estON-OFF.csv')
CoV = pd.read_csv('../Supplemental Tables - CSV/vTR-CoV-tiling/CoV_domains_estON-OFF.csv')
HHV = pd.read_csv('../Supplemental Tables - CSV/HHV-tiling/HHV_domains_estON-OFF.csv')

# unique protein IDs per table (despite the *_tiles names these hold 'Protein ID' values, not tile IDs)
vTR_tiles = list(set(vTR['Protein ID']))
CoV_tiles = list(set(CoV['Protein ID']))
HHV_tiles = list(set(HHV['Protein ID']))

# keep only the vTR proteins that are in neither the HHV nor the CoV table
_covered_ids = set(HHV_tiles) | set(CoV_tiles)
vTR_tiles = [v for v in vTR_tiles if v not in _covered_ids]
# .copy() so the 'Gene' assignment below writes to an independent frame rather than
# to a filtered slice of the original (which triggers pandas' SettingWithCopyWarning)
vTR = vTR[vTR['Protein ID'].isin(vTR_tiles)].copy()

# missing gene names are shown as '-'
vTR['Gene'] = vTR['Gene'].replace([np.nan], ['-'])
CoV['Gene'] = CoV['Gene'].replace([np.nan], ['-'])

# 300-dpi alternatives; swap with the two lines below to use them
# vTR_plot_dir = '../Individual Figure Panels/vTR_indivProteins_allTiles_300-dpi'
# CoV_plot_dir = '../Individual Figure Panels/CoV_indivProteins_allTiles_300-dpi'

vTR_plot_dir = '../Individual Figure Panels/vTR_indivProteins_allTiles_150-dpi'
CoV_plot_dir = '../Individual Figure Panels/CoV_indivProteins_allTiles_150-dpi'

# print('Files and paths reported')

In [5]:
# Author: Connor Ludwig
# Organization: Bintu Lab
# Date: 2/4/2023

# !{sys.executable} -m pip install fpdf
from fpdf import FPDF
import math

##### AUXILIARY FUNCTIONS #####
# function to calculate the x value needed to center an image
def center(width, page_width=210):
    """Return the x position that horizontally centers an item of ``width``.

    Parameters
    ----------
    width : float
        Width of the item to place, in the same units as ``page_width``.
    page_width : float, optional
        Total page width. Defaults to 210, the value that used to be
        hard-coded here (presumably A4 width in mm - confirm against the
        FPDF page format in use).

    Returns
    -------
    float
        ``(page_width - width) / 2``
    """
    return (page_width - width) / 2


def percent_effect(df, row, effect, which_val):
    """Return the lower or upper replicate estimate for one domain, as text.

    Reads the two replicate columns ``R1 Est. %ON`` / ``R2 Est. %ON`` (for
    ``effect == 'Activation'``) or ``R1 Est. %OFF`` / ``R2 Est. %OFF`` (for
    ``effect == 'Repression'``) at positional row ``row``, rounds each to one
    decimal place and returns the smaller or larger of the two.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four ``Est.`` columns.
    row : int
        Positional (``iloc``) row index.
    effect : str
        ``'Activation'`` or ``'Repression'``.
    which_val : str
        ``'lower'`` for the minimum, ``'upper'`` for the maximum.

    Returns
    -------
    str
        The selected rounded value converted with ``str``.

    Raises
    ------
    ValueError
        If ``effect`` or ``which_val`` is not one of the values above. An
        unknown effect used to fail with an UnboundLocalError and an unknown
        ``which_val`` silently returned None.
    """
    suffix_by_effect = {'Activation': '%ON', 'Repression': '%OFF'}
    if effect not in suffix_by_effect:
        raise ValueError("effect must be 'Activation' or 'Repression', got %r" % (effect,))
    if which_val not in ('lower', 'upper'):
        raise ValueError("which_val must be 'lower' or 'upper', got %r" % (which_val,))

    text = suffix_by_effect[effect]
    R1 = round(df.iloc[row, df.columns.get_loc('R1 Est. ' + text)], 1)
    R2 = round(df.iloc[row, df.columns.get_loc('R2 Est. ' + text)], 1)

    pick = min if which_val == 'lower' else max
    return str(pick([R1, R2]))


# verb used in the figure captions for each effect
effect_text = {'Activation':'activated', 'Repression':'repressed'}

    
class PDF(FPDF):
    """FPDF document with a running title on every page and an offset page number in the footer.

    Parameters
    ----------
    page_number_offset : int, optional
        Added to FPDF's own page number before it is printed. Defaults to
        199, the value that used to be hard-coded in ``footer``
        (presumably the number of pages that precede this document in the
        combined supplement - confirm whenever the earlier PDF changes length).
    """

    def __init__(self, page_number_offset=199):
        super(PDF,self).__init__()
        # counter initialised to 1; not read anywhere in the visible part of this notebook
        self.figcount=1
        self.page_number_offset = page_number_offset

    def header(self):
        """Print the bold, centered running title, then move down by 6."""
        self.set_font('Arial', 'B', 8)
        self.cell(0, 4, 'Other Tiling Screen Data from Ludwig, C.H. et al. 2023', 0, 0, 'C')
        self.ln(6)

    def footer(self):
        """Print the centered, offset page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font('Arial', '', 8)
        # Page number, shifted by the configured offset
        self.cell(0, 10, str(self.page_no() + self.page_number_offset), 0, 0, 'C')
#         self.cell(0, 10, 'Page ' + str(self.page_no()) + ' of {nb}', 0, 0, 'C')


##### CONSTRUCT PDF #####
# One page per vTR protein that has a reported domain: title, tiling plot, then the
# extended / max-tile captions for each of its domains.
w1 = 100 # tiling plot width
title_fontsize = 10 # page title font size
text_fontsize = 8 # all other text font size

pdf = PDF()
pdf.alias_nb_pages()

# protein IDs with at least one reported domain (set: constant-time lookup per plot)
vTR_protein_ids = set(vTR['Protein ID'])

# for each tiling plot, in sorted file-name order so that the page order is reproducible
# (os.listdir returns entries in arbitrary order)
for plot in sorted(os.listdir(vTR_plot_dir)):
    # define path to the plot
    plot_path = os.path.join(vTR_plot_dir, plot)
    # extract information from plot file title, which starts with <protein>_<virus>_<UniProt ID>
    name_parts = plot.split('_')
    if len(name_parts) < 3:
        # stray file that does not follow the naming scheme; unpacking it used to abort the whole run
        print('Skipping %s: unexpected file name' % plot)
        continue
    protein, virus, uID = name_parts[0:3]

    # if the protein (identified by UniProt ID) has a domain
    if uID in vTR_protein_ids:
        print('Adding information for %s %s (%s)' % (virus, protein, uID))
        gene = list(vTR.loc[vTR['Protein ID']==uID, 'Gene'])[0]
        family = list(vTR.loc[vTR['Protein ID']==uID, 'Protein Family'])[0]
        # make new page of PDF and create title cell
        pdf.add_page()
        pdf.set_font('Arial', '', size=title_fontsize)
        pdf.cell(w=0, h=6, txt=(virus + ' ' + protein + ' (' + uID + ')'), ln=2, align='C')
        pdf.set_font('Arial', '', size=text_fontsize)
        pdf.cell(w=0, h=4, txt=('Gene: ' + gene + ' ; Protein Family: ' + family), ln=2, align='C')
        pdf.ln(4)

        # plot tiling plot
        print(' - adding tiling plot')
        pdf.image(plot_path, x=center(w1), w=w1)
        pdf.ln(2)
        pdf.line(10, pdf.y, 200, pdf.y)
        pdf.ln(4)

        # create temporary data frame with only protein info
        tempdf = vTR[vTR['Protein ID']==uID]

        # iterate over rows in dataframe and write domain information (extended and max)
        for i in range(len(tempdf)):
            effect = tempdf.iloc[i, tempdf.columns.get_loc('Effect')]

            # extended sequence info
            caption_vals_ext = (effect.lower(),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Extended Start')])),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Extended End')])),
                                tempdf.iloc[i, tempdf.columns.get_loc('Extended Sequence')])
            pdf.write(4, 'Extended %s domain from residues %s to %s: %s' % caption_vals_ext)
            pdf.ln(6)

            # max sequence info
            caption_vals_max = (effect.lower(),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Max Start')])),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Max End')])),
                                percent_effect(tempdf, i, effect, 'lower'),
                                percent_effect(tempdf, i, effect, 'upper'),
                                effect_text[effect],
                                tempdf.iloc[i, tempdf.columns.get_loc('Max Sequence')])

            pdf.write(4, 'Max tile of %s domain from residues %s to %s (estimated %s%% to %s%% of cells %s): %s' % caption_vals_max)
            pdf.ln(8)

            # rule between domains
            pdf.line(10, pdf.y, 200, pdf.y)
            pdf.ln(4)

# save PDF
# pdf.output('../test_fpdf.pdf', 'F')
pdf.output('../Ludwig_2023_Supplement_vTR_Domain_Details.pdf', 'F')

print('Completed')

Adding information for BKPyV Agnoprotein (P14998)
 - adding tiling plot
Adding information for HFV BEL1 (P14353)
 - adding tiling plot
Adding information for SFV BEL1 (Q87042)
 - adding tiling plot
Adding information for CeHV1 DBP (Q805Z9)
 - adding tiling plot
Adding information for CeHV2 DBP (Q5Y0S3)
 - adding tiling plot
Adding information for HHV7 DBP (O56282)
 - adding tiling plot
Adding information for KSHV DBP (K0FHU3)
 - adding tiling plot
Adding information for HAdV12 E1A (P03259)
 - adding tiling plot
Adding information for HAdV40 E1A (P10541)
 - adding tiling plot
Adding information for HAdV4 E1A (P10407)
 - adding tiling plot
Adding information for HAdV5 E1A (P03255)
 - adding tiling plot
Adding information for HAdV7 E1A (P03256)
 - adding tiling plot
Adding information for HAdV9 E1A (Q9YLA0)
 - adding tiling plot
Adding information for HPV16 E2 (P03120)
 - adding tiling plot
Adding information for HAdV40 E4 (Q64857)
 - adding tiling plot
Adding information for HAdV9 E4 (P8

In [6]:
# Author: Connor Ludwig
# Organization: Bintu Lab
# Date: 2/4/2023

# !{sys.executable} -m pip install fpdf
from fpdf import FPDF
import math

##### AUXILIARY FUNCTIONS #####
# function to calculate the x value needed to center an image
def center(width, page_width=210):
    """Return the x position that horizontally centers an item of ``width``.

    Parameters
    ----------
    width : float
        Width of the item to place, in the same units as ``page_width``.
    page_width : float, optional
        Total page width. Defaults to 210, the value that used to be
        hard-coded here (presumably A4 width in mm - confirm against the
        FPDF page format in use).

    Returns
    -------
    float
        ``(page_width - width) / 2``
    """
    return (page_width - width) / 2


def percent_effect(df, row, effect, which_val):
    """Return the lower or upper replicate estimate for one domain, as text.

    Reads the two replicate columns ``R1 Est. %ON`` / ``R2 Est. %ON`` (for
    ``effect == 'Activation'``) or ``R1 Est. %OFF`` / ``R2 Est. %OFF`` (for
    ``effect == 'Repression'``) at positional row ``row``, rounds each to one
    decimal place and returns the smaller or larger of the two.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four ``Est.`` columns.
    row : int
        Positional (``iloc``) row index.
    effect : str
        ``'Activation'`` or ``'Repression'``.
    which_val : str
        ``'lower'`` for the minimum, ``'upper'`` for the maximum.

    Returns
    -------
    str
        The selected rounded value converted with ``str``.

    Raises
    ------
    ValueError
        If ``effect`` or ``which_val`` is not one of the values above. An
        unknown effect used to fail with an UnboundLocalError and an unknown
        ``which_val`` silently returned None.
    """
    suffix_by_effect = {'Activation': '%ON', 'Repression': '%OFF'}
    if effect not in suffix_by_effect:
        raise ValueError("effect must be 'Activation' or 'Repression', got %r" % (effect,))
    if which_val not in ('lower', 'upper'):
        raise ValueError("which_val must be 'lower' or 'upper', got %r" % (which_val,))

    text = suffix_by_effect[effect]
    R1 = round(df.iloc[row, df.columns.get_loc('R1 Est. ' + text)], 1)
    R2 = round(df.iloc[row, df.columns.get_loc('R2 Est. ' + text)], 1)

    pick = min if which_val == 'lower' else max
    return str(pick([R1, R2]))


# verb used in the figure captions for each effect
effect_text = {'Activation':'activated', 'Repression':'repressed'}
    
    
class PDF(FPDF):
    """FPDF document with a running title on every page and an offset page number in the footer.

    Parameters
    ----------
    page_number_offset : int, optional
        Added to FPDF's own page number before it is printed. Defaults to
        285, the value that used to be hard-coded in ``footer``
        (presumably the number of pages that precede this document in the
        combined supplement - confirm whenever an earlier PDF changes length).
    """

    def __init__(self, page_number_offset=285):
        super(PDF,self).__init__()
        # counter initialised to 1; not read anywhere in the visible part of this notebook
        self.figcount=1
        self.page_number_offset = page_number_offset

    def header(self):
        """Print the bold, centered running title, then move down by 6."""
        self.set_font('Arial', 'B', 8)
        self.cell(0, 4, 'Other Tiling Screen Data from Ludwig, C.H. et al. 2023', 0, 0, 'C')
        self.ln(6)

    def footer(self):
        """Print the centered, offset page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font('Arial', '', 8)
        # Page number, shifted by the configured offset
        self.cell(0, 10, str(self.page_no() + self.page_number_offset), 0, 0, 'C')
#         self.cell(0, 10, 'Page ' + str(self.page_no()) + ' of {nb}', 0, 0, 'C')


##### CONSTRUCT PDF #####
# One page per CoV protein that has a reported domain: title, tiling plot, then the
# extended / max-tile captions for each of its domains.
w1 = 100 # tiling plot width
title_fontsize = 10 # page title font size
text_fontsize = 8 # all other text font size

pdf = PDF()
pdf.alias_nb_pages()

# protein IDs with at least one reported domain (set: constant-time lookup per plot)
CoV_protein_ids = set(CoV['Protein ID'])

# for each tiling plot, in sorted file-name order so that the page order is reproducible
# (os.listdir returns entries in arbitrary order)
for plot in sorted(os.listdir(CoV_plot_dir)):
    # define path to the plot
    plot_path = os.path.join(CoV_plot_dir, plot)
    # extract information from plot file title
    plot_components = plot.split('_')

    # if CoV protein was not derived from a polyprotein, process one way
    if len(plot_components) == 5:
        protein, virus, uID = plot_components[0:3]
    # else if CoV protein was derived from a polyprotein, process another way:
    # the ID spans two fields, e.g. 'P0C6U3-PRO' + '0000338204' -> 'P0C6U3|PRO_0000338204'
    elif len(plot_components) == 6:
        protein, virus, uID1, uID2 = plot_components[0:4]
        uID = '_'.join(['|'.join(uID1.split('-')), uID2])
    # any other file name: skip it. Previously such a file silently reused the uID of the
    # preceding file (adding a duplicate page) or raised NameError if it came first.
    else:
        print('Skipping %s: unexpected file name' % plot)
        continue

    # if the protein (identified by UniProt ID) has a domain
    if uID in CoV_protein_ids:
        print('Adding information for %s %s (%s)' % (virus, protein, uID))
        gene = list(CoV.loc[CoV['Protein ID']==uID, 'Gene'])[0]
        family = list(CoV.loc[CoV['Protein ID']==uID, 'Protein Family'])[0]
        # make new page of PDF and create title cell
        pdf.add_page()
        pdf.set_font('Arial', '', size=title_fontsize)
        pdf.cell(w=0, h=6, txt=(virus + ' ' + protein + ' (' + uID + ')'), ln=2, align='C')
        pdf.set_font('Arial', '', size=text_fontsize)
        pdf.cell(w=0, h=4, txt=('Gene: ' + gene + ' ; Protein Family: ' + family), ln=2, align='C')
        pdf.ln(4)

        # plot tiling plot
        print(' - adding tiling plot')
        pdf.image(plot_path, x=center(w1), w=w1)
        pdf.ln(2)
        pdf.line(10, pdf.y, 200, pdf.y)
        pdf.ln(4)

        # create temporary data frame with only protein info
        tempdf = CoV[CoV['Protein ID']==uID]

        # iterate over rows in dataframe and write domain information (extended and max)
        for i in range(len(tempdf)):
            effect = tempdf.iloc[i, tempdf.columns.get_loc('Effect')]

            # extended sequence info
            caption_vals_ext = (effect.lower(),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Extended Start')])),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Extended End')])),
                                tempdf.iloc[i, tempdf.columns.get_loc('Extended Sequence')])
            pdf.write(4, 'Extended %s domain from residues %s to %s: %s' % caption_vals_ext)
            pdf.ln(6)

            # max sequence info
            caption_vals_max = (effect.lower(),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Max Start')])),
                                str(int(tempdf.iloc[i, tempdf.columns.get_loc('Max End')])),
                                percent_effect(tempdf, i, effect, 'lower'),
                                percent_effect(tempdf, i, effect, 'upper'),
                                effect_text[effect],
                                tempdf.iloc[i, tempdf.columns.get_loc('Max Sequence')])
            pdf.write(4, 'Max tile of %s domain from residues %s to %s (estimated %s%% to %s%% of cells %s): %s' % caption_vals_max)

            pdf.ln(8)

            # rule between domains
            pdf.line(10, pdf.y, 200, pdf.y)
            pdf.ln(4)

# save PDF
# pdf.output('../test_fpdf.pdf', 'F')
pdf.output('../Ludwig_2023_Supplement_CoV_Domain_Details.pdf', 'F')

print('Completed')

Adding information for MERS-CoV E (K9N5R3)
 - adding tiling plot
Adding information for HCoV-OC43 HE (P30215)
 - adding tiling plot
Adding information for HCoV-HKU1-N5 I (Q0ZME2)
 - adding tiling plot
Adding information for SARS-CoV-2 M (P0DTC5)
 - adding tiling plot
Adding information for HCoV-NL63 NS3 (Q6Q1S1)
 - adding tiling plot
Adding information for HCoV-HKU1-N1 NSP10 (P0C6U3|PRO_0000338204)
 - adding tiling plot
Adding information for HCoV-HKU1-N1 NSP10 (P0C6X2|PRO_5000093342)
 - adding tiling plot
Adding information for HCoV-HKU1-N2 NSP10 (P0C6U4|PRO_0000338216)
 - adding tiling plot
Adding information for HCoV-HKU1-N2 NSP10 (P0C6X3|PRO_0000297782)
 - adding tiling plot
Adding information for HCoV-HKU1-N5 NSP10 (P0C6U5|PRO_0000338228)
 - adding tiling plot
Adding information for HCoV-HKU1-N5 NSP10 (P0C6X4|PRO_0000297797)
 - adding tiling plot
Adding information for MERS-CoV NSP10 (K9N638|PRO_0000422463)
 - adding tiling plot
Adding information for MERS-CoV NSP10 (K9N7C7|PRO_00

### END