# Save the MC b-tag Efficiency, $\epsilon\left(p_T,\ \eta\right)$, for each Subjet and Store It in .csv Files to Import into `TTbarResProcessor*`

## 1.) Define a function that makes 2D histograms of efficiency and import the coffea output(s)

In [None]:
import os
from coffea import hist
from coffea import util
import numpy as np
import itertools
import mplhep as hep
from hist.intervals import ratio_uncertainty
import matplotlib.colors as colors
import pandas as pd

In [None]:
def plotratio2d(numerator, denominator, ax=None, cmap='Blues', cbar=True):
    '''Plot the bin-by-bin ratio numerator/denominator of two 2D histograms.

    Parameters
    ----------
    numerator, denominator :
        Histogram objects providing ``axes()`` (each axis exposing ``size``)
        and ``to_hist()``. Only the first two axes are compared.
    ax :
        Forwarded unchanged to ``hep.hist2dplot``.
    cmap :
        Colour map name forwarded to ``hep.hist2dplot``.
    cbar :
        Forwarded to ``hep.hist2dplot``.

    Returns
    -------
    Whatever ``hep.hist2dplot`` returns for the ratio histogram; the colour
    scale is normalised to the range 0 to 1.

    Raises
    ------
    Exception
        If the first two axes of the inputs do not have matching bin counts.
    '''
    num_axes = numerator.axes()
    den_axes = denominator.axes()

    # ``size`` is reduced by 3 on every axis; the original note on this line
    # says this removes the overflow bins.
    num_bins = (num_axes[0].size - 3, num_axes[1].size - 3)
    den_bins = (den_axes[0].size - 3, den_axes[1].size - 3)

    # Guard clause: dividing histograms with different binning is meaningless.
    if num_bins != den_bins:
        raise Exception('Numerator and Denominator axes are different sizes; Cannot perform division.')

    top = numerator.to_hist()
    bottom = denominator.to_hist()

    # Divide the numerator histogram by the plain array of denominator values.
    ratio = top / bottom.values()

    # Per-bin uncertainty labels (via ratio_uncertainty) are not produced here.
    return hep.hist2dplot(ratio, ax=ax, cmap=cmap, norm=colors.Normalize(0.,1.), cbar=cbar)

### Define a function to make directories (if the directory does not already exist)

In [None]:
def mkdir_p(mypath):
    '''Creates a directory. equivalent to using mkdir -p on the command line

    Missing parent directories are created as well. Calling this on a
    directory that already exists is a no-op.

    Raises
    ------
    OSError
        If the directory cannot be created, including the case where
        ``mypath`` already exists but is not a directory.
    '''
    from os import makedirs

    # exist_ok=True tolerates an existing directory but still raises when the
    # path exists as a non-directory, matching the previous EEXIST/isdir check.
    makedirs(mypath, exist_ok=True)

In [None]:
def DoesDirectoryExist(mypath): #extra precaution (Probably overkill...)
    '''Create ``mypath`` with mkdir_p unless something already exists at that path.

    Returns None in both cases. Note that the check is ``path.exists``, so an
    existing regular file at ``mypath`` also counts as "exists".
    '''
    from os import path

    if not path.exists(mypath):
        mkdir_p(mypath)

In [None]:
# Working directory at the time this cell runs; the output folders for the
# efficiency tables are built relative to it.
maindirectory = os.getcwd()

### Upload coffea files
Store the coffea output files in a dictionary of `Datasets` to loop through

In [None]:
# Folder (relative to the current working directory) holding the unweighted
# coffea outputs.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept here because other cells may reference it.
dir = 'CoffeaOutputs/UnweightedOutputs/'

# Load one coffea output file per MC sample.
QCD_unweighted = util.load(f'{dir}TTbarResCoffea_QCD_unweighted_output.coffea')
TTbar_unweighted = util.load(f'{dir}TTbarResCoffea_TTbar_unweighted_output.coffea')
RSGluon1000_unweighted = util.load(f'{dir}TTbarResCoffea_RSGluon1000_unweighted_output.coffea')

In [None]:
# Sample name -> loaded coffea output. The keys are used further down both to
# select the 'dataset' axis of each histogram and as the prefix of the .csv
# file names.
Datasets = dict(
    QCD=QCD_unweighted,
    TTbar=TTbar_unweighted,
    RSGluon1000=RSGluon1000_unweighted,
)

In [None]:
# When True, the histograms whose keys end in '_largerbins' are read and the
# output file names get a '_large_bins' suffix; otherwise no suffix is used.
PlotWithLargeBins = True

### Make the list of subjets to loop through

In [None]:
# Subjet labels; each one is spliced into the histogram keys and the output
# file names in the loops below.
list_of_subjets = 's01 s02 s11 s12'.split()

## 2.) Loop through each subjet, put numerator and denominator in `plotratio2d` and make and save pandas dataframe as .csv file
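Each file name is built from the MC dataset name (`XX_`) and the subjet in question (`sXX_`), followed by `Xtageff` and ending in `.csv` (with `_large_bins` inserted before `.csv` when `PlotWithLargeBins` is `True`).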

## b-tag lists

In [None]:
# Output folder for the b-tag efficiency tables (created if missing).
SaveDirectory = maindirectory + '/FlavorTagEfficiencies_NoAnacat/btagEfficiencyTables/'
DoesDirectoryExist(SaveDirectory)

# binwidth is appended to the histogram keys, filename_bw to the .csv names.
if PlotWithLargeBins:
    binwidth = '_largerbins'
    filename_bw = '_large_bins'
else:
    binwidth = ''
    filename_bw = ''

for dataset, output in Datasets.items():
    for subjet in list_of_subjets:

        # Select this dataset's numerator/denominator histograms for the subjet.
        b_eff_numerator = output['b_eff_numerator_' + subjet + binwidth].integrate('dataset', dataset)
        b_eff_denominator = output['b_eff_denominator_' + subjet + binwidth].integrate('dataset', dataset)

        b_eff = plotratio2d(b_eff_numerator, b_eff_denominator) #ColormeshArtists object

        # Values stored on the first plot artist; this is what goes into the
        # pandas dataframe. ravel() keeps the data one-dimensional (one value
        # per row) even if the artist hands back a 2D array.
        # NOTE(review): the flattened values are paired in order with the
        # (pt, eta) product index below -- confirm the artist's array is laid
        # out pt-major (the plotting call may transpose it).
        b_eff_data = np.ma.getdata(b_eff[0].get_array()).ravel()

        # ---- Define pt and eta bins from the numerator hist object ---- #
        pt_bins = list(b_eff_numerator.identifiers('subjetpt'))
        eta_bins = list(b_eff_numerator.identifiers('subjeteta'))

        # ---- Define the Efficiency List as a Pandas Dataframe ---- #
        EfficiencyList = pd.DataFrame(
                            b_eff_data,
                            pd.MultiIndex.from_product( [pt_bins, eta_bins], names=['pt', 'eta'] ),
                            ['efficiency']
                        )

        # ---- Save the Efficiency List as .csv ---- #
        filename = dataset + '_' + subjet + '_btageff' + filename_bw + '.csv'
        EfficiencyList.to_csv(SaveDirectory+filename)
        print('\nSaved ' + filename)

        # ---- Temporary Test ---- #
#         if subjet == 's01':
#             print(EfficiencyList)

## c-tag lists

In [None]:
# Output folder for the c-tag efficiency tables (created if missing).
SaveDirectory = maindirectory + '/FlavorTagEfficiencies_NoAnacat/ctagEfficiencyTables/'
DoesDirectoryExist(SaveDirectory)

# binwidth is appended to the histogram keys, filename_bw to the .csv names.
if PlotWithLargeBins:
    binwidth = '_largerbins'
    filename_bw = '_large_bins'
else:
    binwidth = ''
    filename_bw = ''

for dataset, output in Datasets.items():
    for subjet in list_of_subjets:

        # Select this dataset's numerator/denominator histograms for the subjet.
        c_eff_numerator = output['c_eff_numerator_' + subjet + binwidth].integrate('dataset', dataset)
        c_eff_denominator = output['c_eff_denominator_' + subjet + binwidth].integrate('dataset', dataset)

        c_eff = plotratio2d(c_eff_numerator, c_eff_denominator, cmap='Greens') #ColormeshArtists object

        # Values stored on the first plot artist; this is what goes into the
        # pandas dataframe. ravel() keeps the data one-dimensional (one value
        # per row) even if the artist hands back a 2D array.
        # NOTE(review): the flattened values are paired in order with the
        # (pt, eta) product index below -- confirm the artist's array is laid
        # out pt-major (the plotting call may transpose it).
        c_eff_data = np.ma.getdata(c_eff[0].get_array()).ravel()

        # ---- Define pt and eta bins from the numerator hist object ---- #
        # Read the bins from this cell's own c-tag histogram so the cell does
        # not depend on a variable left over from another cell.
        pt_bins = list(c_eff_numerator.identifiers('subjetpt'))
        eta_bins = list(c_eff_numerator.identifiers('subjeteta'))

        # ---- Define the Efficiency List as a Pandas Dataframe ---- #
        EfficiencyList = pd.DataFrame(
                            c_eff_data,
                            pd.MultiIndex.from_product( [pt_bins, eta_bins], names=['pt', 'eta'] ),
                            ['efficiency']
                        )

        # ---- Save the Efficiency List as .csv ---- #
        filename = dataset + '_' + subjet + '_ctageff' + filename_bw + '.csv'
        EfficiencyList.to_csv(SaveDirectory+filename)
        print('\nSaved ' + filename)

        # ---- Temporary Test ---- #
#         if subjet == 's01':
#             print(EfficiencyList)

## udsg-tag lists

In [None]:
# Output folder for the udsg-tag efficiency tables (created if missing).
SaveDirectory = maindirectory + '/FlavorTagEfficiencies_NoAnacat/udsgtagEfficiencyTables/'
DoesDirectoryExist(SaveDirectory)

# binwidth is appended to the histogram keys, filename_bw to the .csv names.
if PlotWithLargeBins:
    binwidth = '_largerbins'
    filename_bw = '_large_bins'
else:
    binwidth = ''
    filename_bw = ''

for dataset, output in Datasets.items():
    for subjet in list_of_subjets:

        # Select this dataset's numerator/denominator histograms for the subjet.
        udsg_eff_numerator = output['udsg_eff_numerator_' + subjet + binwidth].integrate('dataset', dataset)
        udsg_eff_denominator = output['udsg_eff_denominator_' + subjet + binwidth].integrate('dataset', dataset)

        udsg_eff = plotratio2d(udsg_eff_numerator, udsg_eff_denominator, cmap='Reds') #ColormeshArtists object

        # Values stored on the first plot artist; this is what goes into the
        # pandas dataframe. ravel() keeps the data one-dimensional (one value
        # per row) even if the artist hands back a 2D array.
        # NOTE(review): the flattened values are paired in order with the
        # (pt, eta) product index below -- confirm the artist's array is laid
        # out pt-major (the plotting call may transpose it).
        udsg_eff_data = np.ma.getdata(udsg_eff[0].get_array()).ravel()

        # ---- Define pt and eta bins from the numerator hist object ---- #
        # Read the bins from this cell's own udsg histogram so the cell does
        # not depend on a variable left over from another cell.
        pt_bins = list(udsg_eff_numerator.identifiers('subjetpt'))
        eta_bins = list(udsg_eff_numerator.identifiers('subjeteta'))

        # ---- Define the Efficiency List as a Pandas Dataframe ---- #
        # The table must hold the udsg efficiencies computed just above.
        EfficiencyList = pd.DataFrame(
                            udsg_eff_data,
                            pd.MultiIndex.from_product( [pt_bins, eta_bins], names=['pt', 'eta'] ),
                            ['efficiency']
                        )

        # ---- Save the Efficiency List as .csv ---- #
        filename = dataset + '_' + subjet + '_udsgtageff' + filename_bw + '.csv'
        EfficiencyList.to_csv(SaveDirectory+filename)
        print('\nSaved ' + filename)

        # ---- Temporary Test ---- #
#         if subjet == 's01':
#             print(EfficiencyList)