# ATLAS Samples

## Import modules

In [None]:
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from HtautauRegression.dataset import H5Dataset
from HtautauRegression.helpers import plot_mass, find_indices, find_sample_number

## Load samples

Samples are available for both lephad and hadhad decays.

In [None]:
# Decay channel to load; it selects the sub-directory used below.
region = "HadHad"

# Directory containing the .h5 sample files.
# Alternative (relative) location: dname = f"ProjectSamples/{region}"
dname = f"/bundle/data/ATLAS/gwilliam/ProjectSamples/h5/ATLAS/{region}"

# Sample files that are combined into a single dataset, in this order.
samples = [
    f"{dname}/hadhad_V5.h5",
    f"{dname}/ttbar_nonallhad_V5.h5",
    f"{dname}/zjets_V5.h5",
    f"{dname}/ttbar_dilep_V5.h5",
]

# Build the dataset with "Mtautau" as the target quantity.
# NOTE(review): an extra `size = 25000` argument was previously tried here;
# presumably it limits the number of events read -- confirm against H5Dataset.
data = H5Dataset(samples, target_name="Mtautau")

### Explore dataset

List of input feature names

In [None]:
# Show the names of the input features. The same list is used further down
# (via .index()) to locate a feature's column in X by name.
print(data.feature_names)

Input data

In [None]:
# Retrieve the input data from the dataset and report its dimensions.
# It is indexed as X[:, i] when plotting, i.e. one column per feature.
X = data.X()
print(X.shape)
# Bare expression as the last line of the cell: the notebook displays it.
X

In [None]:
# Column index of the "Tau1_Pt" feature, looked up by name.
itau1pt = data.feature_names.index("Tau1_Pt")

# 25 evenly spaced bin edges between 0 and 2000.
pt_bins = np.linspace(0, 2000, 25)

plt.figure()
# Unfilled step outline, normalised to unit area (hence "Arbitrary Units").
plt.hist(
    X[:, itau1pt],
    bins=pt_bins,
    histtype="step",
    fill=None,
    density=True,
)
# Log y-axis so that sparsely populated bins remain visible.
plt.yscale("log")
plt.title("Distribution of first input feature")
plt.xlabel(r"$p_T (\tau_1)$ [GeV]")
plt.ylabel("Arbitrary Units")
plt.show()

Output target mass for all samples

In [None]:
# Retrieve the target values (the dataset was built with
# target_name = "Mtautau") and report their dimensions.
y = data.y()
print (y.shape)
# Bare expression as the last line of the cell: the notebook displays it.
y

In [None]:
# 60 evenly spaced bin edges between 0 and 4000.
mtt_bins = np.linspace(0, 4000, 60)

plt.figure()
# Unfilled step outline, normalised to unit area (hence "Arbitrary Units").
plt.hist(
    y,
    bins=mtt_bins,
    histtype="step",
    fill=None,
    density=True,
)
# Log y-axis so that sparsely populated bins remain visible.
plt.yscale("log")
plt.title("Distribution of output masses for all samples")
plt.xlabel(r"$m_{\tau\tau}$ [GeV]")
plt.ylabel("Arbitrary Units")
plt.show()

Auxiliary data, including a label identifying the individual samples and the ATLAS MMC and CMS SVfit masses to compare against.

In [None]:
# Fetch the auxiliary-column labels once and reuse them for the print and
# for every lookup, instead of calling data.aux_labels() repeatedly.
# NOTE(review): assumes aux_labels() returns the same labels on each call.
aux_labels = data.aux_labels()
print(aux_labels)

# Column positions, within the auxiliary data, of the quantities used below.
immc = aux_labels.index("MMMC")
immcstat = aux_labels.index("MMCStatus")
isample = aux_labels.index("sample")

# Auxiliary data itself; indexed later as aux[rows, column].
aux = data.aux()
# Bare expression as the last line of the cell: the notebook displays it.
aux

Plot the ATLAS and CMS masses versus the truth for one mass sample

In [None]:
# Indices of the events belonging to the "hadhad" sample.
ihh = find_indices(samples, aux, data.aux_labels(), "hadhad")

# 75 evenly spaced bin edges between 50 and 200.
mmc_bins = np.linspace(50, 200, 75)

# Compare the "MMMC" auxiliary column with the true mass for those events.
# NOTE(review): the first positional argument is left as None; presumably it
# is the slot for a model prediction -- confirm against plot_mass.
plot_mass(
    None,
    aux[ihh, immc],
    mtrue=y[ihh],
    title="Mass (train, 125 GeV)",
    bins=mmc_bins,
    true_scale=0.1,
)

### Get individual data events
Data for the first event

In [None]:
# Integer indexing of the dataset yields an (inputs, target) pair for one event.
X0, y0 = data[0]
# Print the inputs and the target on separate lines.
print(X0, y0, sep="\n")