In [1]:
# import tensorflow as tf
# devices = tf.config.list_physical_devices()
# print(devices)
# tf.debugging.set_log_device_placement(True)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Load the scintillation and Cherenkov hit tables (sample names taken from the file paths).
# index_col=False stops pandas from using the first CSV column as the row index.
data_mc_Sci = pd.read_csv("data/ScintilationR.csv", sep=",", index_col=False)
data_mc_Che = pd.read_csv("data/CherenkovR.csv", sep=",", index_col=False)

In [3]:
# Preview the raw scintillation table; for a long frame pandas shows only the first and last rows.
data_mc_Sci


Unnamed: 0,Event,numPE,cerenkov,scintillator,x,y,z,hitTime,charge
0,2,47,0,1083,-1070.369586,304.893467,656.813827,200247.091915,2.501288
1,2,47,0,1083,-1070.369586,304.893467,656.813827,200172.125130,1.453304
2,2,47,0,1083,-1070.369586,304.893467,656.813827,200173.533723,0.794884
3,2,47,0,1083,-1070.369586,304.893467,656.813827,200189.683552,1.960613
4,2,47,0,1083,-1070.369586,304.893467,656.813827,200352.268492,1.193643
...,...,...,...,...,...,...,...,...,...
40316883,999999,54,0,1118,-848.560424,333.787248,918.337628,35723.752572,0.629647
40316884,999999,54,0,1118,-848.560424,333.787248,918.337628,35879.133076,0.105423
40316885,999999,54,0,1118,-848.560424,333.787248,918.337628,35712.082710,0.751730
40316886,999999,54,0,1118,-848.560424,333.787248,918.337628,35722.018461,0.063745


In [4]:
def fix_the_column_name(df):
    """Return the leading nine columns of ``df`` relabelled with the notebook's standard names.

    Columns are picked by position, not by their current labels, so the
    input is expected to hold the nine fields in the order listed below.
    The labels of ``df`` itself are not changed.
    """
    standard_names = [
        "Event", 'numPE', 'cherenkov', 'scintilator',
        'X', 'Y', 'Z', 'hitTime', 'charge',
    ]
    # Positional slice: exactly one column per standard name (nine in total).
    leading = df.iloc[:, :len(standard_names)]
    leading.columns = standard_names
    return leading

In [5]:
def data_prepare(df, min_numPE=4, columns=None):
    """Filter out low-numPE rows and build per-event features.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Event``, ``numPE``, ``X``, ``Y``, ``Z``,
        ``hitTime`` and ``charge`` (the names produced by
        ``fix_the_column_name``).
    min_numPE : int, optional
        Only rows with ``numPE`` strictly greater than this value are kept.
        Defaults to 4.
    columns : list of str, optional
        Columns of the returned frame, in order. Defaults to
        ``['X', 'Y', 'Z', 'numPE', 'hitTime', 'sum_charge']``. Besides the
        input columns, these derived names may be requested:

        * ``log_hitTime`` - natural log of ``hitTime``
        * ``sum_charge``  - sum of ``charge`` over the rows of the same Event
        * ``avg_hitTime`` - mean of ``log_hitTime`` over the same Event
        * ``avg_numPE``, ``avg_X``, ``avg_Y``, ``avg_Z`` - per-Event means

    Returns
    -------
    pandas.DataFrame
        A new frame holding the requested columns for the kept rows, indexed
        like those rows. ``df`` is not modified.
    """
    if columns is None:
        columns = ['X', 'Y', 'Z', 'numPE', 'hitTime', 'sum_charge']

    # Only keep the data if numPE > min_numPE.
    kept = df[df['numPE'] > min_numPE]
    event_id = kept['Event']

    def log_hit_time():
        return np.log(kept['hitTime'])

    def per_event(values, how):
        # Broadcast the per-Event aggregate back onto every row of that Event.
        return values.groupby(event_id).transform(how)

    # Derived features are built lazily: only the ones named in `columns` are
    # computed, which matters on tables with tens of millions of rows.
    derived = {
        'log_hitTime': log_hit_time,
        'sum_charge': lambda: per_event(kept['charge'], 'sum'),
        'avg_hitTime': lambda: per_event(log_hit_time(), 'mean'),
        'avg_numPE': lambda: per_event(kept['numPE'], 'mean'),
        'avg_X': lambda: per_event(kept['X'], 'mean'),
        'avg_Y': lambda: per_event(kept['Y'], 'mean'),
        'avg_Z': lambda: per_event(kept['Z'], 'mean'),
    }

    # Assemble a fresh frame instead of writing new columns into the filtered
    # slice; column assignment on a slice is what raises SettingWithCopyWarning.
    features = {
        name: derived[name]() if name in derived else kept[name]
        for name in columns
    }
    return pd.DataFrame(features, index=kept.index)
    

In [6]:
# Keep the first nine columns of each raw table and give them the notebook's standard names.
fixed_Che =  fix_the_column_name(data_mc_Che)
fixed_Sci = fix_the_column_name(data_mc_Sci)


In [8]:
# Check the renamed columns on the first five scintillation rows.
fixed_Sci.head(5)

Unnamed: 0,Event,numPE,cherenkov,scintilator,X,Y,Z,hitTime,charge
0,2,47,0,1083,-1070.369586,304.893467,656.813827,200247.091915,2.501288
1,2,47,0,1083,-1070.369586,304.893467,656.813827,200172.12513,1.453304
2,2,47,0,1083,-1070.369586,304.893467,656.813827,200173.533723,0.794884
3,2,47,0,1083,-1070.369586,304.893467,656.813827,200189.683552,1.960613
4,2,47,0,1083,-1070.369586,304.893467,656.813827,200352.268492,1.193643


In [9]:
# Drop rows with numPE <= 4 and attach the per-event summed charge to every remaining row.
df_mc_Sci = data_prepare(fixed_Sci)
df_mc_Che = data_prepare(fixed_Che)

In [10]:
# Preview the prepared scintillation rows.
df_mc_Sci.head()

Unnamed: 0,X,Y,Z,numPE,hitTime,sum_charge
0,-1070.369586,304.893467,656.813827,47,200247.091915,40.624912
1,-1070.369586,304.893467,656.813827,47,200172.12513,40.624912
2,-1070.369586,304.893467,656.813827,47,200173.533723,40.624912
3,-1070.369586,304.893467,656.813827,47,200189.683552,40.624912
4,-1070.369586,304.893467,656.813827,47,200352.268492,40.624912


In [11]:
# Keep one row per distinct (X, Y, Z, numPE, sum_charge) combination. drop_duplicates keeps the
# first occurrence, so the surviving hitTime is the one from the first row of each combination.
# NOTE(review): presumably meant as "one row per event" - the Event column is no longer present
# here, so two events with identical values in these five columns would be merged; confirm.
df_Sci = df_mc_Sci.drop_duplicates(subset=['X','Y','Z','numPE','sum_charge'])
df_Che = df_mc_Che.drop_duplicates(subset=['X','Y','Z','numPE','sum_charge'])

In [12]:
# Preview the de-duplicated Cherenkov rows.
df_Che.head()

Unnamed: 0,X,Y,Z,numPE,hitTime,sum_charge
0,1045.034103,-647.371092,75.006625,7,525029.012788,12.610506
7,-1116.76022,995.255628,900.412821,13,132657.881561,9.061204
20,-98.681725,-830.029833,-1028.904557,20,304687.850352,19.106456
40,869.949584,106.380615,521.148411,12,164840.708827,8.037652
52,510.675472,-572.991312,221.277887,10,67551.754558,7.331653


In [13]:
def data_tag(df, isCherenkov=True, min_numPE=4):
    """Return the rows of ``df`` above the numPE threshold with a class label attached.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``numPE`` column.
    isCherenkov : bool, optional
        Truthy -> every returned row gets ``label`` 1 (Cherenkov);
        falsy -> ``label`` 0 (scintillator). Defaults to True.
    min_numPE : int, optional
        Only rows with ``numPE`` strictly greater than this value are kept.
        Defaults to 4.

    Returns
    -------
    pandas.DataFrame
        A new frame with the kept rows and an integer ``label`` column.
        ``df`` is not modified.
    """
    # Label 1 for Cherenkov, 0 for scintillator.
    label = 1 if isCherenkov else 0
    # assign() returns a new frame, so the label is never written into a
    # filtered slice of df (the cause of SettingWithCopyWarning).
    return df[df['numPE'] > min_numPE].assign(label=label)

In [14]:
# Tag the de-duplicated Cherenkov rows with label 1.
Cherenkov = data_tag(df=df_Che,isCherenkov=True)

In [15]:
# Tag the de-duplicated scintillator rows with label 0.
Scintilator= data_tag(df=df_Sci, isCherenkov=False)

In [16]:
# Stack the two labelled tables row-wise, scintillator rows first. The original row
# indices are carried over as-is, so index values may repeat across the two halves.
df = pd.concat([Scintilator, Cherenkov], axis=0)

In [17]:
from sklearn.utils import shuffle

# Fixed seed: without it every run writes the rows in a different order, so the
# training file (and anything split from it by position) is not reproducible.
SHUFFLE_SEED = 42
df_shuffle = shuffle(df, random_state=SHUFFLE_SEED)
# index=False: the concatenated index is not meaningful after shuffling, so it is not written.
df_shuffle.to_csv('data/data_for_training_R.csv', index=False)

In [19]:
# Read the training file back from disk and preview the first 20 rows.
data = pd.read_csv("data/data_for_training_R.csv")
data.head(20)

Unnamed: 0,X,Y,Z,numPE,hitTime,sum_charge,label
0,182.366782,-777.074274,-1381.321834,9,57550.822203,8.605796,1
1,106.734165,-870.837915,-916.024496,58,262079.481191,59.040325,0
2,477.246193,-941.066391,-663.665604,28,171013.972746,27.888703,1
3,478.263912,1169.28622,-741.678509,50,114589.806122,41.507286,1
4,-1123.25324,681.844907,55.041671,31,11416.4089,28.457054,0
5,-452.250455,200.052127,393.400191,68,70857.122957,62.613457,0
6,253.158665,830.509675,1135.729862,15,33555.233358,13.646443,1
7,-117.744857,818.764837,-663.299288,5,831200.165957,3.211913,1
8,320.064522,232.498429,-70.244455,17,16113.796286,18.393169,1
9,565.778147,-1122.979873,-263.710241,14,122798.176791,9.477048,1


In [81]:
# Summarise column dtypes and non-null counts of the training table.
# NOTE(review): the recorded output below lists avg_X / avg_Y / avg_Z / avg_numPE / log_hitTime,
# which are not the columns of the file written and read back by the cells above
# (X, Y, Z, numPE, hitTime, sum_charge, label), and the execution count jumps to 81.
# The output looks stale - re-run with Restart Kernel -> Run All before relying on it.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394968 entries, 0 to 394967
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   avg_X        394968 non-null  float64
 1   avg_Y        394968 non-null  float64
 2   avg_Z        394968 non-null  float64
 3   avg_numPE    394968 non-null  float64
 4   log_hitTime  394968 non-null  float64
 5   sum_charge   394968 non-null  float64
 6   label        394968 non-null  int64  
dtypes: float64(6), int64(1)
memory usage: 21.1 MB
