In [1]:
%matplotlib notebook
import bz2, os, time
from os import path
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.dates as mdate
from scipy.optimize import curve_fit
from scipy.ndimage import gaussian_filter

In [3]:
NBINS    = 32  # 32 bines por pulso
ADCBITS  = 14  # 14 bits
TIME_SEP = 8e-9   # 8 ns por punto
BASELINE = 0   # linea de base
channels = [1,2]

data_dir = '/eos/user/d/dcazarra/datos_panchito/' # Directorio donde se encuentran los datos
#data_dir = '/eos/home-i00/d/dmerizal/SWAN_projects/LAGO/Datos/'
#data_dir = '/eos/user/d/dmerizal/SWAN_projects/LAGO/Datos/' 
plot_dir = 'plots' # Directorio donde se guardaran las graficas

## Extracting and saving data as .csv files to reduce the space in disk that original .dat files occupy. Then we can have enough memory to analyse data in python

In [5]:
#dates and hours of the corresponding datasets 
dates = ['2024_05_01','2024_05_02', '2024_05_03', '2024_05_04', '2024_05_05','2024_05_06', '2024_05_07', '2024_05_08', '2024_05_09', '2024_05_10', '2024_05_11', '2024_05_12', '2024_05_13']
hours = ['00','01','02','03', '04','05','06', '07','08','09','10','11','12','13','14','15','16', '17', '18','19','20','21','22','23']

# Initialize lists to store the data
for date in dates:
    temperature = []
    pressure = []
    counts = []
    t_counter = []
    print(f'Analysing data from {date}...')
    # Iterate through each hour and read the data
    for hour in hours:
        filename = f't600_nogps_{date}_{hour}h00.dat'
        file_path = os.path.join(data_dir, filename)

        # Check if the file exists
        if not os.path.exists(file_path):
            print(f"File {filename} not found.")
            continue

        # Load data from the file
        try:
            with open(file_path, 'r') as file:
                for line in file:
                    # Check for temperature lines
                    if line.startswith('# x t'):
                        temperature.append(float(line.strip().split()[3]))
                    elif line.startswith('# x p'):
                    # Check for pressure lines
                        pressure.append(float(line.strip().split()[3]))
                    # Check for counter (internal trigger clock)
                    elif line.startswith('# t'):
                        t_counter.append(int(line.strip().split()[3]))
                    # Check for counter lines (detection counter)
                    elif line.startswith('# c'):
                        counts.append(int(line.strip().split()[2]))

        except ValueError as e:
            print(f"Error processing file {filename}: {e}")
    
    df_trigger = pd.DataFrame({
        'tcounter': t_counter, #number of trigger cyles
        'counts': counts
    })

    ## Display the DataFrame
    df_trigger['counts'] = df_trigger['counts'].diff().fillna(0)
    df_trigger['tdiff'] = df_trigger['tcounter'].diff().fillna(0)

    print(df_trigger)
    
    
    # Define the clock frequency and maximum tcycles per second
    clock_frequency = 125e6  # 125 MHz
    tcycles_per_second = clock_frequency

    # Identify wrap-around points (points where the clock counter is restarted)
    wrap_around_points = df_trigger['tdiff']  < 0

    count_trues = sum(wrap_around_points)
    print(f'wrap around points {count_trues}')  # Output should be 3600*N-1  (N is the number of hours)

    # Adjust for wrap-around points
    cumulative_wrap_around = wrap_around_points.cumsum()

    df_trigger['tcycles_adj'] = df_trigger['tdiff'] + cumulative_wrap_around * tcycles_per_second

    # Calculate time in seconds
    df_trigger['time'] = np.floor(df_trigger['tcycles_adj'] / tcycles_per_second).astype(int)+1


    # Group the 'counts' based on 'time'
    grouped = df_trigger.groupby('time')['counts'].apply(list)

    # Convert the result to a list of lists (subarrays)
    results = np.array(grouped.tolist(), dtype=object)
    del df_trigger
    # Each subarray has the counts on each second. In this case, there should be roughly more than 230 hits in each one (mean value is 350 per second)
    # If less than 230 it could mean the electronics made a mistake, therefore that second will not be taken into account 
    filtered_results = [sum(subarray ) for subarray in results if sum(subarray) > 230 ]

    # Convert the filtered result to a NumPy array with dtype=object
    counts = np.array(filtered_results, dtype=float).flatten()
    pressure = np.array(pressure, dtype=float).flatten()
    temperature = np.array(temperature, dtype=float).flatten()
    
    # Making sure arrays have same lenght 
    # It is common the counts to have more elements than pressure, e.g. one hour of data taking should have 3600 measurements of pressure
    min_length = min(len(counts), len(pressure))
    counts = counts[:min_length]
    print(f'Length is {min_length}')
     
    df_raw = pd.DataFrame({
        'counts': counts, #number of detections
        'pressure': pressure,
        'temperature': temperature
    })
    ### save as a .csv 
    df_raw.to_csv(f'raw_data_{date}.csv', index=False)
    print(f'Saved as raw_data_{date}.csv')

Analysing data from 2024_05_01...
           tcounter  counts      tdiff
0            572656     0.0        0.0
1           1404534     1.0   831878.0
2           1509143     1.0   104609.0
3           1808749     1.0   299606.0
4           1860276     1.0    51527.0
...             ...     ...        ...
30136080  121962205     1.0    40009.0
30136081  122475722     1.0   513517.0
30136082  122818120     1.0   342398.0
30136083  123469879     1.0   651759.0
30136084  124826557     1.0  1356678.0

[30136085 rows x 3 columns]
wrap around points 86405
Length is 86400
Saved as raw_data_2024_05_01.csv


In [7]:
# Loading and check reprocessed datasets
# List of dates corresponding to CSV filenames
dates = ['2024_05_01', '2024_05_02','2024_05_03', '2024_05_04','2024_05_05', '2024_05_06', '2024_05_07', '2024_05_08', '2024_05_09', '2024_05_10', '2024_05_11', '2024_05_12', '2024_05_13']

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through the dates and read each CSV file
for date in dates:
    filename = f'raw_data_{date}.csv'  # Construct the filename dynamically
    df = pd.read_csv(filename)  # Read the CSV file
    dataframes.append(df)       # Append the DataFrame to the list

# Concatenate all the DataFrames into one
final_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(final_df)

         counts  pressure  temperature
0         390.0    766.82         22.9
1         324.0    766.73         22.9
2         363.0    766.76         23.0
3         330.0    766.79         22.9
4         365.0    766.82         22.9
...         ...       ...          ...
1123195   379.0    766.92         22.5
1123196   326.0    767.05         22.5
1123197   377.0    767.01         22.5
1123198   368.0    766.94         22.5
1123199   354.0    766.94         22.5

[1123200 rows x 3 columns]
