# In [1]:
import pandas as pd
import re
import pytz

# Load the names of the log files to process (one file name per line).
with open('../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# A usable log line has the shape "[<date>] <user>: <message>".
pattern = r'\[(.*?)\] (.*?): (.*)'
line_matcher = re.compile(pattern)

# Collect one [date, user, message, stream] row per matching line, where
# "stream" is the zero-based position of the source file in file_names.
datalist = []
for stream_index, file in enumerate(file_names):
    with open(f"../data/{file}", 'r', encoding='utf-8') as f:
        for line in f:
            parsed = line_matcher.match(line)
            if parsed is None:
                # Lines that do not fit the expected format are ignored.
                continue
            date, user, message = parsed.groups()
            datalist.append([date, user, message, stream_index])

# After the loop the counter equals the number of files that were read.
stream_count = len(file_names)

# Build the DataFrame and turn the textual dates into pandas timestamps.
data = pd.DataFrame(
    datalist,
    columns=["date", "user", "message","stream"],
)
data['date'] = pd.to_datetime(data['date'])

def convert_utc_to_cet(df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
    """Return a copy of *df* with *date_column* expressed in Europe/Berlin time.

    Timezone-naive timestamps are interpreted as UTC. Timezone-aware
    timestamps keep the instant they denote and are only re-expressed in the
    target zone. A column that is not yet datetime typed (for example
    timestamp strings) is parsed first.

    Args:
        df: Frame containing the timestamp column. It is not modified.
        date_column: Name of the column to convert.

    Returns:
        A new DataFrame whose *date_column* is timezone-aware in
        ``Europe/Berlin`` (CET in winter, CEST in summer).

    Raises:
        KeyError: If *date_column* is not a column of *df*.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # utc=True localizes naive values as UTC and converts aware values to UTC
    # in a single step, independent of which tzinfo implementation (pytz,
    # zoneinfo, datetime.timezone) the column happens to carry.
    utc_times = pd.to_datetime(df[date_column], utc=True)

    # Europe/Berlin carries the DST rules, so CET or CEST is chosen per value.
    df[date_column] = utc_times.dt.tz_convert('Europe/Berlin')

    return df

# Re-express the timestamps in Europe/Berlin time before exporting.
data = convert_utc_to_cet(data)

data_length = len(data)
chunk_size = data_length // 4

# Cut the frame into four consecutive row ranges. The final range has no
# upper bound, so rows left over when the length is not divisible by 4 end
# up in the fourth part.
first_quarter, second_quarter, third_quarter, fourth_quarter = (
    data.iloc[start:stop]
    for start, stop in (
        (0, chunk_size),
        (chunk_size, 2 * chunk_size),
        (2 * chunk_size, 3 * chunk_size),
        (3 * chunk_size, None),
    )
)

# Write each part to its own JSON file: one record per row, ISO-8601 dates.
for quarter_frame, output_name in (
    (first_quarter, "data_part1.json"),
    (second_quarter, "data_part2.json"),
    (third_quarter, "data_part3.json"),
    (fourth_quarter, "data_part4.json"),
):
    quarter_frame.to_json(output_name, orient="records", date_format="iso")
