In [14]:
import json
import os
import re
from collections import defaultdict

import pandas as pd
import pytz


In [15]:
# --- 1. Load Data ---

# Load the list of log file names, one name per line.
# 'file_list.txt' is resolved relative to the current working directory.
try:
    with open('file_list.txt', 'r', encoding='utf-8') as config_file:
        # Strip surrounding whitespace and skip blank lines so that an empty
        # line in the list is not later treated as a file name.
        file_names = [
            name.strip()
            for name in config_file.read().splitlines()
            if name.strip()
        ]
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is a convenience for the
    # interactive interpreter, whereas an exception stops execution at this
    # cell with a traceback that names the problem.
    raise FileNotFoundError(
        "Error: 'file_list.txt' not found. Please ensure it exists."
    ) from err


In [16]:

# Regex pattern to match the log entries: "[<timestamp>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'
log_line_re = re.compile(pattern)  # compiled once instead of on every line

datalist = []
stream_count = 0
# Log files are read from a 'data' subdirectory of the working directory.
# os.path.join("data", "") keeps the trailing separator of the original value
# but uses the current platform's separator instead of a hard-coded "\\".
data_dir = os.path.join("data", "")

for file in file_names:
    full_path = os.path.join(data_dir, file)
    try:
        # Buffer this file's rows and commit them only after the whole file
        # was read, so a read error part-way through cannot leave partial rows
        # behind under a stream id that the next file would then reuse.
        file_rows = []
        with open(full_path, 'r', encoding='utf-8') as f:
            for line in f:
                match = log_line_re.match(line)
                if match:
                    date_str, user, message = match.groups()
                    # Keep the raw timestamp string; it is parsed in bulk later
                    file_rows.append([date_str, user, message, stream_count])
    except FileNotFoundError:
        print(f"Warning: File '{full_path}' not found. Skipping.")
    except Exception as e:
        print(f"Error reading file '{full_path}': {e}")
    else:
        # Only successfully read files consume a stream id
        datalist.extend(file_rows)
        stream_count += 1



In [17]:

# --- 2. Create DataFrame and Initial Processing ---

# Stop here when nothing was parsed; every later cell needs at least one row.
# An exception (rather than exit(), which is meant for the interactive
# interpreter) halts execution at this cell with a clear message.
if not datalist:
    raise ValueError("Error: No data was loaded.")

# One row per parsed log line; 'stream' is the id assigned to the row's source file
data = pd.DataFrame(datalist, columns=["date_str", "user", "message", "stream"])


In [18]:

# Parse the raw timestamp strings into datetimes; values that cannot be
# parsed become NaT (errors='coerce') instead of raising
parsed_dates = pd.to_datetime(data['date_str'], errors='coerce')
data['date'] = parsed_dates


In [19]:
def convert_utc_to_cet(df, date_column='date'):
    """Return a copy of ``df`` with ``date_column`` expressed in Europe/Berlin time.

    Timezone-naive timestamps are interpreted as UTC. Timezone-aware
    timestamps are converted from whatever zone they already carry.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a datetime column.
    date_column : str, default 'date'
        Name of the datetime column to convert.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``date_column`` is tz-aware in 'Europe/Berlin'
        (CET/CEST, including daylight-saving transitions).

    Raises
    ------
    AttributeError
        If ``date_column`` is not datetime-like, because the ``.dt`` accessor
        is then unavailable.
    """
    converted = df.copy()
    timestamps = converted[date_column]

    # Naive timestamps carry no zone; declare them to be UTC first
    if timestamps.dt.tz is None:
        timestamps = timestamps.dt.tz_localize('UTC')

    # tz_convert works from any tz-aware series, so there is no need to
    # compare timezone objects or to pass through UTC explicitly
    converted[date_column] = timestamps.dt.tz_convert('Europe/Berlin')
    return converted


In [20]:
# Re-express the parsed timestamps in Europe/Berlin time (works on a copy of the frame)
data = convert_utc_to_cet(data, date_column='date')

In [21]:

# Remove rows whose timestamp could not be parsed (NaT) and report how many were lost
original_rows = len(data)
data = data.dropna(subset=['date'])
dropped_rows = original_rows - len(data)
if dropped_rows > 0:
    print(f"Warning: Dropped {dropped_rows} rows due to invalid date format.")


In [22]:


# --- 3. User Replacements / Cleaning ---
# Map known alternate account names onto a single canonical name.
# Series.replace with a dict swaps exact matches and leaves every other
# username untouched.
replace_map = {
    "Banties1g": "banties_x",
    "banties1g": "banties_x",
    "chili_poe": "chili_con_bacon",
    "CHILI_POE": "chili_con_bacon",
    "chili_conbacon": "chili_con_bacon",
    "Wirelesss_": "W1r3lesss",
    "treklul": "trek44_",
    "ttrek_": "trek44_",
    "TriplesingleJ": "TripleSingleJames",
    # NOTE(review): the original literal listed "uuccugr" twice, first mapped
    # to "uwu_cougar" and then to "uuccugr_". Python keeps only the last value
    # of a duplicated key, so "uuccugr_" is what was actually applied; the dead
    # first entry has been removed. Confirm which target was intended.
    "uuccugr": "uuccugr_",
}
data["user"] = data["user"].replace(replace_map)


In [23]:
# Find usernames that occur with more than one capitalization.
# (defaultdict is imported in the notebook's import cell.)

# Bucket every distinct username under its lowercase form
unique_users = data['user'].unique()
user_variants = defaultdict(set)
for user in unique_users:
    user_variants[user.lower()].add(user)

# Keep only the buckets that contain more than one spelling
duplicate_users = {
    lowered: spellings
    for lowered, spellings in user_variants.items()
    if len(spellings) > 1
}

# Display the results
for lowered, spellings in duplicate_users.items():
    print(f"{lowered}: {sorted(spellings)}")

tilch: ['Tilch', 'tilch']
gyllenc: ['Gyllenc', 'gyllenc']
roomunder: ['ROOMUNDER', 'roomunder']
notbat: ['NOTBAT', 'notbat']
nurm1_0_0: ['NURM1_0_0', 'NuRM1_0_0']
iamconvinc3d: ['Iamconvinc3d', 'iamconvinc3d']
wisez__: ['wiseZ__', 'wisez__']
r4rrixd: ['R4RRIXD', 'r4rrixd']
obungakung: ['OBUNGAKUNG', 'obungakung']
chimphailpk: ['ChimpHailpk', 'chimphailpk']
xfrell1: ['Xfrell1', 'xfrell1']
lopilop1: ['Lopilop1', 'lopilop1']
trodainus: ['Trodainus', 'trodainus']
kavnera: ['KAVNERA', 'kavnera']
noxiex4charity12: ['Noxiex4Charity12', 'noxiex4charity12']
macbless: ['MacBless', 'macbless']
anos87w: ['Anos87w', 'anos87w']
vladyslav_h: ['VLADYSLAV_H', 'vladyslav_h']
ars_rugal: ['ARS_RUGAL', 'ars_rugal']
artemioni4: ['Artemioni4', 'artemioni4']
torst3ncs: ['tORST3NCS', 'tORST3Ncs']
adel5244: ['Adel5244', 'adel5244']
f1zzcsgo: ['F1ZZcsgo', 'f1zzcsgo']
bubrekht: ['BuBRekht', 'BuBrekht', 'bubrekht']
martimmendez: ['MartimMendeZ', 'martimmendez']
xvenumz_97: ['xVeNuMz_97', 'xvenumz_97']
alsobres2: [

In [24]:
# Build a lookup from every spelling to the canonical spelling of its group,
# i.e. the one that sorts first (plain string ordering, so uppercase letters
# come before lowercase ones)
variant_map = {}
for spellings in duplicate_users.values():
    canonical = min(spellings)
    variant_map.update(dict.fromkeys(spellings, canonical))

# Rewrite the 'user' column; names without an entry in the lookup stay as they are
data['user'] = data['user'].map(lambda name: variant_map.get(name, name))

In [26]:
# Convert 'date' to string for JSON serialization
data["date"] = data["date"].astype(str)

# Drop 'date_str' if it exists
if 'date_str' in data.columns:
    data = data.drop(columns=['date_str'])

# Reset the index so the row position (0, 1, 2, ...) serves as the message ID
data = data.reset_index(drop=True)

# Create messages dict WITHOUT 'stream', keyed by message ID
messages = data.drop(columns=['stream']).to_dict(orient="index")

# Stream summary: first and last message ID per stream.
# A single grouped min/max pass replaces filtering the whole frame once per
# stream; sort=False keeps the streams in order of first appearance.
message_ids = data.index.to_series()
id_bounds = message_ids.groupby(data['stream'], sort=False).agg(['min', 'max'])
stream_summary = {
    # int(...) turns numpy integers into plain ints that json can serialize
    int(stream_id): {
        "first_message_id": int(first_id),
        "last_message_id": int(last_id),
    }
    for stream_id, first_id, last_id in id_bounds.itertuples(name=None)
}

# Combine into final JSON structure
final_json = {
    "messages": messages,
    "streams": stream_summary
}

# Save as JSON (explicit encoding so the result does not depend on the platform default)
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(final_json, f, indent=2)

print("JSON file created with message data and stream summary.")

JSON file created with message data and stream summary.
