In [5]:
import pandas as pd
import re
import pytz

# Read the list of log filenames (one per line) from the configuration file.
# Blank lines are skipped so that a spacer or trailing empty line in the list
# does not turn into an attempt to open the data directory itself.
with open('../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = [name for name in config_file.read().splitlines() if name.strip()]

# Regex pattern to match the data format: "[<date>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'
line_matcher = re.compile(pattern)

# Parsed rows, each of the form [date, user, message, stream index]
datalist = []
stream_count = 0
# Iterate over each specified file; the file's position in the list is
# recorded in the "stream" column of every row parsed from it
for file in file_names:
    full_path = f"../data/{file}"
    with open(full_path, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of loading it whole
        for line in f:
            match = line_matcher.match(line)
            # Lines that do not fit the expected format are skipped
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

In [6]:
# Parse the raw timestamp strings captured by the regex into pandas datetimes
parsed_dates = pd.to_datetime(data["date"])
data["date"] = parsed_dates

In [7]:
def convert_utc_to_cet(df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
    """
    Convert timestamps to Central European Time (CET/CEST) with proper DST handling.

    Timezone-naive timestamps are interpreted as UTC. Timezone-aware
    timestamps are converted from whatever zone they already carry, so the
    instant in time they represent is preserved.

    Parameters:
    df (pd.DataFrame): DataFrame containing the date column
    date_column (str): Name of the datetime column to convert

    Returns:
    pd.DataFrame: Copy of ``df`` whose ``date_column`` is expressed in the
        'Europe/Berlin' timezone; the input frame is left unmodified
    """
    # Work on a copy to avoid modifying the caller's frame
    df = df.copy()
    timestamps = df[date_column]

    # Naive values carry no zone information; declare them to be UTC
    if timestamps.dt.tz is None:
        timestamps = timestamps.dt.tz_localize('UTC')

    # tz_convert works from any aware timezone, so no intermediate hop through
    # UTC is needed. Europe/Berlin switches between CET and CEST automatically.
    df[date_column] = timestamps.dt.tz_convert('Europe/Berlin')

    return df


In [8]:
# Rebind `data` to a copy whose 'date' column is expressed in Europe/Berlin time
data = convert_utc_to_cet(df=data, date_column="date")

In [9]:
# Keep only the messages that contain the literal text "ome44".
# regex=False makes this a plain substring test rather than a regex search;
# na=False treats missing messages as non-matching.
filtered_data = data[data["message"].str.contains("ome44", na=False, regex=False)]

# Number of matching messages written by each user
user_counts = filtered_data.groupby("user")["message"].count()

# The five users with the most matching messages
top_users = user_counts.nlargest(5)

print(top_users)

user
elluiti         5503
Typhu25         3190
stan_iv4        1736
ifanbenmezda    1502
ttrek_          1464
Name: message, dtype: int64


In [10]:
# Keep only the messages that contain the literal text "LOL" (case-sensitive).
# regex=False makes this a plain substring test rather than a regex search;
# na=False treats missing messages as non-matching.
filtered_data2 = data[data["message"].str.contains("LOL", na=False, regex=False)]

# Number of matching messages written by each user
user_counts = filtered_data2.groupby("user")["message"].count()

# The five users with the most matching messages
top_users = user_counts.nlargest(5)

print(top_users)

user
elluiti         1746
balintboss      1065
Martin_Gales     936
CrazeE420xd      699
fuzok4           632
Name: message, dtype: int64


In [11]:
# Keep only the messages that contain the literal text "speed1".
# regex=False makes this a plain substring test rather than a regex search;
# na=False treats missing messages as non-matching.
filtered_data4 = data[data["message"].str.contains("speed1", na=False, regex=False)]

# Number of matching messages written by each user.
# NOTE(review): grouping is case-sensitive, so differently-cased spellings of
# a user name are counted as separate users - confirm this is intended.
user_counts = filtered_data4.groupby("user")["message"].count()

# The five users with the most matching messages
top_users = user_counts.nlargest(5)

print(top_users)

user
devilbabymamadrama    323
lajosbarnabas         180
schiken44              90
SchiKen44              88
desireOY               77
Name: message, dtype: int64


In [13]:
# Keep only the messages that contain the literal text "qq".
# regex=False makes this a plain substring test rather than a regex search;
# na=False treats missing messages as non-matching.
filtered_data5 = data[data["message"].str.contains("qq", na=False, regex=False)]

# Number of matching messages written by each user
user_counts = filtered_data5.groupby("user")["message"].count()

# The five users with the most matching messages
top_users = user_counts.nlargest(5)

print(top_users)

user
Ivana_10        829
elluiti         759
stan_iv4        688
ifanbenmezda    487
CrazeE420xd     298
Name: message, dtype: int64
