In [19]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from collections import defaultdict
import numpy as np
import pytz
import re

In [20]:

# Load the names of the log files to process (one file name per line)
with open('../../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# A log line is expected to look like "[<date>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'

# Parsed rows are collected here as [date, user, message, stream index]
datalist = []

# Files are numbered in the order they are listed; every row parsed from a
# file is tagged with that file's number
for stream_count, file in enumerate(file_names):
    with open(f"../../data/{file}", 'r', encoding='utf-8') as f:
        for line in f:
            match = re.match(pattern, line)
            if not match:
                # Lines that do not follow the expected format are skipped
                continue
            datalist.append([*match.groups(), stream_count])

# After the loop the counter holds the number of files that were processed
stream_count = len(file_names)

# Turn the parsed rows into a DataFrame
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

In [21]:
# Parse the 'date' column into pandas datetime values
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates

In [22]:
def convert_utc_to_cet(df, date_column='date'):
    """
    Convert UTC timestamps to Central European Time (CET/CEST) with proper DST handling

    Timezone-naive values are interpreted as UTC. Timezone-aware values are
    converted from whatever zone they carry. Columns that are not yet
    datetime-typed (e.g. timestamp strings) are parsed first.

    Parameters:
    df (pd.DataFrame): DataFrame containing the date column
    date_column (str): Name of the column containing UTC timestamps

    Returns:
    pd.DataFrame: A copy of df whose date column is timezone-aware in
        'Europe/Berlin'; the input DataFrame is not modified

    Raises:
    KeyError: If date_column is not a column of df
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df = df.copy()

    # Normalise to timezone-aware UTC in a single step: utc=True localizes
    # naive values as UTC and converts aware values to UTC. This avoids
    # comparing against a specific tzinfo object (pytz.UTC), which is fragile
    # because UTC may also be represented as datetime.timezone.utc.
    utc_times = pd.to_datetime(df[date_column], utc=True)

    # Convert to CET/CEST (Europe/Berlin includes proper DST handling)
    df[date_column] = utc_times.dt.tz_convert('Europe/Berlin')

    return df


In [23]:
# Re-express the 'date' column in Europe/Berlin local time
data = convert_utc_to_cet(data, date_column='date')

In [24]:
# Alternate account names (left) and the single name each should be counted
# under (right). Matching is exact and case-sensitive.
# NOTE(review): 'softarballtt' still appears in later cell output while
# 'softarballt' and 'softarballtt23' are mapped — confirm whether it belongs here.
user_aliases = {
    "Banties1g": "banties_x",
    "banties1g": "banties_x",
    "chili_poe": "chili_con_bacon",
    "CHILI_POE": "chili_con_bacon",
    "Chili_poe": "chili_con_bacon",
    "chili_conbacon": "chili_con_bacon",
    "Wirelesss_": "W1r3lesss",
    "treklul": "trek44_",
    "ttrek_": "trek44_",
    "trek_x": "trek44_",
    "TriplesingleJ": "TripleSingleJames",
    "uwu_cougar": "uuccugr",
    "uuccugr_": "uuccugr",
    "StanIV4_": "stan_iv4",
    "Muuskie2": "Muuskie",
    "nishad_more1311": "nishad13",
    "softarballt": "softarr",
    "softarballtt23": "softarr",
    "lajosbarnabas": "lajoss__",
}

# No target name is itself an alias, so one mapping-based replace gives the
# same result as applying the replacements one after another
data["user"] = data["user"].replace(user_aliases)


In [25]:
# Every distinct spelling of a username that occurs in the data
unique_users = data['user'].unique()

# Bucket the spellings by their lowercase form
user_variants = defaultdict(set)
for spelling in unique_users:
    user_variants[spelling.lower()].add(spelling)

# Keep only the buckets holding two or more spellings, i.e. names that
# differ from each other by capitalization alone
duplicate_users = {
    lowered: spellings
    for lowered, spellings in user_variants.items()
    if len(spellings) >= 2
}

In [26]:
# Map every spelling to the canonical one of its group: the spelling that
# comes first in sort order (the minimum of the group)
variant_map = {
    spelling: min(spellings)
    for spellings in duplicate_users.values()
    for spelling in spellings
}

# Rewrite the 'user' column; names without a mapping are left unchanged
data['user'] = data['user'].apply(lambda name: variant_map.get(name, name))

In [27]:
# Per-user summary:
#   - total message count (needed for the 1000+ condition)
#   - date of the user's last message (to see when they left)
user_stats = data.groupby('user').agg(
    total_messages=('message', 'count'),
    last_seen=('date', 'max')
).reset_index()

# 'Big users' are those with 1000 or more messages overall
big_users = user_stats[user_stats['total_messages'] >= 1000].copy()

# A user counts as 'lost in 2025' when their last message falls in 2025
# ('max' is the absolute last date, so they have not been seen since).
# NOTE(review): users whose last message is in the final month covered by the
# data are also counted here although they may simply still be active —
# confirm whether that month should be excluded.
lost_in_2025 = big_users[big_users['last_seen'].dt.year == 2025].copy()

# Month in which each user was last seen. Periods cannot carry a timezone, and
# calling to_period on timezone-aware values emits a warning, so the timezone
# is dropped explicitly first; tz_localize(None) keeps the local wall-clock
# time, so the resulting months are unchanged.
lost_in_2025['lost_month'] = (
    lost_in_2025['last_seen'].dt.tz_localize(None).dt.to_period('M')
)

# List of lost users for each month
monthly_losses = lost_in_2025.groupby('lost_month')['user'].apply(list)

# --- Display Results ---
print("--- Big Losses (1000+ msg users) by Month in 2025 ---")
for period, users in monthly_losses.sort_index().items():
    print(f"\nMonth: {period}")
    print(f"Count: {len(users)}")
    print(f"Users: {users}")

# Optional: the same information as a DataFrame (count and list per month)
result_df = lost_in_2025.groupby('lost_month').agg(
    lost_user_count=('user', 'count'),
    lost_users_list=('user', list)
)

--- Big Losses (1000+ msg users) by Month in 2025 ---

Month: 2025-01
Count: 3
Users: ['Pyczeg', 'abeKrd', 'astrologer95']

Month: 2025-02
Count: 5
Users: ['Flashet67', 'brainofjfk', 'cukuLuLu41', 'fyredoor4', 'sercanwinchester']

Month: 2025-03
Count: 5
Users: ['adevogado0_0', 'bilaltothefuture', 'flyphoenix777', 'moondog357', 'theprimephilosophy']

Month: 2025-04
Count: 2
Users: ['Abraxas47_', 'winterwolf_Audrey1234']

Month: 2025-05
Count: 3
Users: ['catsspurr', 'rodrigo_20771', 'selcuk_007']

Month: 2025-06
Count: 4
Users: ['KarasuSora7', 'fawzarn004', 'prttymafaka', 'shishoti']

Month: 2025-07
Count: 6
Users: ['K_Noxxx', 'Roonss', 'TripleSingleJames', 'ilahfauzan', 'per8immon', 'shenxir']

Month: 2025-08
Count: 10
Users: ['PurpCodd', 'allgudnamesused12345', 'ertagon_', 'exitsixnine', 'moarmar', 'oxotter', 'tiberiu0s', 'voltzz06', 'yaonbuci', 'ベータ']

Month: 2025-09
Count: 6
Users: ['HansoloSupremo', 'choghatasheleesinxd', 'fcukboii', 'kane_ong', 'man1ac007', 'sisq']

Month: 2025-10

  lost_in_2025['lost_month'] = lost_in_2025['last_seen'].dt.to_period('M')


In [28]:


# Per-user summary:
#   - total message count (needed for the 1000+ condition)
#   - date of the user's first message (the EARLIEST date this time)
user_stats = data.groupby('user').agg(
    total_messages=('message', 'count'),
    first_seen=('date', 'min')
).reset_index()

# 'Big users' are those with 1000 or more messages in total
big_users = user_stats[user_stats['total_messages'] >= 1000].copy()

# A user counts as a 'winner' when their first message falls in 2025
winners_2025 = big_users[big_users['first_seen'].dt.year == 2025].copy()

# Month in which each user was first seen. Periods cannot carry a timezone, and
# calling to_period on timezone-aware values emits a warning, so the timezone
# is dropped explicitly first; tz_localize(None) keeps the local wall-clock
# time, so the resulting months are unchanged.
winners_2025['join_month'] = (
    winners_2025['first_seen'].dt.tz_localize(None).dt.to_period('M')
)

# List of winners for each month
monthly_winners = winners_2025.groupby('join_month')['user'].apply(list)

# --- Display Results ---
print("--- New VIPs (Joined in 2025, now have 1000+ msgs) ---")
for period, users in monthly_winners.sort_index().items():
    print(f"\nMonth: {period}")
    print(f"Count: {len(users)}")
    print(f"Users: {users}")

# Optional: the same information as a DataFrame (count and list per month)
winners_df = winners_2025.groupby('join_month').agg(
    new_vip_count=('user', 'count'),
    new_vip_list=('user', list)
)

--- New VIPs (Joined in 2025, now have 1000+ msgs) ---

Month: 2025-01
Count: 17
Users: ['ACEiCLE', 'Aten369', 'HoneyKick', 'KarasuSora7', 'W1r3lesss', 'Zodiak_L9', 'adevogado0_0', 'cringeuss', 'damm1jp', 'devilbabymamadrama', 'fawzarn004', 'finskbamse', 'kindheadbanger', 'm4estr3', 'theprimephilosophy', 'winterwolf_Audrey1234', 'yeldon64']

Month: 2025-02
Count: 19
Users: ['ASSpaperman', 'Abraxas47_', 'DOROZEA_LOVER', 'K_Noxxx', 'PepegaKun', 'RELlKT', 'S_Face', 'SchiKen44', 'emantheman', 'flyphoenix777', 'fpv_dron', 'fredmckwacz', 'jvparisotto', 'moarmar', 'nizmo_ttn', 'per8immon', 'softarballtt', 'the_great_lord_walden', 'vladislav1997vb']

Month: 2025-03
Count: 19
Users: ['Adevogado__33', 'BLAS1905', 'Der_Stoppi_', 'Ehas__', 'Foolsworn', 'KRIESEAX', 'Randy2D', 'StunnerGR', 'TheOldSeer', 'crxxpxer_', 'hellrazor_0', 'noJokeee1', 'prttymafaka', 'rafa30___', 'rodrigo_20771', 'teavapiti', 'teneightyp1080p', 'tiberiu0s', 'wundo']

Month: 2025-04
Count: 5
Users: ['609__', 'Fein_BV', 'audre

  winners_2025['join_month'] = winners_2025['first_seen'].dt.to_period('M')
