In [1]:
import re
from collections import Counter
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import seaborn as sns

In [2]:

# Folder that holds the log files named in 'file_list.txt'. Building paths
# with pathlib (instead of a hard-coded "\\") keeps the cell working on
# Windows, Linux and macOS alike.
data_dir = Path("data")

# Read the list of filenames from the configuration file. Blank lines are
# skipped so a trailing newline does not turn into an attempt to open the
# data folder itself.
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = [name.strip() for name in config_file.read().splitlines() if name.strip()]

# Regex pattern to match the data format: "[<date>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'
line_regex = re.compile(pattern)  # compiled once, reused for every line

# Parsed rows: [date, user, message, stream index]
datalist = []

# Each file is one stream; its position in the file list is the stream id.
for stream_id, file in enumerate(file_names):
    full_path = data_dir / file
    with open(full_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            match = line_regex.match(line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_id])

# Number of streams (files) processed.
stream_count = len(file_names)

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])

In [3]:
# Turn the timestamp strings captured from the log lines into pandas datetimes.
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates

In [4]:
def convert_utc_to_cet(df, date_column='date'):
    """Return a copy of ``df`` with ``date_column`` expressed in Europe/Berlin time.

    Timezone-naive timestamps are interpreted as UTC. Timezone-aware
    timestamps (in any zone) are converted directly. The 'Europe/Berlin'
    zone switches between CET and CEST, so daylight saving is handled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime column. It is not modified.
    date_column : str, default 'date'
        Name of the datetime column to convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``date_column`` is timezone-aware in
        'Europe/Berlin'.
    """
    # Work on a copy so the caller's frame is left untouched.
    converted = df.copy()
    timestamps = converted[date_column]

    # Naive timestamps carry no zone; declare them to be UTC first.
    if timestamps.dt.tz is None:
        timestamps = timestamps.dt.tz_localize('UTC')

    # tz_convert works from any aware zone, so no intermediate hop to UTC
    # (and no pytz comparison) is needed for already-aware columns.
    converted[date_column] = timestamps.dt.tz_convert('Europe/Berlin')

    return converted


In [5]:
# Re-express the 'date' column in Berlin local time; the helper hands back a converted copy.
data = convert_utc_to_cet(data, date_column='date')

In [6]:
# Alternate account names mapped to the single name used in this analysis.
# No target name is itself listed as an alias, so one dictionary-based
# replace gives the same result as applying the renames one after another,
# in a single pass over the column.
user_aliases = {
    "Banties1g": "banties_x",
    "banties1g": "banties_x",
    "chili_poe": "chili_con_bacon",
    "CHILI_POE": "chili_con_bacon",
    "Chili_poe": "chili_con_bacon",
    "chili_conbacon": "chili_con_bacon",
    "Wirelesss_": "W1r3lesss",
    "treklul": "trek44_",
    "ttrek_": "trek44_",
    "trek_x": "trek44_",
    "TriplesingleJ": "TripleSingleJames",
    "uwu_cougar": "uuccugr",
    "uuccugr_": "uuccugr",
    "StanIV4_": "stan_iv4",
    "Muuskie2": "Muuskie",
    "nishad_more1311": "nishad13",
    "softarballt": "softarr",
    "softarballtt23": "softarr",
    "lajosbarnabas": "lajoss__",
    "Bonkwiththefunk": "bonk67",
}

# Exact-match replacement; usernames not listed above are left unchanged.
data["user"] = data["user"].replace(user_aliases)


In [7]:
# Collect every distinct username and bucket it under its lowercase spelling,
# so names that differ only in capitalization land in the same bucket.
unique_users = data['user'].unique()
user_variants = defaultdict(set)
for name in unique_users:
    user_variants[name.lower()].add(name)

# Keep only the buckets that hold two or more spellings of the same name
duplicate_users = {
    lowered: spellings
    for lowered, spellings in user_variants.items()
    if len(spellings) >= 2
}

In [8]:
# Point every spelling at the canonical one for its group: the spelling that
# comes first in sorted order (i.e. the minimum of the group).
variant_map = {
    spelling: min(group)
    for group in duplicate_users.values()
    for spelling in group
}


def _canonical_name(name):
    """Return the canonical spelling for ``name``, or ``name`` itself if unmapped."""
    return variant_map.get(name, name)


# Rewrite the 'user' column with the canonical spellings
data['user'] = data['user'].apply(_canonical_name)

In [9]:
# 1. Earliest message timestamp per user, i.e. the first time each user
#    appears in the loaded logs
first_message = data.groupby('user')['date'].min()
user_birthdays = first_message.reset_index()
user_birthdays.columns = ['user', 'first_seen']

# Attach each user's first-seen timestamp to every one of their messages
df = pd.merge(data, user_birthdays, on='user')

In [13]:
# ---------------------------------------------------------
# STEP 1: RESET & CLEAN
# ---------------------------------------------------------
# Thresholds behind the printed verdicts, named so they are easy to tune.
NEW_AUDIENCE_RATIO = 0.5       # share of first-time chatters above which Theory 1 wins
DORMANCY_THRESHOLD_DAYS = 30   # median gap (days) above which returning users count as dormant

# Start fresh from the cleaned 'data' frame so re-running this cell never
# merges 'first_seen' a second time (which would produce _x/_y columns).
df = data.copy()

# Drop the timezone (keeping the local wall-clock time) so the column can be
# compared with the naive Timestamp bounds below without a TypeError.
df['date'] = pd.to_datetime(df['date'])
if df['date'].dt.tz is not None:
    df['date'] = df['date'].dt.tz_localize(None)

# ---------------------------------------------------------
# STEP 2: CALCULATE "FIRST SEEN"
# ---------------------------------------------------------
# Find the earliest date for each user
user_birthdays = df.groupby('user')['date'].min().reset_index()
user_birthdays.columns = ['user', 'first_seen']

# Merge into the main dataframe
df = df.merge(user_birthdays, on='user', how='left')

# ---------------------------------------------------------
# STEP 3: ANALYZE FEB 2026
# ---------------------------------------------------------
# Half-open window [feb_start, feb_end) covering the whole month.
feb_start = pd.Timestamp("2026-02-01")
feb_end = pd.Timestamp("2026-03-01")

# Filter for Feb 2026 streams
feb_df = df[(df['date'] >= feb_start) & (df['date'] < feb_end)].copy()

# Get unique users active in Feb.
# 'first_seen' is constant per user, so dropping duplicate (user, first_seen)
# pairs leaves exactly one row per chatter.
active_users = feb_df[['user', 'first_seen']].drop_duplicates()

# ---------------------------------------------------------
# STEP 4: PROVE THE THEORIES
# ---------------------------------------------------------

# Theory 1: "True New" Users (First seen inside Feb 2026)
new_users = active_users[active_users['first_seen'] >= feb_start]

# Theory 2: "Returning" Users (First seen before Feb 2026)
returning_users = active_users[active_users['first_seen'] < feb_start]

print(f"Total Chatters (Feb 2026): {len(active_users)}")
print(f"Truly New Chatters: {len(new_users)}")
print(f"Returning Chatters: {len(returning_users)}")

# Calculate Ratio
new_ratio = len(new_users) / len(active_users) if len(active_users) > 0 else 0
print(f"Percentage New Users: {new_ratio:.2%}")

if len(active_users) == 0:
    # Without any chatters in the window neither theory is supported; do not
    # let the fallback ratio of 0 masquerade as evidence for Theory 2.
    print("\nRESULT: No chatters found in Feb 2026 - neither theory can be evaluated.")
elif new_ratio > NEW_AUDIENCE_RATIO:
    print("\n✅ RESULT: Theory 1 is CORRECT (Mostly new audience).")
else:
    print("\n✅ RESULT: Theory 2 is CORRECT (Mostly old regulars).")

# ---------------------------------------------------------
# STEP 5: DEEP DIVE ON THEORY 2 (Dormancy)
# ---------------------------------------------------------
if len(returning_users) > 0:
    returning_ids = returning_users['user'].unique()
    
    # Look at history only for these specific users BEFORE Feb 2026
    # Note: We use 'df' here to see their full history
    history = df[(df['user'].isin(returning_ids)) & (df['date'] < feb_start)]
    
    if not history.empty:
        # Last time they spoke before Feb
        last_seen = history.groupby('user')['date'].max()
        
        # Gap in whole days between each user's last pre-Feb message and
        # the start of the month (not their first message inside Feb).
        gap_days = (feb_start - last_seen).dt.days
        median_gap = gap_days.median()
        
        print(f"\nMedian Days Since Last Chat (for Returning Users): {median_gap:.0f} days")
        
        if median_gap > DORMANCY_THRESHOLD_DAYS:
            print("Insight: Old users were 'dormant' for a long time. The new time likely woke them up!")
        else:
            print("Insight: Old users were already active recently. They are just hardcore regulars.")

Total Chatters (Feb 2026): 4617
Truly New Chatters: 1224
Returning Chatters: 3393
Percentage New Users: 26.51%

✅ RESULT: Theory 2 is CORRECT (Mostly old regulars).

Median Days Since Last Chat (for Returning Users): 16 days
Insight: Old users were already active recently. They are just hardcore regulars.
