In [4]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from collections import defaultdict
import numpy as np
import pytz
import re


# Read the list of chat-log filenames (one per line) from the configuration file.
# Blank lines are dropped so that a stray empty line cannot turn into an attempt
# to open the data directory itself.
with open('../../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = [name for name in config_file.read().splitlines() if name.strip()]

# Regex pattern to match the data format: "[<date>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'
line_re = re.compile(pattern)  # compiled once instead of on every line

# Parsed rows: [date, user, message, stream index]
datalist = []
# Each file is treated as one stream, numbered by its position in file_names
for stream_index, file in enumerate(file_names):
    # Forward slash works on Windows as well as POSIX; the previous hard-coded
    # backslash made the path valid on Windows only.
    full_path = "../../data/" + file
    with open(full_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily rather than loading every line into memory first
        for line in f:
            match = line_re.match(line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_index])

# Total number of streams processed (same final value the manual counter had)
stream_count = len(file_names)

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

# Parse the bracketed timestamp text into datetime values
data['date'] = pd.to_datetime(data['date'])

def convert_utc_to_cet(df, date_column='date'):
    """
    Convert timestamps to Central European Time (CET/CEST) with proper DST handling

    Timezone-naive values are interpreted as UTC. Timezone-aware values are
    converted from whatever zone they already carry.

    Parameters:
    df (pd.DataFrame): DataFrame containing the date column (datetime dtype)
    date_column (str): Name of the column containing the timestamps

    Returns:
    pd.DataFrame: Copy of df whose date column is expressed in Europe/Berlin time
    """
    # Work on a copy so the caller's frame is left untouched
    converted = df.copy()
    timestamps = converted[date_column]

    # Naive timestamps carry no zone; declare them to be UTC before converting
    if timestamps.dt.tz is None:
        timestamps = timestamps.dt.tz_localize('UTC')

    # tz_convert accepts any tz-aware source, so no intermediate hop to UTC
    # (and no comparison against a specific UTC tz object) is required.
    # Europe/Berlin switches between CET and CEST automatically.
    converted[date_column] = timestamps.dt.tz_convert('Europe/Berlin')

    return converted

# Re-express every parsed timestamp in Europe/Berlin local time
data = convert_utc_to_cet(data, date_column='date')


In [6]:
def _merge_stream_counts(stream_stats, counts, column):
    """Left-join per-stream `counts` onto `stream_stats`; streams without a value get 0 in `column`."""
    merged = pd.merge(stream_stats, counts, on='stream', how='left')
    merged[column] = merged[column].fillna(0)
    return merged


def process_stream_stats(df):
    """
    Parses chat data to extract subscription and gifting stats per stream.

    Parameters:
    df (pd.DataFrame): Chat rows with at least 'user', 'message' and 'stream' columns

    Returns:
    tuple: (stream_stats, unique_gifter_count)
        stream_stats (pd.DataFrame): one row per stream that has at least one
            retained message, with columns 'stream', 'gift2', 'gifter', 'gift',
            'prime', 'tier1', 'tier2', 'tier3'
        unique_gifter_count (int): number of distinct users, across all streams,
            with a retained message containing ' sub '
    """
    def message_has(frame, text):
        # Case-insensitive match on the message column; missing messages never match
        return frame['message'].str.contains(text, case=False, na=False)

    # 1. Keep only rows whose user name occurs (case-insensitively) inside the
    #    message itself. A single pass over the two columns replaces the
    #    row-wise DataFrame.apply, which builds a Series object for every row.
    name_in_message = np.fromiter(
        (str(user).lower() in str(message).lower()
         for user, message in zip(df['user'], df['message'])),
        dtype=bool,
        count=len(df),
    )
    # copy() avoids SettingWithCopy warnings on later column assignments
    subData = df.loc[name_in_message].copy()

    # 2. Exclude "converted from" messages
    subData = subData[~message_has(subData, ' converted from ')]

    # --- Initialize stream_stats ---
    # Base frame: one row for every stream present in the filtered subset
    stream_stats = pd.DataFrame(subData['stream'].unique(), columns=['stream'])

    # --- Gifted Data Logic ---
    giftedData = subData[message_has(subData, ' sub ')]

    # Distinct gifting users over all streams (returned alongside the table)
    unique_gifter_count = giftedData['user'].nunique()

    # --- Gifter Logic (Batch Gifts) ---
    gifterData = subData[message_has(subData, ' is gifting ')].copy()

    # The row filter above ignores case, so the amount extraction must as well;
    # otherwise a differently-cased "gifting" row would be counted as 0 gifts.
    gift_amount_re = re.compile(r'gifting\s(\d+)', re.IGNORECASE)

    def get_gift_amount(x):
        # Number following "gifting", or 0 when no number is present
        found = gift_amount_re.search(x)
        return int(found.group(1)) if found else 0

    gifterData['gifts'] = gifterData['message'].apply(get_gift_amount)
    gift_per_stream_2 = gifterData.groupby('stream')['gifts'].sum().reset_index(name='gift2')
    stream_stats = _merge_stream_counts(stream_stats, gift_per_stream_2, 'gift2')

    # --- Unique Gifters per Stream ---
    unique_users_per_stream = giftedData.groupby('stream')['user'].nunique().reset_index(name='gifter')
    stream_stats = _merge_stream_counts(stream_stats, unique_users_per_stream, 'gifter')

    # --- Gift Count (Individual) ---
    gift_per_stream = giftedData.groupby('stream')['user'].count().reset_index(name='gift')
    stream_stats = _merge_stream_counts(stream_stats, gift_per_stream, 'gift')

    # Report the larger of the individual-message count and the batch total
    stream_stats['gift'] = np.maximum(stream_stats['gift'], stream_stats['gift2'])

    # --- Prime Logic ---
    primeData = subData[message_has(subData, 'Prime')]
    prime_per_stream = primeData.groupby('stream')['user'].count().reset_index(name='prime')
    stream_stats = _merge_stream_counts(stream_stats, prime_per_stream, 'prime')

    # --- True Sub (Tier) Logic ---
    # Rows that say " subscribed at " and mention neither Prime nor gifting/gifted
    trueSub = subData[~message_has(subData, 'Prime')]
    trueSub = trueSub[~message_has(trueSub, 'gifting')]
    trueSub = trueSub[~message_has(trueSub, 'gifted')]
    trueSub = trueSub[message_has(trueSub, ' subscribed at ')]

    for i in range(1, 4):
        col_name = f'tier{i}'
        tier_data = trueSub[message_has(trueSub, f'Tier {i}')]
        tier_counts = tier_data.groupby('stream')['user'].count().reset_index(name=col_name)
        stream_stats = _merge_stream_counts(stream_stats, tier_counts, col_name)

    return stream_stats, unique_gifter_count

In [8]:
# --- 1. Define the Cleaning Logic ---
# Canonical user name -> alternate spellings that should be folded into it
_alias_groups = {
    "banties_x": ["Banties1g", "banties1g"],
    "chili_con_bacon": ["chili_poe", "CHILI_POE", "chili_conbacon"],
    "W1r3lesss": ["Wirelesss_"],
    "trek44_": ["treklul", "ttrek_", "trek_x"],
    "TripleSingleJames": ["TriplesingleJ"],
    "uwu_cougar": ["uuccugr", "uuccugr_"],
    "stan_iv4": ["StanIV4_"],
    "Muuskie": ["Muuskie2"],
    "nishad13": ["nishad_more1311"],
    "softarr": ["softarballt", "softarballtt23"],
    "lajoss__": ["lajosbarnabas"],
}

# Flattened lookup in the alias -> canonical direction
replacements = {
    alias: canonical
    for canonical, aliases in _alias_groups.items()
    for alias in aliases
}

# --- 2. Create Cleaned Data ---
# New frame with the same rows as `data`; only exact alias matches in the
# 'user' column are rewritten, the message text is left as it was.
clean_data = data.assign(user=data["user"].replace(replacements))

# --- 3. Process Both Datasets ---
# NOTE(review): process_stream_stats keeps only rows whose user name occurs in
# the message text, so a renamed user whose message still carries the old
# spelling presumably drops out of the cleaned counts — confirm when reading
# any negative differences reported below.
print("Processing Original Data...")
stats_orig, _ = process_stream_stats(data)

print("Processing Cleaned Data...")
stats_clean, _ = process_stream_stats(clean_data)

# --- 4. Merge and Compare Tiers/Prime Only ---
cols_to_compare = ['prime', 'tier1', 'tier2', 'tier3']
keep_cols = ['stream'] + cols_to_compare

# Outer join on 'stream' so a stream present on only one side still gets a row;
# its missing counts become 0.
comparison = stats_orig[keep_cols].merge(
    stats_clean[keep_cols],
    on='stream',
    how='outer',
    suffixes=('_orig', '_clean'),
).fillna(0)

# Per-stream delta for every metric (Cleaned - Original)
for col in cols_to_compare:
    comparison[f'{col}_diff'] = comparison[f'{col}_clean'].sub(comparison[f'{col}_orig'])

# --- 5. Display Results ---


def _column_totals(suffix):
    """Sum of each compared metric's '<metric>_<suffix>' column, in cols_to_compare order."""
    return [comparison[f'{c}_{suffix}'].sum() for c in cols_to_compare]


# A. Total Aggregate Counts (Did we lose or gain total subs?)
print("\n=== TOTAL COUNTS COMPARISON ===")
summary = pd.DataFrame({
    'Metric': cols_to_compare,
    'Original Total': _column_totals('orig'),
    'Cleaned Total': _column_totals('clean'),
    'Difference': _column_totals('diff'),
})
print(summary)

# B. Streams with Discrepancies
# A stream is flagged when any of its metric deltas is non-zero
diff_cols = [f'{c}_diff' for c in cols_to_compare]
diff_mask = comparison[diff_cols].ne(0).any(axis=1)

discrepancies = comparison.loc[diff_mask].copy()

if discrepancies.empty:
    print("\n=== NO DATA LOSS DETECTED ===")
    print("The renaming of users did not affect the subscription counts.")
else:
    print(f"\n=== FOUND {len(discrepancies)} STREAMS WITH DATA MISMATCHES ===")
    # Show only the stream id and the per-metric deltas
    display_cols = ['stream'] + diff_cols
    print(discrepancies[display_cols])

Processing Original Data...
Processing Cleaned Data...

=== TOTAL COUNTS COMPARISON ===
  Metric  Original Total  Cleaned Total  Difference
0  prime          5302.0         5282.0       -20.0
1  tier1          4182.0         4134.0       -48.0
2  tier2            42.0           42.0         0.0
3  tier3           243.0          231.0       -12.0

=== FOUND 68 STREAMS WITH DATA MISMATCHES ===
     stream  prime_diff  tier1_diff  tier2_diff  tier3_diff
10       10        -1.0         0.0         0.0         0.0
39       39         0.0        -1.0         0.0         0.0
48       48         0.0        -1.0         0.0         0.0
61       61         0.0        -1.0         0.0         0.0
71       71         0.0        -1.0         0.0         0.0
..      ...         ...         ...         ...         ...
498     498         0.0         0.0         0.0        -1.0
504     504         0.0        -1.0         0.0         0.0
505     505         0.0        -1.0         0.0         0.0
507  