In [4]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import pytz


# Read the list of filenames from the configuration file
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# Regex pattern to match the data format
pattern = r'\[(.*?)\] (.*?): (.*)'

# Initialize an empty list to store parsed data
datalist = []
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = "data\\"+file
    with open(full_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, user, message = match.groups()
                #user = user.lower() #try with this test
                datalist.append([date, user, message,stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])

#date to date format and sorting with it

data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by=['user', 'date'])


In [5]:
def convert_utc_to_cet(df, date_column='date'):
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Ensure timestamps are UTC aware
    if df[date_column].dt.tz is None:
        df[date_column] = df[date_column].dt.tz_localize('UTC')
    elif df[date_column].dt.tz != pytz.UTC:
        df[date_column] = df[date_column].dt.tz_convert('UTC')
    
    # Convert to CET/CEST (Europe/Berlin includes proper DST handling)
    df[date_column] = df[date_column].dt.tz_convert('Europe/Berlin')
    
    return df


In [6]:
data = convert_utc_to_cet(data)

In [7]:
data["user"] = data["user"].replace("Banties1g", "banties_x")
data["user"] = data["user"].replace("banties1g", "banties_x")
data["user"] = data["user"].replace("chili_poe", "chili_con_bacon")
data["user"] = data["user"].replace("CHILI_POE", "chili_con_bacon")
data["user"] = data["user"].replace("chili_conbacon", "chili_con_bacon")
data["user"] = data["user"].replace("Wirelesss_", "W1r3lesss")
data["user"] = data["user"].replace("treklul", "trek44_")
data["user"] = data["user"].replace("ttrek_", "trek44_")
data["user"] = data["user"].replace("TriplesingleJ", "TripleSingleJames")
data["user"] = data["user"].replace("uuccugr", "uwu_cougar")
data["user"] = data["user"].replace("uuccugr", "uuccugr_")


In [8]:
from collections import defaultdict

# Get all unique usernames
unique_users = data['user'].unique()

# Create a mapping from lowercase username to all variants

user_variants = defaultdict(set)
for user in unique_users:
    user_variants[user.lower()].add(user)

# Find usernames with different capitalization
duplicate_users = {k: v for k, v in user_variants.items() if len(v) > 1}

# Display the results
for lower, variants in duplicate_users.items():
    print(f"{lower}: {sorted(variants)}")

1flusha: ['1Flusha', '1flusha']
1mayas_ggz: ['1MAYAS_GGZ', '1MAYAS_GGz']
1tapelite_yugen: ['1TapElite_YUGEN', '1TapElite_YugeN']
31lamemonster: ['31LameMonster', '31lamemonster']
4widzewlodz: ['4Widzewlodz', '4widzewlodz']
4lreal: ['4lReal', '4lreal']
7atc: ['7ATC', '7atc']
7miyoshi7: ['7MIYOSHI7', '7miyoshi7']
ahmadov_g: ['AHMADOV_G', 'ahmadov_g']
al0ushii: ['AL0USHII', 'al0ushii']
anxlwave: ['ANXLwave', 'anxlwave']
ars_rugal: ['ARS_RUGAL', 'ars_rugal']
asyousayy: ['ASYOUSAYY', 'asyousayy']
abdulbasit3454: ['Abdulbasit3454', 'abdulbasit3454']
abdullahoneui: ['Abdullahoneui', 'abdullahoneui']
adam0f: ['Adam0f', 'adam0f']
adel5244: ['Adel5244', 'adel5244']
aili_tv: ['Aili_TV', 'Aili_tv', 'aili_tv']
aireea: ['AireeA', 'aireea']
alsobres2: ['AlSobreS2', 'alsobreS2']
alphaslanjr: ['AlphaslanJr', 'alphaslanjr']
amirhaid: ['AmirHaid', 'amirhaid']
anbugandhipoovai: ['AnbuGandhiPoovai', 'anbugandhipoovai']
anos87w: ['Anos87w', 'anos87w']
antidotdota_two: ['AntidotDota_Two', 'antidotdota_two']


In [9]:
# Create a mapping from all variants to the canonical (sorted first) variant
variant_map = {}
for variants in duplicate_users.values():
    sorted_variants = sorted(variants)
    canonical = sorted_variants[0]
    for v in variants:
        variant_map[v] = canonical

# Replace usernames in 'user' column
data['user'] = data['user'].apply(lambda u: variant_map.get(u, u))

In [11]:
# create month column
data['month'] = data['date'].dt.to_period('M')

# first message date per user
first_message = data.groupby('user')['date'].min().reset_index()
first_message['month'] = first_message['date'].dt.to_period('M')

# total messages per user
total_messages = data.groupby('user').size().reset_index(name='total_messages')

# merge to know first month and total messages for each user
new_users = pd.merge(first_message, total_messages, on='user', how='left')

# list to store each month summary
months_summary = []

for month, month_df in data.groupby('month'):
    unique_users = month_df['user'].nunique()
    
    # find new users for this month
    new_users_this_month = new_users[new_users['month'] == month]
    num_new_users = new_users_this_month.shape[0]
    
    # new users who had >1000 messages overall
    big_new_users_df = new_users_this_month[new_users_this_month['total_messages'] > 1000]
    num_big_new_users = big_new_users_df.shape[0]
    
    # get their usernames as a list
    big_new_users_list = big_new_users_df['user'].tolist()
    
    months_summary.append({
        'month': str(month),
        'unique_users': unique_users,
        'new_users': num_new_users,
        'new_users_over_1000': num_big_new_users,
        'big_new_users_list': big_new_users_list
    })

# convert to dataframe
monthsdata = pd.DataFrame(months_summary)

# preview
print(monthsdata)

  data['month'] = data['date'].dt.to_period('M')
  first_message['month'] = first_message['date'].dt.to_period('M')


      month  unique_users  new_users  new_users_over_1000  \
0   2024-05          2266       2266                   57   
1   2024-06          2441       1704                    7   
2   2024-07          3571       2474                   10   
3   2024-08          3862       2472                   16   
4   2024-09          4937       3011                   12   
5   2024-10          5529       3306                   16   
6   2024-11          6596       4003                    8   
7   2024-12          8201       4857                   11   
8   2025-01         10360       6909                   17   
9   2025-02         19263      14453                   16   
10  2025-03         15812       9072                   14   
11  2025-04         10774       5233                    2   
12  2025-05          9897       4580                    3   
13  2025-06          9249       4453                    4   
14  2025-07          5226       2079                    0   

                       

In [12]:
for i, row in monthsdata.iterrows():
    print(f"Month: {row['month']}")
    print("Big new users over 1000 messages:")
    print(row['big_new_users_list'])
    print("------")


Month: 2024-05
Big new users over 1000 messages:
['1206paul_', '1SKELTON', '1rabo', '55Dash', 'Aloddin', 'Aluminiumminimumimmunity', 'AnAnonymousGifter', 'Cuhhsin', 'HansoloSupremo', 'HippoBippo_', 'Hueqi', 'IvanOnMyOwn', 'Ivana_10', 'LX212', 'Martin_Gales', 'Nightbot', 'Ravenbtw', 'Risc__V', 'Roonss', 'SeeMeNoMore_', 'StreamElements', 'Trizze', 'Typhu25', 'Wii_Gii', 'Zeololz', 'abeKrd', 'amirmasoud_2018', 'andrea4V', 'balintboss', 'banties_x', 'basbinga', 'bilaltothefuture', 'blakbilt', 'blind_dood', 'chili_con_bacon', 'converx', 'cookie_tf', 'darxtorn', 'dima_wallhacks', 'enseje', 'gkey', 'guiseppe_mundi', 'hanepirukas1', 'icep1cke', 'kepplerrino', 'klimzaa', 'lajosbarnabas', 'nzexy', 'oJov', 'peroni_nasturo_azuro', 'roxa0', 'sekasekas', 'thebigdogjay', 'whiteknightnipples', 'ylb_cat_soul22', 'ベータ', '爪口长工']
------
Month: 2024-06
Big new users over 1000 messages:
['Footstep_', 'Greksallad', 'Mantiss94', 'TripleSingleJames', 'fuzok4', 'sisiliann', 'stanny781']
------
Month: 2024-07
Big