In [12]:
import pandas as pd
import re

# Read the list of filenames from the configuration file
with open('../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# Regex pattern to match the data format
pattern = r'\[(.*?)\] (.*?): (.*)'

# Initialize an empty list to store parsed data
datalist = []
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = f"../data/{file}"
    with open(full_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message,stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])



In [13]:

data.shape

(2275703, 4)

In [14]:
data.shape

(2275703, 4)

In [15]:
data["user"] = data["user"].replace("Banties1g", "banties_x")
data["user"] = data["user"].replace("banties1g", "banties_x")
data["user"] = data["user"].replace("chili_poe", "chili_con_bacon")
data["user"] = data["user"].replace("CHILI_POE", "chili_con_bacon")
data["user"] = data["user"].replace("chili_conbacon", "chili_con_bacon")
data["user"] = data["user"].replace("Wirelesss_", "W1r3lesss")
data["user"] = data["user"].replace("treklul", "trek44_")
data["user"] = data["user"].replace("ttrek_", "trek44_")
data["user"] = data["user"].replace("TriplesingleJ", "TripleSingleJames")
data["user"] = data["user"].replace("uuccugr", "uwu_cougar")
data["user"] = data["user"].replace("uuccugr", "uuccugr_")                                  
data["user"] = data["user"].replace("StanIV4_", "stan_iv4")

In [16]:
from collections import defaultdict

# Get all unique usernames
unique_users = data['user'].unique()

# Create a mapping from lowercase username to all variants

user_variants = defaultdict(set)
for user in unique_users:
    user_variants[user.lower()].add(user)

# Find usernames with different capitalization
duplicate_users = {k: v for k, v in user_variants.items() if len(v) > 1}

In [17]:
# Create a mapping from all variants to the canonical (sorted first) variant
variant_map = {}
for variants in duplicate_users.values():
    sorted_variants = sorted(variants)
    canonical = sorted_variants[0]
    for v in variants:
        variant_map[v] = canonical

# Replace usernames in 'user' column
data['user'] = data['user'].apply(lambda u: variant_map.get(u, u))

In [18]:
# Count the number of messages per user
message_counts = data.groupby("user")["message"].count()

# Filter users with 25 or more messages
users_with_25_or_more = message_counts[message_counts >= 25].index

# Filter the original DataFrame to keep only these users
data = data[data["user"].isin(users_with_25_or_more)]

In [19]:
data.shape

(1980156, 4)

In [20]:
from collections import defaultdict

# Get all unique usernames
unique_users = data['user'].unique()

# Create a mapping from lowercase username to all variants

user_variants = defaultdict(set)
for user in unique_users:
    user_variants[user.lower()].add(user)

# Find usernames with different capitalization
duplicate_users = {k: v for k, v in user_variants.items() if len(v) > 1}

In [21]:
# Create a mapping from all variants to the canonical (sorted first) variant
variant_map = {}
for variants in duplicate_users.values():
    sorted_variants = sorted(variants)
    canonical = sorted_variants[0]
    for v in variants:
        variant_map[v] = canonical

# Replace usernames in 'user' column
data['user'] = data['user'].apply(lambda u: variant_map.get(u, u))

In [22]:
# Convert date to datetime format
data["date"] = pd.to_datetime(data["date"])



In [23]:
# Truncate datetime to just the day (removing time)
data["day"] = data["date"].dt.date  # Extract only the date part



In [24]:
# Group by 'day' and 'user' and calculate the message count per day per user
data["message_count"] = 1  # Assign 1 for each message to count them
daily_counts = data.groupby(["day", "user"])["message_count"].count().reset_index()



In [25]:
# Pivot the table to create a user-wise table for each day
pivot_table = daily_counts.pivot(index="day", columns="user", values="message_count").fillna(0)


In [26]:

# Add a cumulative sum for each user across the days
cumulative_pivot = pivot_table.cumsum()


In [27]:

# Print the result
print(cumulative_pivot)

user        0000000emirburak0320  00skysea00  00yopop  010justwatch  \
day                                                                   
2024-05-01                   0.0         0.0      0.0           0.0   
2024-05-02                   0.0         0.0      0.0           0.0   
2024-05-03                   0.0         0.0      0.0           0.0   
2024-05-04                   0.0         0.0      0.0           0.0   
2024-05-05                   0.0         0.0      0.0           0.0   
...                          ...         ...      ...           ...   
2025-08-10                  49.0        26.0     55.0          44.0   
2025-08-11                  49.0        26.0     55.0          44.0   
2025-08-12                  49.0        26.0     55.0          44.0   
2025-08-13                  49.0        26.0     55.0          44.0   
2025-08-14                  49.0        26.0     55.0          44.0   

user        01gomesss  06RoB  07moki  09maunomato  0_0andromeda  0arcanjo_  

In [28]:
pivot_data_cleaned_transposed = cumulative_pivot.T
pivot_data_cleaned_transposed = cumulative_pivot.T.reset_index()

In [29]:
pivot_data_cleaned_transposed.head(5)
pivot_data_cleaned_transposed.tail(5)

day,user,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,...,2025-08-05,2025-08-06,2025-08-07,2025-08-08,2025-08-09,2025-08-10,2025-08-11,2025-08-12,2025-08-13,2025-08-14
7528,餅藍,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0
7529,기분조앙,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0
7530,쌍베님사랑합니다,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
7531,안톤958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
7532,엘레레레ㅔ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0


In [30]:
# --- Step 1 & 2: Calculate rank and identify top 15 users ---

# Calculate rank for each date (column) in descending order.
# The user with the highest cumulative sum gets rank 1.
ranked_df = pivot_data_cleaned_transposed.rank(axis=0, ascending=False, method='min')

# Identify users who were in the top 15 on at least one date.
# This creates a boolean Series where True means the user was in the top 15 at least once.
users_in_top_15_at_least_once = (ranked_df <= 15).any(axis=1)

# Get the list of users (their index labels) who meet the criteria.
users_to_keep = users_in_top_15_at_least_once[users_in_top_15_at_least_once].index

# --- Step 3: Filter the DataFrame ---

# Create a new DataFrame containing only the users who were in the top 15 at least once.
filtered_users_df = pivot_data_cleaned_transposed.loc[users_to_keep]

print("Original DataFrame shape:", pivot_data_cleaned_transposed.shape)
print("Filtered DataFrame shape:", filtered_users_df.shape)

Original DataFrame shape: (7533, 395)
Filtered DataFrame shape: (64, 395)


In [31]:
filtered_users_df.head()

day,user,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,...,2025-08-05,2025-08-06,2025-08-07,2025-08-08,2025-08-09,2025-08-10,2025-08-11,2025-08-12,2025-08-13,2025-08-14
33,1206paul_,70.0,152.0,240.0,247.0,340.0,367.0,371.0,391.0,441.0,...,22730.0,22738.0,22770.0,22805.0,22812.0,22813.0,22829.0,22830.0,22835.0,22839.0
55,1SKELTON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0,169.0,...,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0
503,CrazeE420xd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30338.0,30410.0,30498.0,30651.0,30772.0,30772.0,30778.0,30868.0,30927.0,30964.0
618,Dmoz1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,203.0,...,790.0,790.0,790.0,790.0,790.0,790.0,790.0,790.0,790.0,790.0
964,Ice_Saiyan,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,...,498.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0


In [32]:
filtered_users_df.to_excel('your_excel_file.xlsx', sheet_name='Pivot Table')

In [33]:
data.shape

(1980156, 6)