In [6]:
import pandas as pd
import re

In [7]:
# Load the list of data file names (one per line) from the configuration file.
with open('../file_list.txt', 'r', encoding='utf-8') as config_file:
    # Skip blank / whitespace-only lines: they would otherwise resolve to the
    # bare "../data/" directory and make open() fail in the loop below.
    file_names = [name for name in config_file.read().splitlines() if name.strip()]

# Each usable line has the shape "[<date>] <user>: <message>".
pattern = r'\[(.*?)\] (.*?): (.*)'
line_regex = re.compile(pattern)  # compiled once, reused for every line

# Parsed rows, each one [date, user, message, stream].
datalist = []
# Zero-based position of the current file in file_names; stored in the
# "stream" column. After the loop it equals the number of files read.
stream_count = 0
for file in file_names:
    full_path = f"../data/{file}"
    with open(full_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading all lines into a list first.
        for line in f:
            match = line_regex.match(line)
            if match:  # lines that do not fit the pattern are skipped
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_count])
    stream_count += 1

# One row per successfully parsed line.
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])


In [8]:
# Alias -> replacement user name. All substitutions are applied in one pass;
# no replacement value is itself a key, so the order of entries is irrelevant.
user_aliases = {
    "Banties1g": "banties_x",
    "banties1g": "banties_x",
    # "fyodor_m_d1821": "fyredoor4",  # disabled
    "chili_poe": "chili_con_bacon",
    "chili_conbacon": "chili_con_bacon",
    "Wirelesss_": "W1r3lesss",
    "treklul": "trek44_",
    "ttrek_": "trek44_",
    "TriplesingleJ": "TripleSingleJames",
    "uuccugr": "uwu_cougar",
}
data["user"] = data["user"].replace(user_aliases)

In [9]:
from collections import defaultdict

# Every distinct spelling that occurs in the 'user' column.
unique_users = data['user'].unique()

# Group the spellings under their lowercase form:
#   lowercase name -> set of spellings seen for it.
user_variants = defaultdict(set)
for spelling in unique_users:
    lowered = spelling.lower()
    user_variants[lowered].add(spelling)

# Keep only the lowercase names that were written in two or more ways.
duplicate_users = {
    lowered: spellings
    for lowered, spellings in user_variants.items()
    if len(spellings) >= 2
}

In [10]:
# Map each spelling in a duplicate group to that group's canonical spelling:
# the one that comes first under Python's default string ordering.
variant_map = {
    spelling: min(spellings)
    for spellings in duplicate_users.values()
    for spelling in spellings
}

# Rewrite the 'user' column; names without a mapping entry stay unchanged.
data['user'] = data['user'].map(lambda name: variant_map.get(name, name))

In [11]:
# Parse the raw date strings, then keep only the calendar day
# (datetime.date objects) so every message of one day shares the same key.
data['date'] = pd.to_datetime(data['date']).dt.date

# User x day activity matrix: 1 if the user has at least one message on that
# day, 0 otherwise. crosstab counts the rows per (user, date) pair with
# vectorized group-by code and fills missing pairs with 0; clipping the counts
# at 1 gives the same 0/1 table as aggregating with a Python `lambda x: 1`,
# without invoking a Python function once for every (user, date) group.
pivot_table = pd.crosstab(data['user'], data['date']).clip(upper=1)

# Convert the datetime.date column labels into 'YYYY-MM-DD' strings.
pivot_table.columns = pivot_table.columns.astype(str)

print(pivot_table)

date                  2024-05-01  2024-05-02  2024-05-03  2024-05-04  \
user                                                                   
0000000emirburak0320           0           0           0           0   
00000021_torresx               0           0           0           0   
00000debe                      0           0           0           0   
00001joel                      0           0           0           0   
0000rade                       0           0           0           0   
...                          ...         ...         ...         ...   
피그킹0814                        0           0           0           0   
하유의튼튼한간기능꼬깔콘                   0           0           0           0   
하이드로른                          0           0           0           0   
핫펨보이                           0           0           0           0   
흑우의도인                          0           0           0           0   

date                  2024-05-05  2024-05-06  2024-05-07  2024-

In [12]:
# Running total along each row (left to right across the date columns).
cumulative_sum = pivot_table.cumsum(axis='columns')

In [13]:
# Display the row-wise running totals computed from pivot_table.
cumulative_sum

date,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,2024-05-11,...,2025-05-10,2025-05-11,2025-05-13,2025-05-14,2025-05-15,2025-05-16,2025-05-17,2025-05-18,2025-05-19,2025-05-20
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000000emirburak0320,0,0,0,0,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
00000021_torresx,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
00000debe,0,0,0,0,0,0,0,0,0,0,...,9,9,9,9,9,9,9,10,10,10
00001joel,0,0,0,0,1,2,2,2,2,2,...,3,3,3,3,3,3,3,3,3,3
0000rade,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
피그킹0814,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
하유의튼튼한간기능꼬깔콘,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
하이드로른,0,0,0,0,0,0,0,0,0,0,...,3,3,3,3,3,3,3,3,3,3
핫펨보이,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [14]:
# The last column holds each row's final running total.
final_totals = cumulative_sum.iloc[:, -1]
# Keep only the rows whose final total is at least 5.
filtered_cumulative_sum = cumulative_sum.loc[final_totals >= 5]
filtered_cumulative_sum

date,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,2024-05-11,...,2025-05-10,2025-05-11,2025-05-13,2025-05-14,2025-05-15,2025-05-16,2025-05-17,2025-05-18,2025-05-19,2025-05-20
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000000emirburak0320,0,0,0,0,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
00000debe,0,0,0,0,0,0,0,0,0,0,...,9,9,9,9,9,9,9,10,10,10
00yopop,0,0,0,0,0,0,0,0,0,0,...,8,8,8,8,8,8,8,8,8,8
010justwatch,0,0,0,0,0,0,0,0,0,0,...,5,5,6,7,8,8,8,8,8,8
01793937273,0,0,0,0,0,0,0,0,0,0,...,4,4,4,5,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
餅藍,0,0,0,0,0,0,0,0,0,0,...,16,16,16,16,16,16,16,16,16,16
马克斯,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
덕충2,0,0,0,0,0,0,0,0,0,0,...,15,15,15,15,15,15,15,15,15,15
안톤958,0,0,0,0,0,0,0,0,0,0,...,8,8,8,8,8,8,8,8,8,8


In [17]:
# Print a summary of the filtered frame (index, columns, dtypes, memory usage).
filtered_cumulative_sum.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9011 entries, 0000000emirburak0320 to 엘레레레ㅔ
Columns: 323 entries, 2024-05-01 to 2025-05-20
dtypes: int64(323)
memory usage: 22.3+ MB


In [18]:
# --- Rank users per date, then keep anyone who ever reached the top 15 ---

# Rank within each date column, largest cumulative sum first (rank 1).
# method='min' gives tied users the lowest rank of their group.
ranked_df = filtered_cumulative_sum.rank(method='min', ascending=False, axis=0)

# Boolean Series indexed by user: True if the user's rank was 15 or better
# in at least one date column.
top_15_flags = ranked_df <= 15
users_in_top_15_at_least_once = top_15_flags.any(axis=1)

# Index labels of the users flagged True.
users_to_keep = users_in_top_15_at_least_once.index[
    users_in_top_15_at_least_once.to_numpy()
]

# --- Restrict the cumulative-sum frame to those users ---
filtered_users_df = filtered_cumulative_sum.loc[users_to_keep]

print("Original DataFrame shape:", filtered_cumulative_sum.shape)
print("Filtered DataFrame shape:", filtered_users_df.shape)

Original DataFrame shape: (9011, 323)
Filtered DataFrame shape: (132, 323)


In [19]:
# Preview the first rows of the frame restricted to users_to_keep.
filtered_users_df.head()

date,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,2024-05-11,...,2025-05-10,2025-05-11,2025-05-13,2025-05-14,2025-05-15,2025-05-16,2025-05-17,2025-05-18,2025-05-19,2025-05-20
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1206paul_,1,2,3,4,5,6,7,8,9,10,...,300,301,302,303,304,305,306,307,308,309
ANewToaster,1,2,3,4,5,6,7,8,8,9,...,24,24,24,24,24,24,24,24,24,24
Aloddin,1,2,3,4,5,6,7,8,9,10,...,278,278,279,280,281,282,283,284,285,286
Aluminiumminimumimmunity,0,0,0,0,0,0,0,0,0,1,...,301,302,303,304,305,306,307,308,309,310
BanjoCash,1,2,3,3,3,3,3,3,3,4,...,65,65,66,66,66,66,66,66,66,66


In [15]:
#filtered_cumulative_sum.to_excel('your_excel_file.xlsx', sheet_name='Pivot Table')