In [1]:
import pandas as pd
import re

# Read the list of filenames from the configuration file
with open('../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# Regex pattern to match the data format
pattern = r'\[(.*?)\] (.*?): (.*)'

# Initialize an empty list to store parsed data
datalist = []
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = f"../data/{file}"
    with open(full_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message,stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])



In [2]:

data.shape

(1804048, 4)

In [3]:
# Count the number of messages per user
message_counts = data.groupby("user")["message"].count()

# Filter users with 25 or more messages
users_with_25_or_more = message_counts[message_counts >= 25].index

# Filter the original DataFrame to keep only these users
data = data[data["user"].isin(users_with_25_or_more)]

In [4]:
data.shape

(1558883, 4)

In [5]:
data["user"] = data["user"].replace("Banties1g", "banties_x")
data["user"] = data["user"].replace("banties1g", "banties_x")
#data["user"] = data["user"].replace("fyodor_m_d1821", "fyredoor4")
data["user"] = data["user"].replace("chili_poe", "chili_con_bacon")
data["user"] = data["user"].replace("chili_conbacon", "chili_con_bacon")
data["user"] = data["user"].replace("Wirelesss_", "W1r3lesss")
data["user"] = data["user"].replace("treklul", "trek44_")
data["user"] = data["user"].replace("ttrek_", "trek44_")
data["user"] = data["user"].replace("TriplesingleJ", "TripleSingleJames")
data["user"] = data["user"].replace("uuccugr", "uwu_cougar")

In [6]:
from collections import defaultdict

# Get all unique usernames
unique_users = data['user'].unique()

# Create a mapping from lowercase username to all variants

user_variants = defaultdict(set)
for user in unique_users:
    user_variants[user.lower()].add(user)

# Find usernames with different capitalization
duplicate_users = {k: v for k, v in user_variants.items() if len(v) > 1}

In [7]:
# Create a mapping from all variants to the canonical (sorted first) variant
variant_map = {}
for variants in duplicate_users.values():
    sorted_variants = sorted(variants)
    canonical = sorted_variants[0]
    for v in variants:
        variant_map[v] = canonical

# Replace usernames in 'user' column
data['user'] = data['user'].apply(lambda u: variant_map.get(u, u))

In [8]:
# Convert date to datetime format
data["date"] = pd.to_datetime(data["date"])



In [9]:
# Truncate datetime to just the day (removing time)
data["day"] = data["date"].dt.date  # Extract only the date part



In [10]:
# Group by 'day' and 'user' and calculate the message count per day per user
data["message_count"] = 1  # Assign 1 for each message to count them
daily_counts = data.groupby(["day", "user"])["message_count"].count().reset_index()



In [11]:
# Pivot the table to create a user-wise table for each day
pivot_table = daily_counts.pivot(index="day", columns="user", values="message_count").fillna(0)


In [12]:

# Add a cumulative sum for each user across the days
cumulative_pivot = pivot_table.cumsum()


In [13]:

# Print the result
print(cumulative_pivot)

user        0000000emirburak0320  00skysea00  00yopop  010justwatch  06RoB  \
day                                                                          
2024-05-01                   0.0         0.0      0.0           0.0    0.0   
2024-05-02                   0.0         0.0      0.0           0.0    0.0   
2024-05-03                   0.0         0.0      0.0           0.0    0.0   
2024-05-04                   0.0         0.0      0.0           0.0    0.0   
2024-05-05                   0.0         0.0      0.0           0.0    0.0   
...                          ...         ...      ...           ...    ...   
2025-05-16                  49.0        26.0     55.0          41.0  242.0   
2025-05-17                  49.0        26.0     55.0          41.0  242.0   
2025-05-18                  49.0        26.0     55.0          41.0  242.0   
2025-05-19                  49.0        26.0     55.0          41.0  242.0   
2025-05-20                  49.0        26.0     55.0          4

In [14]:
pivot_data_cleaned_transposed = cumulative_pivot.T
pivot_data_cleaned_transposed = cumulative_pivot.T.reset_index()

In [15]:
pivot_data_cleaned_transposed.head(5)
pivot_data_cleaned_transposed.tail(5)

day,user,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,...,2025-05-10,2025-05-11,2025-05-13,2025-05-14,2025-05-15,2025-05-16,2025-05-17,2025-05-18,2025-05-19,2025-05-20
6076,金龙,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,270.0,270.0,270.0,270.0,270.0,270.0,272.0,272.0,272.0,272.0
6077,餅藍,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0
6078,쌍베님사랑합니다,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
6079,안톤958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
6080,엘레레레ㅔ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0


In [16]:
# --- Step 1 & 2: Calculate rank and identify top 15 users ---

# Calculate rank for each date (column) in descending order.
# The user with the highest cumulative sum gets rank 1.
ranked_df = pivot_data_cleaned_transposed.rank(axis=0, ascending=False, method='min')

# Identify users who were in the top 15 on at least one date.
# This creates a boolean Series where True means the user was in the top 15 at least once.
users_in_top_15_at_least_once = (ranked_df <= 15).any(axis=1)

# Get the list of users (their index labels) who meet the criteria.
users_to_keep = users_in_top_15_at_least_once[users_in_top_15_at_least_once].index

# --- Step 3: Filter the DataFrame ---

# Create a new DataFrame containing only the users who were in the top 15 at least once.
filtered_users_df = pivot_data_cleaned_transposed.loc[users_to_keep]

print("Original DataFrame shape:", pivot_data_cleaned_transposed.shape)
print("Filtered DataFrame shape:", filtered_users_df.shape)

Original DataFrame shape: (6081, 324)
Filtered DataFrame shape: (63, 324)


In [20]:
filtered_users_df.head()

day,user,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,...,2025-05-10,2025-05-11,2025-05-13,2025-05-14,2025-05-15,2025-05-16,2025-05-17,2025-05-18,2025-05-19,2025-05-20
25,1206paul_,70.0,152.0,240.0,247.0,340.0,367.0,371.0,391.0,441.0,...,21672.0,21679.0,21692.0,21696.0,21730.0,21734.0,21735.0,21789.0,21790.0,21857.0
46,1SKELTON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0,169.0,...,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0,2955.0
397,CrazeE420xd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26545.0,26607.0,26826.0,26885.0,26899.0,26908.0,26908.0,26908.0,27119.0,27227.0
484,Dmoz1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,203.0,...,789.0,789.0,789.0,789.0,789.0,789.0,790.0,790.0,790.0,790.0
756,Ice_Saiyan,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,...,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0


In [21]:
filtered_users_df.to_excel('your_excel_file.xlsx', sheet_name='Pivot Table')

In [18]:
data.shape

(1558883, 6)

In [19]:
"""
from difflib import SequenceMatcher

def is_similar(msg1, msg2, threshold=0.6):
    #Check if two messages are similar based on a given threshold.#
    similarity = SequenceMatcher(None, msg1, msg2).ratio()
    return similarity >= threshold

# Sort data by user and date to process messages in chronological order
data = data.sort_values(by=["user", "date"])

# Initialize a list to store indices of non-spam messages
non_spam_indices = []

# Iterate through each user
for user, group in data.groupby("user"):
    last_message = None
    last_date = None
    for idx, row in group.iterrows():
        current_message = row["message"]
        current_date = row["date"]
        
        if last_message is not None and last_date is not None:
            time_diff = (current_date - last_date).total_seconds()
            if time_diff <= 30 and is_similar(last_message, current_message):
                # Skip spam message
                continue
        
        # If not spam, add the index to the list
        non_spam_indices.append(idx)
        last_message = current_message
        last_date = current_date

# Filter the data to keep only non-spam messages
data = data.loc[non_spam_indices].reset_index(drop=True)

data.shape
"""

'\nfrom difflib import SequenceMatcher\n\ndef is_similar(msg1, msg2, threshold=0.6):\n    #Check if two messages are similar based on a given threshold.#\n    similarity = SequenceMatcher(None, msg1, msg2).ratio()\n    return similarity >= threshold\n\n# Sort data by user and date to process messages in chronological order\ndata = data.sort_values(by=["user", "date"])\n\n# Initialize a list to store indices of non-spam messages\nnon_spam_indices = []\n\n# Iterate through each user\nfor user, group in data.groupby("user"):\n    last_message = None\n    last_date = None\n    for idx, row in group.iterrows():\n        current_message = row["message"]\n        current_date = row["date"]\n        \n        if last_message is not None and last_date is not None:\n            time_diff = (current_date - last_date).total_seconds()\n            if time_diff <= 30 and is_similar(last_message, current_message):\n                # Skip spam message\n                continue\n        \n        # If 