In [18]:
import pandas as pd
import re

# Load the names of the log files to process (one file name per line)
with open('../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# A usable log line looks like "[<date>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'
line_matcher = re.compile(pattern)

# One [date, user, message, stream] record per matching line, where `stream`
# is the zero-based position of the source file within file_names
datalist = []
stream_count = 0
for file in file_names:
    full_path = f"../data/{file}"
    with open(full_path, 'r', encoding='utf-8') as f:
        for line in f:
            match = line_matcher.match(line)
            if match is None:
                # Line does not follow the expected layout; ignore it
                continue
            date, user, message = match.groups()
            datalist.append([date, user, message, stream_count])
    stream_count += 1

# Assemble all parsed records into a single DataFrame
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])



In [19]:

# (rows, columns) of the frame built from the parsed log lines
data.shape

(1681901, 4)

In [20]:
# Tally the messages posted under each user name
message_counts = data["message"].groupby(data["user"]).count()

# Names that reach the 25-message minimum
meets_minimum = message_counts.ge(25)
users_with_25_or_more = message_counts.loc[meets_minimum].index

# Keep only rows written by those users.
# NOTE(review): this filter runs before the later cell that folds alternate
# account names into one canonical name, so an alternate name with fewer than
# 25 messages of its own is dropped here and never merged — confirm intended.
data = data.loc[data["user"].isin(users_with_25_or_more)]

In [21]:
# (rows, columns) remaining after dropping users with fewer than 25 messages
data.shape

(1450644, 4)

In [22]:
# Alternate account names mapped to the single canonical name they should be
# counted under. Keys are matched exactly (case-sensitive), which is why some
# names appear in more than one capitalisation.
USER_ALIASES = {
    "Banties1g": "banties_x",
    "banties1g": "banties_x",
    "fyodor_m_d1821": "fyredoor4",
    "chili_poe": "chili_con_bacon",
    "CHILI_POE": "chili_con_bacon",
    "Wirelesss_": "W1r3lesss",
    "treklul": "trek44_",
    "ttrek_": "trek44_",
    "TriplesingleJ": "TripleSingleJames",
    "uuccugr": "uwu_cougar",
}

# One dict-based replace performs every substitution in a single call instead
# of ten separate full-column passes. No key is also a target value, so the
# result is the same as applying the replacements one after another.
data["user"] = data["user"].replace(USER_ALIASES)


In [23]:
# Convert the "date" strings captured from the log lines to pandas datetimes.
# No `format=` is passed, so pandas infers the timestamp layout itself.
# NOTE(review): if the logs use one fixed timestamp layout, passing it via
# `format=` would make parsing stricter and faster — confirm the layout first.
data["date"] = pd.to_datetime(data["date"])



In [24]:
# Truncate datetime to just the day (removing time).
# `.dt.date` yields Python `datetime.date` objects, so "day" is an object
# column rather than a datetime64 one; the later group-by/pivot key on it.
data["day"] = data["date"].dt.date  # Extract only the date part



In [25]:
# Mark every row with a 1 so there is a column to count per (day, user) pair
data["message_count"] = 1

# One row per (day, user) holding that user's number of messages on that day;
# as_index=False keeps "day" and "user" as ordinary columns
daily_counts = data.groupby(["day", "user"], as_index=False)["message_count"].count()



In [26]:
# Reshape to one row per day and one column per user; (day, user) pairs that
# have no messages come out as NaN and are filled with 0
counts_by_day_user = daily_counts.set_index(["day", "user"])["message_count"]
pivot_table = counts_by_day_user.unstack("user").fillna(0)


In [27]:

# Running total per user: accumulate each column down the rows (days)
cumulative_pivot = pivot_table.cumsum(axis="index")


In [28]:

# Print the cumulative per-user table as plain text. pandas abbreviates the
# repr of a large frame, so only the leading/trailing rows and columns show.
print(cumulative_pivot)

user        0000000emirburak0320  00skysea00  00yopop  010justwatch  06RoB  \
day                                                                          
2024-05-01                   0.0         0.0      0.0           0.0    0.0   
2024-05-02                   0.0         0.0      0.0           0.0    0.0   
2024-05-03                   0.0         0.0      0.0           0.0    0.0   
2024-05-04                   0.0         0.0      0.0           0.0    0.0   
2024-05-05                   0.0         0.0      0.0           0.0    0.0   
...                          ...         ...      ...           ...    ...   
2025-04-25                  49.0        26.0     55.0          25.0  241.0   
2025-04-26                  49.0        26.0     55.0          25.0  241.0   
2025-04-27                  49.0        26.0     55.0          25.0  241.0   
2025-04-28                  49.0        26.0     55.0          25.0  241.0   
2025-04-30                  49.0        26.0     55.0          2

In [29]:
# Transpose so each row is a user and each column a day, then move the user
# names out of the index into a regular "user" column.
# (A separate plain `.T` assignment beforehand is unnecessary: it would be
# overwritten immediately and only computes the transpose a second time.)
pivot_data_cleaned_transposed = cumulative_pivot.T.reset_index()

In [30]:
# Show the last five users of the transposed table.
# A cell renders only its final expression, so a bare `.head(5)` placed before
# this line is evaluated and silently discarded; preview the head in its own
# cell if it is needed.
pivot_data_cleaned_transposed.tail(5)

day,user,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,...,2025-04-19,2025-04-20,2025-04-21,2025-04-22,2025-04-24,2025-04-25,2025-04-26,2025-04-27,2025-04-28,2025-04-30
5836,金龙,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,266.0,266.0,266.0,267.0,267.0,267.0,267.0,267.0,267.0,267.0
5837,餅藍,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0
5838,쌍베님사랑합니다,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
5839,안톤958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,29.0,29.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
5840,엘레레레ㅔ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0


In [31]:
# Write the per-user cumulative table to an Excel workbook (sheet "Pivot Table").
# NOTE(review): 'your_excel_file.xlsx' looks like a placeholder name — confirm
# the intended output path.
# NOTE(review): `index` is left at its default, so the integer index created by
# reset_index() is written as an extra leading column — confirm that is wanted.
pivot_data_cleaned_transposed.to_excel('your_excel_file.xlsx', sheet_name='Pivot Table')

In [32]:
# (rows, columns) now that the "day" and "message_count" columns have been added
data.shape

(1450644, 6)

In [33]:
from difflib import SequenceMatcher

def is_similar(msg1, msg2, threshold=0.6):
    """Tell whether two messages are alike enough to count as similar.

    The likeness score is ``difflib.SequenceMatcher(...).ratio()`` for the two
    messages, a float in [0, 1].

    Args:
        msg1: First message.
        msg2: Second message.
        threshold: Minimum ratio (inclusive) for the pair to be similar.

    Returns:
        True when the ratio is at least ``threshold``, otherwise False.
    """
    matcher = SequenceMatcher(a=msg1, b=msg2)
    return threshold <= matcher.ratio()

# Sort data by user and date so each user's messages are visited in
# chronological order
data = data.sort_values(by=["user", "date"])

# A message is dropped as spam when it arrives within this many seconds of the
# user's last kept message and is_similar() judges the two texts alike
SPAM_WINDOW_SECONDS = 30

# Index labels of the messages that survive the filter
non_spam_indices = []

for user, group in data.groupby("user"):
    last_message = None
    last_date = None
    # Walk the index labels and the two needed columns directly. iterrows()
    # would build a whole Series for every row just to read these two fields,
    # which is very slow on a frame of this size; the values seen are the same.
    for idx, current_message, current_date in zip(
        group.index, group["message"], group["date"]
    ):
        if last_message is not None and last_date is not None:
            time_diff = (current_date - last_date).total_seconds()
            if time_diff <= SPAM_WINDOW_SECONDS and is_similar(last_message, current_message):
                # Spam: skip it. The reference message/time are not advanced,
                # so the next message is still compared with the last KEPT one.
                continue

        # Not spam: keep the row and make it the new reference message
        non_spam_indices.append(idx)
        last_message = current_message
        last_date = current_date

# Keep only the surviving rows and renumber them 0..n-1
data = data.loc[non_spam_indices].reset_index(drop=True)

data.shape

(1383374, 6)