In [1]:
import pandas as pd
import re

# Read the list of filenames from the configuration file
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# Regex pattern to match the data format
pattern = r'\[(.*?)\] (.*?): (.*)'


# Initialize an empty list to store parsed data
datalist = []
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = "data\\"+file
    with open(full_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message,stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])



In [2]:

data.shape

(801870, 4)

In [3]:
# Count the number of messages per user
message_counts = data.groupby("user")["message"].count()

# Filter users with 25 or more messages
users_with_25_or_more = message_counts[message_counts >= 25].index

# Filter the original DataFrame to keep only these users
data = data[data["user"].isin(users_with_25_or_more)]



In [4]:
data.shape

(705998, 4)

In [5]:
data["user"] = data["user"].replace("Banties1g", "banties_x")
data["user"] = data["user"].replace("banties1g", "banties_x")
#banties_x
data["user"] = data["user"].replace("fyodor_m_d1821", "fyredoor4")


In [6]:
# Convert date to datetime format
data["date"] = pd.to_datetime(data["date"])



In [7]:
# Truncate datetime to just the day (removing time)
data["day"] = data["date"].dt.date  # Extract only the date part



In [8]:
# Group by 'day' and 'user' and calculate the message count per day per user
data["message_count"] = 1  # Assign 1 for each message to count them
daily_counts = data.groupby(["day", "user"])["message_count"].count().reset_index()



In [9]:
# Pivot the table to create a user-wise table for each day
pivot_table = daily_counts.pivot(index="day", columns="user", values="message_count").fillna(0)


In [10]:

# Add a cumulative sum for each user across the days
cumulative_pivot = pivot_table.cumsum()


In [11]:

# Print the result
print(cumulative_pivot)

user        0000000emirburak0320  09maunomato  0buddha  0cops0problems  \
day                                                                      
2024-05-01                   0.0          0.0      0.0             0.0   
2024-05-02                   0.0          0.0      0.0             0.0   
2024-05-03                   0.0          0.0      0.0             0.0   
2024-05-04                   0.0          0.0      0.0             0.0   
2024-05-05                   0.0          0.0      0.0             0.0   
...                          ...          ...      ...             ...   
2024-12-28                  49.0         30.0   1571.0           102.0   
2024-12-29                  49.0         30.0   1571.0           102.0   
2024-12-30                  49.0         30.0   1575.0           103.0   
2024-12-31                  49.0         30.0   1575.0           103.0   
2025-01-01                  49.0         30.0   1575.0           103.0   

user        0nlywon  0ohWell  0rib  0

In [12]:
pivot_data_cleaned_transposed = cumulative_pivot.T
pivot_data_cleaned_transposed = cumulative_pivot.T.reset_index()

In [13]:
pivot_data_cleaned_transposed.head(5)
pivot_data_cleaned_transposed.tail(5)

day,user,2024-05-01,2024-05-02,2024-05-03,2024-05-04,2024-05-05,2024-05-06,2024-05-07,2024-05-09,2024-05-10,...,2024-12-22,2024-12-23,2024-12-25,2024-12-26,2024-12-27,2024-12-28,2024-12-29,2024-12-30,2024-12-31,2025-01-01
2810,孙哮喘258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0
2811,木柵林達豐,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,115.0,126.0,126.0,126.0,126.0,128.0,128.0,131.0,144.0,144.0
2812,爪口长工,7.0,15.0,15.0,29.0,29.0,31.0,31.0,86.0,90.0,...,2044.0,2044.0,2044.0,2044.0,2044.0,2049.0,2049.0,2049.0,2049.0,2049.0
2813,쌍베님사랑합니다,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
2814,엘레레레ㅔ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0


In [14]:
pivot_data_cleaned_transposed.to_excel('your_excel_file.xlsx', sheet_name='Pivot Table')