In [16]:
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Directory that holds the chat-log files named in file_list.txt.
# Built with pathlib so the path works on every OS, not only Windows.
data_dir = Path('data')

# Read the list of filenames from the configuration file.
# Blank lines are skipped so a trailing newline or spacer line does not
# turn into an attempt to open the data directory itself.
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = [
        name.strip()
        for name in config_file.read().splitlines()
        if name.strip()
    ]

# Regex pattern to match the data format: "[<date>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'
line_re = re.compile(pattern)  # compiled once, reused for every line

# Parsed rows: [date, user, message, stream index]
datalist = []
# Zero-based index of the file currently being read; one file == one stream.
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = data_dir / file
    with open(full_path, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            match = line_re.match(line)
            if match:  # lines that do not fit the format are ignored
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_count])
    # The index advances for every listed file, even one with no matching lines.
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

# Convert the bracketed timestamp text to datetime64.
# No explicit format is given, so pandas infers it from the data.
data['date'] = pd.to_datetime(data['date'])

In [17]:
# Highest stream index present in the parsed data.
# NOTE: stream indices are zero-based (the loader starts counting at 0 and
# adds 1 per file), so this is NOT the number of streams; a stream count
# would be this value + 1 when every file contributed at least one message.
all_stream = data['stream'].max()
all_stream

132

In [18]:
def user_statistics_fixed(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise chat activity per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Chat messages, one row per message. Must contain the columns
        ``user``, ``message`` (text) and ``stream``. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``user`` (in groupby order, i.e. sorted by
        user), with the columns:

        - ``user``
        - ``streams_attended``: number of distinct ``stream`` values
        - ``messages_sent``: number of non-null messages
        - ``characters_sent``: total length of all messages
        - ``messages_with_at``: number of messages containing ``'@'``
        - ``longest_message``: length of the longest message
        - ``message_rank``: 1 = most messages; ties share the lowest rank
        - ``avg_msg_stream``: ``messages_sent / streams_attended``
    """
    messages = df['message']

    # Compute the per-message features once, vectorised over the whole
    # frame, so the groupby can use pandas' built-in reducers instead of
    # invoking a Python lambda for every user group.
    per_message = df.assign(
        _msg_len=messages.str.len(),
        # '@' is a literal, not a pattern; missing messages count as "no @".
        _has_at=messages.str.contains('@', regex=False, na=False).astype(int),
    )

    user_stats = per_message.groupby('user').agg(
        streams_attended=pd.NamedAgg(column='stream', aggfunc='nunique'),  # Unique streams
        messages_sent=pd.NamedAgg(column='message', aggfunc='count'),  # Total messages
        characters_sent=pd.NamedAgg(column='_msg_len', aggfunc='sum'),  # Total characters
        messages_with_at=pd.NamedAgg(column='_has_at', aggfunc='sum'),  # Messages with '@'
        longest_message=pd.NamedAgg(column='_msg_len', aggfunc='max'),  # Longest message length
    ).reset_index()

    # Add rank based on message quantity (method='min': tied users all get
    # the best rank of the tie, the next rank is skipped accordingly).
    user_stats['message_rank'] = (
        user_stats['messages_sent'].rank(ascending=False, method='min').astype(int)
    )

    # Average messages per attended stream; streams_attended is >= 1 for
    # every user that appears in the frame, so this never divides by zero.
    user_stats['avg_msg_stream'] = user_stats['messages_sent'] / user_stats['streams_attended']

    return user_stats

# Generate the per-user statistics table (one row per distinct user)
# from the full parsed chat log.
user_stats_fixed = user_statistics_fixed(data)

In [19]:
# (rows, columns) of the per-user table; rows == number of distinct users.
user_stats_fixed.shape

(12854, 8)

In [20]:
# Show the 25 best-ranked users (rank 1 = most messages sent).
user_stats_fixed.sort_values(by=['message_rank']).head(25)

Unnamed: 0,user,streams_attended,messages_sent,characters_sent,messages_with_at,longest_message,message_rank,avg_msg_stream
2132,Martin_Gales,128,25268,635380,3822,478,1,197.40625
3081,StreamElements,133,11737,1422863,4301,466,2,88.24812
7778,klimzaa,117,11593,271287,1608,500,3,99.08547
64,1206paul_,133,11465,307177,1086,500,4,86.203008
6251,fyodor_m_d1821,54,9162,163625,982,485,5,169.666667
1636,IvanOnMyOwn,112,8710,189828,840,500,6,77.767857
4395,balintboss,133,8689,174737,698,490,7,65.330827
3333,Typhu25,90,7727,444809,738,500,8,85.855556
9468,oJov,45,6032,140192,775,279,9,134.044444
10288,riesenklotz,66,5856,132578,286,432,10,88.727273
