In [1]:
import pandas as pd
import re

# Read the list of log filenames from the configuration file (one per line).
# Blank / whitespace-only entries are dropped so that a stray empty line does
# not turn into open('') and abort the whole run.
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = [name for name in config_file.read().splitlines() if name.strip()]

# Each log line is expected to look like "[<date>] <user>: <message>".
pattern = r'\[(.*?)\] (.*?): (.*)'
# Compile once instead of re-resolving the pattern for every single line.
line_matcher = re.compile(pattern)

# Parsed rows in the form [date, user, message, stream], where `stream` is the
# zero-based position of the source file inside file_names.
datalist = []
for stream_id, file in enumerate(file_names):
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate lazily; there is no need to hold the whole file in memory.
        for line in f:
            match = line_matcher.match(line)
            if match:  # lines that do not fit the pattern are skipped
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_id])

# Keep the value the manual counter used to end with: the number of files read.
stream_count = len(file_names)

# User whose messages are analysed; change this single value to analyse
# somebody else. Other user names that were left commented out in this cell:
# '1206paul_', 'klimzaa', 'gkey', 'Martin_Gales', 'IvanOnMyOwn', 'Trizze'.
TARGET_USER = 'balintboss'

# Create a DataFrame from the parsed rows
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

# Convert the raw date strings to datetimes so they can be compared and subtracted
data['date'] = pd.to_datetime(data['date'])

# Order the rows by user first, then chronologically within each user
data = data.sort_values(by=['user', 'date'])

# Keep only the rows written by the selected user
data2 = data[data['user'] == TARGET_USER]





def _elapsed_seconds(timestamps):
    """Return the number of seconds between the earliest and latest timestamp."""
    return (timestamps.max() - timestamps.min()).total_seconds()


# Number of messages the selected user wrote in each stream
total_messages_per_stream = data2['stream'].value_counts()

# Seconds between the selected user's first and last message in each stream
watch_lengths = data2.groupby('stream')['date'].agg(_elapsed_seconds)

# Seconds between the first and last message of any user in each stream
stream_lengths = data.groupby('stream')['date'].agg(_elapsed_seconds)

# One row per stream in which the selected user wrote; the two duration
# columns are aligned to that index, so other streams are left out.
stream_stats = total_messages_per_stream.to_frame('total_messages').assign(
    watch_length=watch_lengths,
    stream_length=stream_lengths,
)

stream_stats.head(10)

Unnamed: 0_level_0,total_messages,watch_length,stream_length
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,105,35147.0,36105.0
49,87,22521.0,22548.0
60,86,18002.0,18233.0
62,84,18447.0,18477.0
19,79,20342.0,21336.0
20,79,17236.0,17472.0
5,76,20013.0,20382.0
18,69,6995.0,7430.0
48,67,25516.0,25912.0
42,65,21074.0,21262.0


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of stream_stats
stream_stats.describe()

Unnamed: 0,total_messages,watch_length,stream_length
count,64.0,64.0,64.0
mean,42.078125,17771.75,18311.578125
std,21.921436,5456.92629,5452.278524
min,5.0,2861.0,5257.0
25%,25.75,15813.75,16429.25
50%,40.5,17642.5,18334.0
75%,55.0,20717.75,21146.5
max,105.0,35147.0,36105.0


In [3]:
# Add an hour-based copy of each duration column: seconds -> minutes -> hours.
for hours_col, seconds_col in (
    ('watch_length_in_hour', 'watch_length'),
    ('stream_length_in_hour', 'stream_length'),
):
    stream_stats[hours_col] = stream_stats[seconds_col].div(60).div(60)

stream_stats.head(5)

Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
25,105,35147.0,36105.0,9.763056,10.029167
49,87,22521.0,22548.0,6.255833,6.263333
60,86,18002.0,18233.0,5.000556,5.064722
62,84,18447.0,18477.0,5.124167,5.1325
19,79,20342.0,21336.0,5.650556,5.926667


In [4]:
# watch_length as a percentage of stream_length, per stream.
# The column name 'persenteg' is kept as-is because later cells read it.
watched_fraction = stream_stats['watch_length'].div(stream_stats['stream_length'])
stream_stats['persenteg'] = watched_fraction.mul(100)

stream_stats.head(5)



Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25,105,35147.0,36105.0,9.763056,10.029167,97.346628
49,87,22521.0,22548.0,6.255833,6.263333,99.880255
60,86,18002.0,18233.0,5.000556,5.064722,98.733066
62,84,18447.0,18477.0,5.124167,5.1325,99.837636
19,79,20342.0,21336.0,5.650556,5.926667,95.341207


In [5]:
# Distribution of the per-stream percentage column across all rows of stream_stats
stream_stats['persenteg'].describe()

count    64.000000
mean     96.493387
std       6.446791
min      54.422675
25%      95.945260
50%      98.724045
75%      99.587783
max      99.912993
Name: persenteg, dtype: float64

In [6]:
# Reorder the rows from the lowest to the highest percentage, so the streams
# with the smallest share come first in the preview below.
stream_stats = stream_stats.sort_values('persenteg', ascending=True)

stream_stats.head(10)


Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
50,17,2861.0,5257.0,0.794722,1.460278,54.422675
3,22,14924.0,17451.0,4.145556,4.8475,85.519454
37,50,23739.0,27145.0,6.594167,7.540278,87.45257
41,12,6857.0,7832.0,1.904722,2.175556,87.551073
63,37,16325.0,18507.0,4.534722,5.140833,88.209867
56,9,19297.0,21863.0,5.360278,6.073056,88.263276
7,22,15999.0,17773.0,4.444167,4.936944,90.018567
22,13,11429.0,12616.0,3.174722,3.504444,90.591313
4,23,17458.0,18734.0,4.849444,5.203889,93.188854
13,50,19916.0,21235.0,5.532222,5.898611,93.788557


In [9]:
# Total of the per-stream watch time in hours, divided by 24 to express it in days.
# NOTE(review): this cell's execution count (9) does not follow the previous
# cell (6) — re-run the notebook top to bottom to confirm the value.
stream_stats['watch_length_in_hour'].sum() /24

13.164259259259259