In [64]:
import pandas as pd
import re

# Read the list of chat-log paths from the configuration file (one per line).
# Surrounding whitespace is stripped and blank lines are skipped, so a stray
# empty line does not become open('') and abort the whole load.
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = [
        name.strip()
        for name in config_file.read().splitlines()
        if name.strip()
    ]

# Regex for one chat line: "[<date>] <user>: <message>".
# The raw pattern string is kept; it is compiled once instead of being
# re-resolved for every line.
pattern = r'\[(.*?)\] (.*?): (.*)'
line_re = re.compile(pattern)

# Parsed rows: [date, user, message, stream], where `stream` is the
# zero-based position of the source file within file_names.
datalist = []
for stream_count, file in enumerate(file_names):
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate lazily; lines that do not match the pattern are ignored.
        for line in f:
            match = line_re.match(line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_count])

# After the loop the counter holds the number of files processed, exactly as
# the manually incremented counter did.
stream_count = len(file_names)

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

# Convert the bracketed date text into real timestamps so it can be
# sorted and subtracted.
data['date'] = pd.to_datetime(data['date'])

# Order chronologically within each user.
data = data.sort_values(by=['user', 'date'])

# User whose chat activity is analysed below; change this one value to
# inspect someone else. Users previously examined with this notebook:
# '1206paul_', 'balintboss', 'klimzaa', 'gkey', 'Martin_Gales', 'IvanOnMyOwn'.
TARGET_USER = 'Trizze'

# Every parsed message written by TARGET_USER, across all loaded files.
data2 = data[data['user'] == TARGET_USER]





def _span_seconds(messages):
    """Return, per stream, the seconds between the earliest and latest message.

    `messages` must have a 'stream' column and a datetime 'date' column.
    The result is a float Series indexed by stream.
    """
    dates_by_stream = messages.groupby('stream')['date']
    return (dates_by_stream.max() - dates_by_stream.min()).dt.total_seconds()


# Number of messages the selected user wrote in each stream
# (value_counts orders the streams from most to fewest messages).
total_messages_per_stream = data2['stream'].value_counts()

# One row per stream in which the selected user wrote at least one message.
stream_stats = pd.DataFrame({
    'total_messages': total_messages_per_stream
})

# Seconds between the selected user's first and last message in each stream.
watch_lengths = _span_seconds(data2)
stream_stats['watch_length'] = watch_lengths

# Seconds between the first and last message from any user in each stream.
# Assignment aligns on the stream index, so streams without a message from
# the selected user are simply not added.
stream_lengths = _span_seconds(data)
stream_stats['stream_length'] = stream_lengths

stream_stats.head(10)

Unnamed: 0_level_0,total_messages,watch_length,stream_length
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23,140,22607.0,26365.0
58,128,14162.0,17193.0
54,90,5478.0,21630.0
30,85,12504.0,18576.0
39,82,18246.0,20767.0
33,65,19142.0,22676.0
28,60,10267.0,11040.0
5,58,11038.0,20382.0
37,57,12590.0,27145.0
32,57,9495.0,23298.0


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# numeric column of stream_stats.
stream_stats.describe()

Unnamed: 0,total_messages,watch_length,stream_length
count,41.0,41.0,41.0
mean,35.219512,7023.463415,19529.853659
std,33.142505,6671.746533,4873.69529
min,1.0,0.0,9311.0
25%,10.0,1054.0,17430.0
50%,26.0,5062.0,18576.0
75%,49.0,12504.0,21262.0
max,140.0,22607.0,36105.0


In [66]:
# Add an hour-based copy of each duration column next to the raw seconds:
# seconds -> minutes -> hours.
for seconds_col in ('watch_length', 'stream_length'):
    stream_stats[seconds_col + '_in_hour'] = stream_stats[seconds_col].div(60).div(60)

stream_stats.head(5)

Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
23,140,22607.0,26365.0,6.279722,7.323611
58,128,14162.0,17193.0,3.933889,4.775833
54,90,5478.0,21630.0,1.521667,6.008333
30,85,12504.0,18576.0,3.473333,5.16
39,82,18246.0,20767.0,5.068333,5.768611


In [67]:
# Share of each stream's chat span covered by the user's own chat span, in
# percent. The key 'persenteg' (a misspelling of "percentage") is kept as-is
# because the cells below look the column up by that exact name.
watch_share = stream_stats['watch_length'].div(stream_stats['stream_length'])
stream_stats['persenteg'] = watch_share.mul(100)

stream_stats.head(5)



Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23,140,22607.0,26365.0,6.279722,7.323611,85.746255
58,128,14162.0,17193.0,3.933889,4.775833,82.370732
54,90,5478.0,21630.0,1.521667,6.008333,25.325936
30,85,12504.0,18576.0,3.473333,5.16,67.312661
39,82,18246.0,20767.0,5.068333,5.768611,87.860548


In [68]:
# Distribution of the watch-length / stream-length percentage across streams.
stream_stats['persenteg'].describe()

count    41.000000
mean     36.295368
std      33.869017
min       0.000000
25%       6.039768
50%      20.127237
75%      67.312661
max      98.687703
Name: persenteg, dtype: float64

In [69]:
# Reorder the table so the streams with the smallest percentage come first.
stream_stats = stream_stats.sort_values('persenteg', ascending=True)

stream_stats.head(10)


Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20,1,0.0,17472.0,0.0,4.853333,0.0
53,1,0.0,17785.0,0.0,4.940278,0.0
17,1,0.0,15531.0,0.0,4.314167,0.0
4,8,115.0,18734.0,0.031944,5.203889,0.613857
0,5,175.0,17697.0,0.048611,4.915833,0.988868
16,10,205.0,18574.0,0.056944,5.159444,1.103693
25,6,655.0,36105.0,0.181944,10.029167,1.814153
13,11,696.0,21235.0,0.193333,5.898611,3.277608
44,13,471.0,9311.0,0.130833,2.586389,5.058533
27,26,1046.0,17430.0,0.290556,4.841667,6.001147
