In [83]:
import pandas as pd
import re

# Read the list of chat-log filenames (one path per line) from the configuration file.
# Entries are stripped and blank lines skipped, so a stray empty line in the list
# does not become open('') and abort the whole load.
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = [
        entry.strip()
        for entry in config_file.read().splitlines()
        if entry.strip()
    ]

# Regex for one chat line: "[<date>] <user>: <message>"
pattern = r'\[(.*?)\] (.*?): (.*)'
line_regex = re.compile(pattern)  # compiled once, reused for every line

# Parsed rows: [date, user, message, stream]; "stream" is the 0-based position
# of the source file within file_names.
datalist = []
for stream_count, file in enumerate(file_names):
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines()
        for line in f:
            match = line_regex.match(line)
            if match:  # lines that do not fit the pattern are ignored
                date, user, message = match.groups()
                datalist.append([date, user, message, stream_count])

# Keep the counter's final value equal to the number of files processed
stream_count = len(file_names)

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

# Convert the bracketed date strings to datetimes so they can be compared and subtracted
data['date'] = pd.to_datetime(data['date'])

# Order rows by user, then chronologically within each user
data = data.sort_values(by=['user', 'date'])

# User whose messages are analysed in the following cells.
# Alternatives (swap in to analyse someone else):
#   '1206paul_', 'balintboss', 'klimzaa', 'Martin_Gales', 'IvanOnMyOwn', 'Trizze'
target_user = 'gkey'
is_target_user = data['user'] == target_user
data2 = data[is_target_user]





def _first_to_last_seconds(timestamps):
    """Return the seconds elapsed between the earliest and latest timestamp of a group."""
    return (timestamps.max() - timestamps.min()).total_seconds()


# Number of rows in data2 per stream (value_counts orders by count, descending)
total_messages_per_stream = data2['stream'].value_counts()

# Per stream: time between the first and last message in data2
watch_lengths = data2.groupby('stream')['date'].agg(_first_to_last_seconds)

# Per stream: time between the first and last message in the full data set
stream_lengths = data.groupby('stream')['date'].agg(_first_to_last_seconds)

# Assemble the table. assign() aligns both Series on the frame's existing stream
# index, so only streams present in data2 get a row.
stream_stats = pd.DataFrame({
    'total_messages': total_messages_per_stream
}).assign(
    watch_length=watch_lengths,
    stream_length=stream_lengths,
)

stream_stats.head(10)

Unnamed: 0_level_0,total_messages,watch_length,stream_length
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49,37,11691.0,22548.0
25,37,11099.0,36105.0
54,29,17246.0,21630.0
58,28,16473.0,17193.0
9,27,11474.0,16418.0
47,23,15497.0,17769.0
23,22,17128.0,26365.0
4,21,8792.0,18734.0
14,21,998.0,15055.0
24,20,7537.0,20910.0


In [84]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of stream_stats
stream_stats.describe()

Unnamed: 0,total_messages,watch_length,stream_length
count,50.0,50.0,50.0
mean,11.38,5098.36,19030.42
std,8.8822,5416.665032,4838.846591
min,1.0,0.0,8327.0
25%,5.25,341.75,17047.5
50%,7.5,3143.5,18525.5
75%,14.75,8840.0,21205.5
max,37.0,17246.0,36105.0


In [85]:
# Both durations are stored in seconds; dividing by 60 twice converts them to hours.
stream_stats['watch_length_in_hour'] = stream_stats['watch_length'].div(60).div(60)
stream_stats['stream_length_in_hour'] = stream_stats['stream_length'].div(60).div(60)

# Preview the first rows including the new hour columns
stream_stats.head(5)

Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
49,37,11691.0,22548.0,3.2475,6.263333
25,37,11099.0,36105.0,3.083056,10.029167
54,29,17246.0,21630.0,4.790556,6.008333
58,28,16473.0,17193.0,4.575833,4.775833
9,27,11474.0,16418.0,3.187222,4.560556


In [86]:
# Share of each stream covered by the watch span, as a percentage.
# The column name 'persenteg' is kept as-is because later cells look it up by that name.
watch_share = stream_stats['watch_length'].div(stream_stats['stream_length'])
stream_stats['persenteg'] = watch_share.mul(100)

stream_stats.head(5)



Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49,37,11691.0,22548.0,3.2475,6.263333,51.849388
25,37,11099.0,36105.0,3.083056,10.029167,30.740895
54,29,17246.0,21630.0,4.790556,6.008333,79.731854
58,28,16473.0,17193.0,4.575833,4.775833,95.812249
9,27,11474.0,16418.0,3.187222,4.560556,69.88671


In [87]:
# Distribution of the watch percentage across streams
stream_stats['persenteg'].describe()

count    50.000000
mean     26.920571
std      29.020527
min       0.000000
25%       1.892022
50%      15.831686
75%      44.702874
max      95.812249
Name: persenteg, dtype: float64

In [88]:
# Reorder the table from the lowest to the highest watch percentage
stream_stats = stream_stats.sort_values('persenteg', ascending=True)

# Show the ten streams with the smallest percentage
stream_stats.head(10)


Unnamed: 0_level_0,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
stream,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28,1,0.0,11040.0,0.0,3.066667,0.0
34,1,0.0,16999.0,0.0,4.721944,0.0
32,2,20.0,23298.0,0.005556,6.471667,0.085844
12,2,47.0,20477.0,0.013056,5.688056,0.229526
44,4,40.0,9311.0,0.011111,2.586389,0.429599
26,3,92.0,18435.0,0.025556,5.120833,0.499051
27,4,91.0,17430.0,0.025278,4.841667,0.522088
11,4,128.0,20346.0,0.035556,5.651667,0.629116
45,7,130.0,15420.0,0.036111,4.283333,0.843061
46,5,303.0,24561.0,0.084167,6.8225,1.233663


In [89]:
# Sum the per-stream watch time (in hours) and divide by 24 to express the total in days
stream_stats['watch_length_in_hour'].sum() /24

2.9504398148148145

In [90]:
# Move the stream id out of the index into a regular 'stream' column, then order by it.
# NOTE: this cell is not idempotent. Running it again calls reset_index() on the
# already-reset frame and adds an extra index column.
stream_stats = stream_stats.reset_index().sort_values(by='stream')

stream_stats.head(10)

Unnamed: 0,stream,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
41,1,6,9282.0,17831.0,2.578333,4.953056,52.055409
44,2,9,13712.0,19147.0,3.808889,5.318611,71.614352
15,3,12,417.0,17451.0,0.115833,4.8475,2.389548
38,4,21,8792.0,18734.0,2.442222,5.203889,46.930714
24,5,7,3219.0,20382.0,0.894167,5.661667,15.793347
34,6,16,6505.0,16563.0,1.806944,4.600833,39.274286
48,7,6,15885.0,17773.0,4.4125,4.936944,89.377145
33,8,13,6889.0,18686.0,1.913611,5.190556,36.867173
43,9,27,11474.0,16418.0,3.187222,4.560556,69.88671
46,10,7,7096.0,8327.0,1.971111,2.313056,85.216765


In [91]:
# Last ten rows of the table in its current order
stream_stats.tail(10)

Unnamed: 0,stream,total_messages,watch_length,stream_length,watch_length_in_hour,stream_length_in_hour,persenteg
28,48,7,5607.0,25912.0,1.5575,7.197778,21.638623
40,49,37,11691.0,22548.0,3.2475,6.263333,51.849388
36,51,4,5397.0,12221.0,1.499167,3.394722,44.161689
26,53,10,3068.0,17785.0,0.852222,4.940278,17.250492
45,54,29,17246.0,21630.0,4.790556,6.008333,79.731854
17,55,12,593.0,16004.0,0.164722,4.445556,3.705324
49,58,28,16473.0,17193.0,4.575833,4.775833,95.812249
14,59,8,340.0,17242.0,0.094444,4.789444,1.971929
12,62,7,347.0,18477.0,0.096389,5.1325,1.87801
21,66,15,2566.0,23400.0,0.712778,6.5,10.965812
