In [18]:
import pandas as pd
import re
import pytz

# Read the list of filenames from the configuration file
with open('../file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# Regex pattern to match the data format
pattern = r'\[(.*?)\] (.*?): (.*)'

# Initialize an empty list to store parsed data
datalist = []
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = f"../data/{file}"
    with open(full_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message,stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])
data['date'] = pd.to_datetime(data['date'])

def convert_utc_to_cet(df, date_column='date'):
    """
    Convert UTC timestamps to Central European Time (CET/CEST) with proper DST handling
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the date column
    date_column (str): Name of the column containing UTC timestamps
    
    Returns:
    pd.DataFrame: DataFrame with converted timestamps
    """
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Ensure timestamps are UTC aware
    if df[date_column].dt.tz is None:
        df[date_column] = df[date_column].dt.tz_localize('UTC')
    elif df[date_column].dt.tz != pytz.UTC:
        df[date_column] = df[date_column].dt.tz_convert('UTC')
    
    # Convert to CET/CEST (Europe/Berlin includes proper DST handling)
    df[date_column] = df[date_column].dt.tz_convert('Europe/Berlin')
    
    return df

data = convert_utc_to_cet(data)

In [19]:
data["user"] = data["user"].replace("Banties1g", "banties_x")
data["user"] = data["user"].replace("banties1g", "banties_x")
data["user"] = data["user"].replace("fyodor_m_d1821", "fyredoor4")

In [20]:
message_counts = data["user"].value_counts()
users_above_1k = (message_counts >= 250).sum()
print(users_above_1k)


529


In [21]:
message_counts.describe()

count    50483.000000
mean        26.518610
std        446.612109
min          1.000000
25%          1.000000
50%          3.000000
75%          7.000000
max      41207.000000
Name: count, dtype: float64

In [22]:
userName='W1r3lesss'

In [23]:
# Filter messages from Fossabot
fossa_warnings = data[data["user"] == "Fossabot"]

# Extract warned usernames from the warning message
fossa_warnings["warned_user"] = fossa_warnings["message"].str.extract(r'@(\w+)')

# 1. Find people who received multiple warnings
warn_counts = fossa_warnings["warned_user"].value_counts()
multiple_warned_users = warn_counts[warn_counts > 1].index.tolist()

# 2. Find users who were warned but never typed in chat
all_warned_users = set(fossa_warnings["warned_user"].dropna())
active_users = set(data["user"]) - {"Fossabot"}  # All users except Fossabot
silent_warned_users = list(all_warned_users - active_users)

# 3. List of all warned people and its length
all_warned_list = list(all_warned_users)
num_warned = len(all_warned_list)

# Print results
print("Users warned multiple times count:", len(multiple_warned_users))
print("Users warned but never typed Count:", len(silent_warned_users))
print("Total warned:", num_warned)

# Find users who were warned multiple times AND never typed
silent_multiple_warned_users = list(set(multiple_warned_users) & set(silent_warned_users))
silent_multiple_warned_count = len(silent_multiple_warned_users)
print("Users warned multiple times AND never typed Count:", silent_multiple_warned_count)


Users warned multiple times count: 498
Users warned but never typed Count: 1130
Total warned: 4074
Users warned multiple times AND never typed Count: 75


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
# Get top 10 most warned users from multiple_warned_users
top_10_warned_users = warn_counts.head(10)  # Select the first 10 from the sorted warning count

# Print results
print("Top 10 most warned users:\n", top_10_warned_users)


Top 10 most warned users:
 warned_user
uuccugr           16
barisbalsuzenn    15
dodo456a          13
ttrek_            13
Leftybrasco        9
maxxus0            7
riesenklotz        7
ertagon_           6
tomasvercetti_     6
regnaltbatu        6
Name: count, dtype: int64


In [25]:
import pandas as pd
from collections import Counter

def calculate_user_stats(df):
    """Calculates various statistics for each user."""
    user_stats = []
    
    for user, user_df in df.groupby("user"):
        messages = user_df["message"].tolist()
        total_messages = len(messages)
        total_words = sum(len(msg.split()) for msg in messages)
        most_common_word = Counter(" ".join(messages).split()).most_common(1)
        
        user_stats.append({
            "user": user,
            "total_messages": total_messages,
            "total_words": total_words,
            "most_common_word": most_common_word[0][0] if most_common_word else None,
            "most_common_word_count": most_common_word[0][1] if most_common_word else 0,
        })
    
    return pd.DataFrame(user_stats)

# Sample DataFrame
# data = pd.DataFrame(datalist, columns=["date", "user", "message", "stream"])

def filter_active_users(data):
    """Filters users with more than 999 messages."""
    user_counts = data["user"].value_counts()
    active_users = user_counts[user_counts > 499].index
    return data[data["user"].isin(active_users)]

# Filtering data and calculating stats
data_filtered = filter_active_users(data)
user_stats_df = calculate_user_stats(data_filtered)



In [26]:
# Example output
user_stats_df.describe()

Unnamed: 0,total_messages,total_words,most_common_word_count
count,264.0,264.0,264.0
mean,2918.560606,13993.602273,1385.25
std,5443.004793,35062.828607,8653.659274
min,500.0,784.0,25.0
25%,677.5,2711.75,93.0
50%,1053.0,4217.0,165.0
75%,2198.75,9757.0,462.0
max,41207.0,340933.0,136660.0


In [27]:
# Assuming user_stats_df is your DataFrame
unique_words = user_stats_df['most_common_word'].unique()

# If you want to print them
print(unique_words)

['xqc32' 'the' 'ome44' 'is' 'you' 'speed1' 'FEET' 'FEIN' '1' 'NOWAYING'
 'ome99' 'omE' 'PepeLaugh' 'I' 'English' 'LUL' 'lul' 'druskiDance' 'a'
 'LOL' 'i' 'loeyaKEKW' 'LULW' 'RaveTime' 'ReallyMad' 'Listening'
 '@dorozea' '?' 'mhm' 'u' 'bye' 'OMEGALUL' 'andrea540Joy' 'hi' 'lol'
 'doro' 'ICANT' 'BangbooBounce' 'DORO' 'Yay' 'PewPewPew'
 'dorozeaTouchingyou' 'forsenLaughingAtYou' 'YEP' 'Clap' 'ome808'
 'STREAMER' 'DinoDance' 'xd' 'LETSGO' 'WW' 'forsenE' '!' 'dorozeaSlam'
 'moustache' 'om' 'Joel' 'qq' 'RalpherZ']
