In [32]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Read the list of filenames from the configuration file
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# Regex pattern to match the data format
pattern = r'\[(.*?)\] (.*?): (.*)'


# Initialize an empty list to store parsed data
datalist = []
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = file
    with open(full_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message,stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])

# Step 1: Group by 'users' and count the messages
user_message_counts = data.groupby('user').size()

# Step 2: Filter users with 100 or more messages
active_users = user_message_counts[user_message_counts >= 1000].index

# Step 3: Filter the original DataFrame to include only these users
filtered_df = data[data['user'].isin(active_users)]

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [34]:
# Step 1: Preprocess the text data
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X = tfidf.fit_transform(filtered_df['message'])

# Step 2: Encode the labels
y = filtered_df['user']

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Choose and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.32084974250227205
                          precision    recall  f1-score   support

               1206paul_       0.25      0.21      0.23      1488
                1SKELTON       0.50      0.04      0.08       444
                 Aloddin       0.58      0.15      0.24       356
Aluminiumminimumimmunity       0.41      0.09      0.15       288
               Banties1g       0.26      0.10      0.15      1139
               Casanovaa       0.38      0.03      0.05       297
                 Cuhhsin       0.26      0.03      0.05       271
               Flashet67       0.18      0.05      0.08       229
             IvanOnMyOwn       0.19      0.10      0.13      1455
                Ivana_10       0.60      0.09      0.15       282
                   LX212       0.31      0.07      0.11       494
            Martin_Gales       0.26      0.87      0.41      4270
                Ravenbtw       0.25      0.01      0.02       298
                 Risc__V       0.23      0.18

In [36]:
# New messages to classify
new_messages = [" I love these chocolates with white dots on them when buying lösgodis. Everyone says they're old person candy but idc 😤"]

# Preprocess the messages
X_new = tfidf.transform(new_messages)

# Predict the sender
predictions = model.predict(X_new)

# Output the predictions
for message, prediction in zip(new_messages, predictions):
    print(f"Message: '{message}' was sent by: {prediction}")

Message: ' I love these chocolates with white dots on them when buying lösgodis. Everyone says they're old person candy but idc 😤' was sent by: riesenklotz
