In [35]:
import matplotlib.pyplot as plt
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

In [2]:
# Location of the dataset CSV. The path is relative, so it is resolved against
# the kernel's current working directory when the file is read.
data_path = '../../data/main_dataset/main_dataset.csv'

In [3]:
# Read the whole CSV into a single DataFrame with pandas' default parsing options.
df = pd.read_csv(data_path)

# General dataset statistics

In [None]:
# Headline figures: overall message volume, then how it spreads across chats.
total_messages = len(df)
print("DATASET STATISTICS")
print(f"{total_messages} messages")

# value_counts() yields one entry per distinct chat handle, holding the number
# of messages recorded for that handle.
messages_in_chats = df["chat_handle"].value_counts()
chat_count = len(messages_in_chats)
mean_per_chat = messages_in_chats.mean()
median_per_chat = messages_in_chats.median()
busiest_chat_size = messages_in_chats.max()
print(f"From {chat_count} chats")
print(f"With a mean of {mean_per_chat} messages per chat")
print(f"And a median of {median_per_chat} messages per chat")
print(f"Max messages in a chat: {busiest_chat_size}")

print()

# Rows whose is_fwd flag equals True, and their share of all messages.
forwarded = df.loc[df["is_fwd"].eq(True)]
forwarded_count = len(forwarded)
forwarded_pct = forwarded_count / total_messages * 100
print(f"Number of forwarded messages: {forwarded_count} ({forwarded_pct:.2f}%)")

# Distribution of messages over time

In [None]:
# Parse the timestamps so the column can act as a DatetimeIndex for resampling.
# Re-running this cell is safe: converting an already-parsed column is a no-op.
df['message_date'] = pd.to_datetime(df['message_date'])

# Number of messages per calendar day.
daily_messages = df.set_index('message_date').resample('D').size()

# Keep only the window shown in the chart; label slicing on a DatetimeIndex
# includes both endpoints.
relevant_daily_messages = daily_messages['2024-09-19':'2024-09-28']

# Plot once, onto an explicitly created axes, so the cell renders a single
# fully labelled figure.
fig, ax = plt.subplots(figsize=(12, 6))
relevant_daily_messages.plot(kind='bar', ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Message Count")
# Bar plots label ticks with full timestamps by default; shorten them to MM-DD.
ax.set_xticklabels(
    [d.strftime('%m-%d') for d in relevant_daily_messages.index],
    rotation=45,
    ha='right',
)
plt.show()

# Language distribution

In [37]:
def detect_row_lang(text):
  """Best-effort language detection for a single value.

  The value is stringified and handed to `detect`; its result is returned
  unchanged. None is returned when the value is missing (per `pd.isna`) or
  when `detect` raises LangDetectException.
  """
  language = None
  if not pd.isna(text):
    message = str(text)
    try:
      language = detect(message)
    except LangDetectException:
      # Detection failed for this text; fall through with None.
      pass
  return language

# langdetect samples randomly while scoring, so short or ambiguous messages can
# receive a different label on every run unless the factory seed is pinned
# before the first detection.
DetectorFactory.seed = 0

# One detected language (or None) per message.
df["language"] = df["message_text"].apply(detect_row_lang)

In [None]:
# Languages with fewer messages than this are merged into a single "other" slice.
other_cutoff = 500

# Display names for language codes; codes without an entry keep their raw label.
language_names = {
  "en": "english",
  "de": "german",
  "ru": "russian",
  "it": "italian",
}

# dropna=False keeps a bucket for messages whose language is missing.
lang_counts = df["language"].value_counts(dropna=False)

# Split on the cutoff: frequent languages keep their own slice, the rest are summed.
is_frequent = lang_counts >= other_cutoff
other_count = lang_counts[~is_frequent].sum()
lang_counts = lang_counts[is_frequent].copy()
lang_counts["other"] = other_count

# The missing-language bucket may be labelled with None or with NaN, so it is
# matched with pd.isna instead of a dictionary key (a None key would not match NaN).
lang_counts = lang_counts.rename(
  lambda label: "unclear/mixed" if pd.isna(label) else language_names.get(label, label)
)

display(lang_counts)
ax = lang_counts.plot(kind='pie', figsize=(12, 6))
# Remove the default y-axis label from the pie chart.
ax.set_ylabel("")