In [None]:
# Import necessary libraries
import pandas as pd
from datetime import timedelta

In [None]:
# Read the dataset from CSV and preview its first five rows
csv_path = "data/HBB.csv"
data = pd.read_csv(csv_path)
data.head(5)

In [None]:
# Summary statistics for the non-numerical columns
non_numeric_columns = [
    "sender",
    "message_type",
    "forwarded_from",
    "text",
    "caption",
    "photo_resolution",
    "sticker_emoji",
]
data[non_numeric_columns].describe()

In [None]:
# General info about the DataFrame, as printed by pandas' DataFrame.info()
data.info()

In [None]:
# Members of the group: the distinct values of the "sender" column, in order
# of first appearance. Series.unique() gives this directly, without the detour
# through a one-column DataFrame, `.values` and `ravel`.
data["sender"].unique()

In [None]:
# All different message types: the distinct values of the "message_type"
# column, in order of first appearance. Series.unique() gives this directly,
# without the detour through a one-column DataFrame, `.values` and `ravel`.
data["message_type"].unique()

In [None]:
# How many messages are there of each message type?
data["message_type"].value_counts()

In [None]:
# How many text messages has each member sent?
is_text_message = data["message_type"] == "text"
data.loc[is_text_message, "sender"].value_counts()

In [None]:
# Mean of the text_length column for each member of the group
text_length_by_sender = data.groupby("sender")["text_length"]
text_length_by_sender.mean()

In [None]:
# For each member of the group, list the people they replied to, ordered from
# most to fewest replies (value_counts sorts by descending count), so the first
# name on each line is the one they replied to most.
members = list(pd.unique(data["sender"].values.ravel("K")))
for member in members:
    sent_by_member = data["sender"] == member
    reply_counts = data.loc[sent_by_member, "reply_to_sender"].value_counts()
    reply_to_sender_values = ", ".join(reply_counts.index)
    print(f"{member}\t→ {reply_to_sender_values}")

In [None]:
# Which member has used the letter و most?
# Count the letter in every message (cells that are not strings count as 0),
# then total those counts per sender so the result has one row per member,
# largest total first. Without the per-sender aggregation the cell would only
# show one count per message and could not answer the question.
(
    data["text"]
    .apply(lambda x: x.count("و") if isinstance(x, str) else 0)
    .groupby(data["sender"])
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# How many replies each member has sent, excluding replies to their own messages
# NOTE(review): assumes reply_to_id is 0 for messages that are not replies — confirm
is_reply = data["reply_to_id"] != 0
replied_to_someone_else = data["reply_to_sender"] != data["sender"]
data.loc[is_reply & replied_to_someone_else, "sender"].value_counts()

In [None]:
# What percentage of each member's messages were replies
# NOTE(review): assumes reply_to_id is 0 for messages that are not replies — confirm
replies = data.loc[data["reply_to_id"] != 0, "sender"].value_counts()
all_messages = data["sender"].value_counts()
# A member who never replied is absent from `replies`; dividing directly would
# give NaN for them. Align to every sender and fill with 0 so they show 0%.
reply_counts = replies.reindex(all_messages.index, fill_value=0)
(reply_counts / all_messages * 100).sort_values(ascending=False)

In [None]:
# Mean text_length per member, rounded to two decimals, longest first
# NOTE(review): every row is included, not only message_type == "text" — confirm this is intended
mean_length = data.groupby("sender")["text_length"].mean()
mean_length.round(2).sort_values(ascending=False).to_frame("average length")

In [None]:
# How many messages from each member contain "bekiram" in any of the listed
# Latin or Persian spellings (a message is counted once, however many times
# the word appears in it).
# "bk" is matched as a whole word with \b rather than as " bk " between two
# spaces, so it is also found at the start or end of a message and next to
# punctuation, while still not matching inside longer words.
bk_en = [r"\bbk\b", "bekiram", "bkirm", "b kirm", "b kiram", "be kiram", "be kirm"]
bk_fa = ["بکیرم", "به کیرم", "ب کیرم"]
bk = bk_en + bk_fa
# case=False lets the Latin spellings match regardless of capitalisation
# (e.g. "Bekiram"); na=False treats rows without text as non-matching.
contains_bk = data["text"].str.contains("|".join(bk), case=False, na=False)
data.loc[contains_bk, "sender"].value_counts()

In [None]:
# Mean media_duration of audio messages, rounded to a whole number and passed
# to timedelta as seconds, then shown in timedelta's string form
audio_durations = data.loc[data["message_type"] == "audio", "media_duration"]
sec = int(audio_durations.mean().round())
str(timedelta(seconds=sec))