In [None]:
# Import necessary libraries
import pandas as pd
from datetime import timedelta
from datetime import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the exported chat CSV into `data` and preview its first five rows
data = pd.read_csv(filepath_or_buffer="data/SE94.csv")
data.head(n=5)

In [None]:
# Summary statistics (count / unique / top / freq) for the non-numerical columns
data.loc[
    :,
    [
        "sender",
        "message_type",
        "forwarded_from",
        "text",
        "caption",
        "photo_resolution",
        "sticker_emoji",
    ],
].describe()

In [None]:
# General info about the DataFrame: prints each column's name, non-null count
# and dtype, plus the frame's memory usage. `info()` writes to stdout and
# returns None, so this cell has no rich output value.
data.info()

In [None]:
# Distinct senders (members of the group), in order of first appearance
pd.unique(data["sender"].to_numpy())

In [None]:
# All different message types, in order of first appearance
pd.unique(data["message_type"].to_numpy())

In [None]:
# How many messages are there of each message type? (most frequent first)
data.loc[:, "message_type"].value_counts()

In [None]:
# How many text messages has each member sent?
# Select the sender column of the rows whose message_type is "text" in a
# single .loc call, then count per sender.
data.loc[data["message_type"].eq("text"), "sender"].value_counts()

In [None]:
# Mean of `text_length` for each member of the group.
# NOTE(review): the mean runs over every row of a sender, not only rows with
# message_type == "text" — confirm that is intended.
data.groupby(by=["sender"])["text_length"].agg("mean")

In [None]:
# For each member of the group, list the people they replied to,
# ordered from most-replied-to to least-replied-to.
members = list(pd.unique(data["sender"].to_numpy()))
for member in members:
    # value_counts() sorts by frequency (descending) and drops missing values,
    # so its index is already the ranking we want to print.
    replied_to_ranking = data.loc[data["sender"] == member, "reply_to_sender"].value_counts().index
    reply_to_sender_values = ', '.join(replied_to_ranking)
    print(f"{member}\t→ {reply_to_sender_values}")

In [None]:
# Which member has used the letter و most?
# Count the letter in every message (non-string values such as NaN count as 0),
# then total the counts per sender so the top row answers the question.
letter_counts = data["text"].map(lambda text: text.count('و') if isinstance(text, str) else 0)
letter_counts.groupby(data["sender"]).sum().sort_values(ascending=False)

In [None]:
# How many replies has each member sent, excluding replies to their own messages?
# A row counts when reply_to_id is non-zero and the replied-to sender differs
# from the sender of the row itself.
data.loc[
    data["reply_to_id"].ne(0) & data["reply_to_sender"].ne(data["sender"]),
    "sender",
].value_counts()

In [None]:
# What percentage of each member's messages were replies
# NOTE(review): treats reply_to_id == 0 as "not a reply"; if missing values are
# stored as NaN instead, they would be counted as replies — confirm.
replies = data.loc[data["reply_to_id"] != 0, "sender"].value_counts()
all_messages = data["sender"].value_counts()
# Members who never replied are absent from `replies`; align on the full member
# list with 0 so they show up as 0% instead of NaN.
replies = replies.reindex(all_messages.index, fill_value=0)
(replies / all_messages * 100).sort_values(ascending=False)

In [None]:
# Mean `text_length` per member, rounded to two decimals and sorted from the
# longest average to the shortest, shown as a one-column frame.
(
    data.groupby(by=["sender"])["text_length"]
    .agg("mean")
    .round(decimals=2)
    .sort_values(ascending=False)
    .to_frame(name="average length")
)

In [None]:
# Per member, the number of messages containing any spelling variant of
# "bekiram" (Latin transliterations and Persian script). A message is counted
# once no matter how many times the phrase occurs in it.
bk_en = [" bk ", "bekiram", "bkirm", "b kirm", "b kiram", "be kiram", "be kirm"]
bk_fa = ["بکیرم", "به کیرم", "ب کیرم"]
bk = [*bk_en, *bk_fa]
# The variants are OR-ed into a single regular expression; messages with no
# text (NaN) are treated as non-matching.
bk_pattern = '|'.join(bk)
data.loc[data["text"].str.contains(bk_pattern, na=False), "sender"].value_counts()

In [None]:
# Average duration of audio messages, formatted as H:MM:SS
audio_durations = data.loc[data["message_type"] == "audio", "media_duration"]
sec = int(audio_durations.mean())  # truncate to whole seconds
avg_time = str(timedelta(seconds=sec))
avg_time

In [None]:
# Audio stats for {Amir, Hossein, and Shahin}: number of audio messages sent
# and their average duration.
members = ["Amir", "Hossein", "Shahin"]
for member in members:
    # Filter once and reuse the result for both the count and the mean.
    audio_durations = data.loc[
        (data["message_type"] == "audio") & (data["sender"] == member),
        "media_duration",
    ]
    count = len(audio_durations.index)
    mean_duration = audio_durations.mean()
    if pd.isna(mean_duration):
        # No audio rows (or no recorded durations): int(NaN) would raise
        # ValueError, so report the count without an average instead.
        print(f"{member} sent {count} songs; no average duration available.")
        continue
    sec = int(mean_duration)  # truncate to whole seconds
    avg_time = str(timedelta(seconds=sec))
    print(f"{member} sent {count} songs with an average of {avg_time}.")

In [None]:
# Extract {day_of_week, day_of_month, hour_of_day, month_of_year} from message_date.
#
# message_date is read as a Unix timestamp in seconds and interpreted as UTC.
# Each derived column holds zero-padded strings produced by strftime:
#   day_of_week   -> "%w"  ("0" = Sunday ... "6" = Saturday)
#   day_of_month  -> "%d"
#   hour_of_day   -> "%H"
#   month_of_year -> "%m"
#
# message_date itself is kept: the messages-per-day cell further down still
# reads data["message_date"], so deleting it here breaks a top-to-bottom run.
message_time = pd.to_datetime(data["message_date"], unit="s")

date_part_formats = {
    "day_of_week": "%w",
    "day_of_month": "%d",
    "hour_of_day": "%H",
    "month_of_year": "%m",
}
for position, (column, fmt) in enumerate(date_part_formats.items(), start=6):
    # Drop a previously inserted copy so the cell can be re-run safely;
    # DataFrame.insert raises ValueError when the column already exists.
    if column in data.columns:
        del data[column]
    data.insert(loc=position, column=column, value=message_time.dt.strftime(fmt))

data.head()

In [None]:
# Number of messages per day of week (Amir's messages are plotted)
days = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]
# day_of_week holds strftime("%w") codes as strings: "0" (Sunday) .. "6" (Saturday).
day_codes = [str(code) for code in range(len(days))]

# Reindex on all seven codes so a weekday without messages becomes 0 instead of
# being absent, which would leave fewer than 7 y-values and break the plot.
all_df = data["day_of_week"].value_counts().reindex(day_codes, fill_value=0)
amir_df = data.loc[data["sender"] == "Amir", "day_of_week"].value_counts().reindex(day_codes, fill_value=0)

all_y_axis = all_df.tolist()
amir_y_axis = amir_df.tolist()

plt.figure(dpi=120)
# plt.plot(days, all_y_axis)  # optional overlay: counts for the whole group
plt.plot(days, amir_y_axis)
plt.title('Number of messages per day of week')
plt.show()

In [None]:
# Number of messages per day of week for all members of the group, including the average
# Relies on `days` (the weekday labels) defined in the previous plotting cell.
plt.figure(dpi=120)

all_members = pd.unique(data[["sender"]].values.ravel("K"))
# day_of_week holds strftime("%w") codes as strings: "0" (Sunday) .. "6" (Saturday).
day_codes = [str(code) for code in range(len(days))]

# Group-wide count per weekday divided by the number of members = average per member.
all_df = data["day_of_week"].value_counts().reindex(day_codes, fill_value=0) / len(all_members)
all_y_axis = all_df.tolist()
plt.plot(days, all_y_axis, label="average", dashes=(0.6, 5), dash_capstyle='round', color='black')

for member in all_members:
    # Reindex on all seven codes so members who were silent on some weekday are
    # still plotted (with 0 for that day) instead of being silently left out.
    member_df = data.loc[data["sender"] == member, "day_of_week"].value_counts().reindex(day_codes, fill_value=0)
    member_y_axis = member_df.tolist()
    if sum(member_y_axis) == 0:
        # Nothing to draw, e.g. a missing (NaN) sender that matches no rows.
        continue
    plt.plot(days, member_y_axis, label=member)

plt.title('Number of messages per day of week')
plt.legend(loc="upper center", bbox_to_anchor=(0.5, 1), ncol=3, prop={"size": 7})
plt.savefig("Number of messages per day of week - Music and Other Shits.png", dpi=500)
plt.show()

In [None]:
# On average, how many messages per day have been sent in the group since the beginning?
# NOTE: this cell reads data["message_date"] (Unix timestamps in seconds), so
# that column must still be present in `data` when the cell runs.
SECONDS_PER_DAY = 24 * 60 * 60

first_message = data["message_date"].min()
last_message = data["message_date"].max()
# Round to whole days, but never below 1: a history shorter than half a day
# would otherwise round to 0 and make the division below fail.
group_age_in_days = max(1, round((last_message - first_message) / SECONDS_PER_DAY))
total_messages = len(data.index)
average_per_day = total_messages / group_age_in_days
print(f"First message: {dt.utcfromtimestamp(first_message).strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Last message: {dt.utcfromtimestamp(last_message).strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Number of messages: {total_messages}")
print(f"On average, {average_per_day:0.2f} messages per day has been sent in the group since the beginning.")