In [None]:
# Import necessary libraries
import os
from collections import Counter
from datetime import timedelta
from datetime import datetime as dt
from functools import reduce

from IPython.display import HTML, display
from emex import demex
from tabulate import tabulate
import matplotlib.pyplot as plt;
import numpy as np
import pandas as pd
import regex, operator
%matplotlib inline

In [None]:
# Load data and show first five rows

group = "UI"

# Later cells save figures under output/<group>/ and a text dump under output/.
# Create the directory up front so "Restart & Run All" works on a fresh checkout.
os.makedirs(f"output/{group}", exist_ok=True)

data = pd.read_csv(f"data/{group}.csv")
data.head()

In [None]:
# Summary statistics (count / unique / top / freq) for the non-numerical columns
non_numeric_columns = [
    "sender",
    "message_type",
    "forwarded_from",
    "text",
    "caption",
    "photo_resolution",
    "sticker_emoji",
]
data[non_numeric_columns].describe()

In [None]:
# General info about the DataFrame: columns, non-null counts, dtypes and memory usage
data.info()

In [None]:
# Members of the group: distinct senders, in order of first appearance
sender_values = data[["sender"]].to_numpy().ravel(order="K")
pd.unique(sender_values)

In [None]:
# All different message types, in order of first appearance
message_type_values = data[["message_type"]].to_numpy().ravel(order="K")
pd.unique(message_type_values)

In [None]:
# How many messages are there of each message type? (most frequent first)
data["message_type"].value_counts()

In [None]:
# How many text messages has each member sent?
is_text_message = data["message_type"] == "text"
data.loc[is_text_message, "sender"].value_counts()

In [None]:
# Average message length for each member of the group
text_length_by_sender = data.groupby("sender")["text_length"]
text_length_by_sender.mean()

In [None]:
# For each member of the group, list whom they replied to, most frequent first
# (value_counts orders its index by descending count).
members = list(pd.unique(data["sender"].to_numpy().ravel("K")))
for member in members:
    sent_by_member = data["sender"] == member
    replied_to = data.loc[sent_by_member, "reply_to_sender"].value_counts().index
    reply_to_sender_values = ', '.join(replied_to)
    print(f"{member}\t→ {reply_to_sender_values}")

In [None]:
# Which member has used the letter و most?
# Count the letter in every message (non-string cells such as NaN count as 0),
# then total the counts per sender so the heaviest user comes first.
waw_per_message = data["text"].apply(lambda text: text.count('و') if isinstance(text, str) else 0)
waw_per_message.groupby(data["sender"]).sum().sort_values(ascending=False)

In [None]:
# How many replies each member sent, excluding replies to their own messages
is_reply = data["reply_to_id"] != 0
to_someone_else = data["reply_to_sender"] != data["sender"]
data.loc[is_reply & to_someone_else, "sender"].value_counts()

In [None]:
# What percentage of each member's messages were replies
all_messages = data["sender"].value_counts()
replies = data.loc[data["reply_to_id"] != 0, "sender"].value_counts()
# Members who never replied are absent from `replies`; align on every sender and
# fill with 0 so they show up as 0% instead of NaN.
replies = replies.reindex(all_messages.index, fill_value=0)
(replies / all_messages * 100).sort_values(ascending=False)

In [None]:
# Average length of text messages per member, longest first, as a one-column table
average_length = data.groupby("sender")["text_length"].mean()
average_length.round(2).sort_values(ascending=False).to_frame("average length")

In [None]:
# How many messages per member contain "bekiram" in any of its spellings
bk_en = [" bk ", "bekiram", "bkirm", "b kirm", "b kiram", "be kiram", "be kirm"]
bk_fa = ["بکیرم", "به کیرم", "ب کیرم"]
bk = [*bk_en, *bk_fa]
# The spellings are OR-ed into a single regex alternation; NaN text counts as no match
bk_pattern = '|'.join(bk)
mentions_bk = data["text"].str.contains(bk_pattern, na=False)
data.loc[mentions_bk, "sender"].value_counts()

In [None]:
# Average audio length, formatted as H:MM:SS
audio_durations = data.loc[data["message_type"] == "audio", "media_duration"]
mean_seconds = audio_durations.mean()
# mean() is NaN when there are no audio rows (or no recorded durations);
# int(NaN) would raise, so report that case explicitly.
if pd.isna(mean_seconds):
    avg_time = "n/a"
else:
    sec = int(mean_seconds)
    avg_time = str(timedelta(seconds=sec))
avg_time

In [None]:
# Audio stats for {Amir, Hossein, and Shahin}
members = ["Amir", "Hossein", "Shahin"]
is_audio = data["message_type"] == "audio"
for member in members:
    durations = data.loc[is_audio & (data["sender"] == member), "media_duration"]
    count = len(durations.index)
    mean_seconds = durations.mean()
    if pd.isna(mean_seconds):
        # No audio (or no recorded duration) for this member: int(NaN) would raise
        print(f"{member} sent {count} songs.")
        continue
    sec = int(mean_seconds)
    avg_time = str(timedelta(seconds=sec))
    print(f"{member} sent {count} songs with an average of {avg_time}.")

In [None]:
# Extract {day_of_week, day_of_month, hour_of_day, month_of_year} from message_date.
# message_date itself is kept: the "average messages per day" cell further down
# still reads the raw timestamps.
# Timestamps are treated as Unix seconds in UTC, matching dt.utcfromtimestamp.
message_time = pd.to_datetime(data["message_date"], unit="s")
date_parts = (
    ("day_of_week", "%w"),    # "0" (Sunday) .. "6" (Saturday)
    ("day_of_month", "%d"),   # "01" .. "31"
    ("hour_of_day", "%H"),    # "00" .. "23"
    ("month_of_year", "%m"),  # "01" .. "12"
)

# Insert the derived columns right before message_date instead of at fixed
# positions, and skip any that already exist so the cell can be re-run.
insert_at = data.columns.get_loc("message_date")
for column, date_format in date_parts:
    if column in data.columns:
        continue
    data.insert(loc=insert_at, column=column, value=message_time.dt.strftime(date_format))
    insert_at += 1

data.head()

In [None]:
# Number of messages per day of week
days = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]
# day_of_week holds strftime("%w") codes: "0" (Sunday) .. "6" (Saturday)
day_codes = [str(code) for code in range(len(days))]

all_df = data["day_of_week"].astype(str).value_counts()
amir_df = data.loc[data["sender"] == "Amir", "day_of_week"].astype(str).value_counts()

# Reindex on all seven codes so a weekday without messages becomes 0 instead of
# being dropped (which made the y-values shorter than `days` and broke the plot).
all_y_axis = all_df.reindex(day_codes, fill_value=0).tolist()
amir_y_axis = amir_df.reindex(day_codes, fill_value=0).tolist()

plt.figure(dpi=120)
# Swap in all_y_axis to plot the whole group instead of a single member.
plt.plot(days, amir_y_axis)
plt.title('Number of messages per day of week')
plt.show()

In [None]:
# On average, how many messages per day have been sent in the group since the beginning?
# Requires the raw `message_date` column (Unix timestamps in seconds) to be present in `data`.
first_message = data["message_date"].min()
last_message = data["message_date"].max()
# A group whose messages span less than ~12 hours rounds to 0 days; clamp to 1
# so the average below never divides by zero.
group_age_in_days = max(1, round((last_message - first_message) / (24 * 60 * 60)))
total_messages = len(data.index)
average_per_day = total_messages / group_age_in_days
print(f"First message: {dt.utcfromtimestamp(first_message).strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Last message: {dt.utcfromtimestamp(last_message).strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Number of messages: {total_messages}")
print(f"On average, {average_per_day:0.2f} messages per day has been sent in the group since the beginning.")

In [None]:
# Most used emojis
def most_used_emojis(member):
    """Summarise the emojis a member used in their own (non-forwarded) messages.

    Reads the notebook-level `data` frame: rows where `sender` equals `member`
    and `is_forwarded` is 0, taking the non-null `text_emojis` strings.

    Returns a three-item list `[count, member, emojis]`, where `count` is the
    total number of emoji graphemes and `emojis` is the (up to) ten most common
    ones joined into a single string. A member without emojis yields
    `[0, member, ""]`, so every row has the same shape.
    """
    own_messages = (data["sender"] == member) & (data["is_forwarded"] == 0)
    emoji_strings = data.loc[own_messages, "text_emojis"].dropna()

    # \X matches one extended grapheme cluster, so multi-codepoint emojis
    # (skin tones, ZWJ sequences) are counted as a single emoji.
    # Flatten in one pass instead of repeatedly concatenating lists.
    emojis_list = [
        grapheme
        for emoji_string in emoji_strings
        for grapheme in regex.findall(r"\X", emoji_string)
    ]

    if not emojis_list:
        return [0, member, ""]

    most_common = Counter(emojis_list).most_common(10)
    emojis = ''.join(emoji for emoji, _ in most_common)
    return [len(emojis_list), member, emojis]

members = list(pd.unique(data["sender"].to_numpy().ravel("K")))

# One row per member as returned by most_used_emojis
emojis = [most_used_emojis(member) for member in members]

# Keep only members who used at least one emoji, heaviest users first
reports = sorted(
    (row for row in emojis if row[0] > 0),
    key=lambda row: row[0],
    reverse=True,
)

# Render as an HTML table in the notebook output
report_table = tabulate(reports, headers=["count", "name", "most used emojis"], tablefmt='html')
display(HTML(report_table))

In [None]:
# Average audio length for each member

report = []

def average_audio_length(member):
    """Return the member's mean audio duration, truncated to an int, or None.

    Reads the notebook-level `data` frame: rows where `sender` equals `member`
    and `message_type` is "audio". None is returned when the member has no
    audio rows or none of them has a recorded `media_duration` (the mean is
    NaN in both cases, and int(NaN) would raise).
    """
    is_members_audio = (data["sender"] == member) & (data["message_type"] == "audio")
    mean_duration = data.loc[is_members_audio, "media_duration"].mean()
    if pd.isna(mean_duration):
        return None
    return int(mean_duration)

members = list(pd.unique(data["sender"].values.ravel('K')))
for member in members:
    avg = average_audio_length(member)
    # Compare against None explicitly: a legitimate average of 0 is falsy
    # and would otherwise be dropped from the report.
    if avg is not None:
        report.append((member, avg))

# Shortest average first, so the longest bar ends up at the top of the chart
report = sorted(report, key=lambda x: x[1])

if report:
    # At least one member has a numeric average, so the overall mean is numeric too
    media_duration_overall = int(data.loc[data["message_type"] == "audio"]["media_duration"].mean())

    y, x = list(zip(*report))
    y_pos = np.arange(len(y))

    plt.figure(dpi=120)

    plt.barh(y_pos, list(x), align="center", alpha=0.5, color="b")
    plt.yticks(y_pos, y)
    # Vertical line marks the group-wide average for comparison
    plt.axvline(x=media_duration_overall, linewidth=1, color='k')
    plt.xlabel("seconds")
    plt.title("Average audio length")
    plt.show()
else:
    # Unpacking an empty report would raise; there is simply nothing to plot
    print("No audio messages with a recorded duration.")

In [None]:
# Extract {day_of_week, day_of_month, hour_of_day, month_of_year} from message_date and drop message_date
def extract_and_delete(data):
    """Return a copy of `data` with message_date replaced by four date-part columns.

    `message_date` is read as Unix timestamps in seconds (UTC). The derived
    columns are zero-padded strftime strings inserted where `message_date` was:
    day_of_week ("%w", "0" = Sunday), day_of_month ("%d"), hour_of_day ("%H")
    and month_of_year ("%m").

    The input frame is not modified. The function can be applied repeatedly:
    derived columns that already exist are left untouched, and a frame without
    `message_date` is returned as an unchanged copy.
    """
    result = data.copy()
    if "message_date" not in result.columns:
        return result

    # Convert once, vectorised, instead of once per row per derived column
    message_time = pd.to_datetime(result["message_date"], unit="s")
    date_parts = (
        ("day_of_week", "%w"),
        ("day_of_month", "%d"),
        ("hour_of_day", "%H"),
        ("month_of_year", "%m"),
    )

    location = result.columns.get_loc("message_date")
    for column, date_format in date_parts:
        if column in result.columns:
            continue
        result.insert(loc=location, column=column, value=message_time.dt.strftime(date_format))
        location += 1

    return result.drop(columns=["message_date"])

# Rebind `data` to the frame returned by extract_and_delete (date-part columns
# in place of message_date) and preview the first rows.
data = extract_and_delete(data)
data.head()

In [None]:
# Number of messages per day of week
days = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
# day_of_week holds strftime("%w") codes: "0" (Sunday) .. "6" (Saturday)
day_codes = [str(code) for code in range(len(days))]

# Reindex on all seven codes: a weekday without messages becomes 0 instead of
# being dropped, which would shift every later point under the wrong label.
dow_counts = data["day_of_week"].astype(str).value_counts().reindex(day_codes, fill_value=0)
dow = list(dow_counts.items())
x, y = list(zip(*dow))

plt.figure(dpi=120)
plt.plot(y, '-go')
plt.xticks(np.arange(len(days)), days)
plt.title("Number of messages per day of week")
plt.savefig(f"output/{group}/{group} - Number of messages per day of week.png", dpi=500)
plt.show()

In [None]:
# Number of messages per day of month
# Counts ordered by the zero-padded day string ("01", "02", ...)
dom = list(data["day_of_month"].value_counts().sort_index().items())
x, y = zip(*dom)

fig, ax = plt.subplots(dpi=120)
ax.plot(y, '-go')
ax.set_xticks(np.arange(len(x)))
ax.set_xticklabels(x)
ax.set_title("Number of messages per day of month")

# Hide every second tick label to keep the axis readable
for label in ax.xaxis.get_ticklabels()[1::2]:
    label.set_visible(False)

fig.savefig(f"output/{group}/{group} - Number of messages per day of month.png", dpi=500)
plt.show()

In [None]:
# Number of messages per hour of day
# Counts ordered by the zero-padded hour string ("00", "01", ...)
hod = list(data["hour_of_day"].value_counts().sort_index().items())
x, y = zip(*hod)

fig, ax = plt.subplots(dpi=120)
ax.plot(y, '-go')
ax.set_xticks(np.arange(len(x)))
ax.set_xticklabels(x)
ax.set_title("Number of messages per hour of day")

# Hide every second tick label to keep the axis readable
for label in ax.xaxis.get_ticklabels()[1::2]:
    label.set_visible(False)

fig.savefig(f"output/{group}/{group} - Number of messages per hour of day.png", dpi=500)
plt.show()

In [None]:
# Number of messages per month of year
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# month_of_year holds zero-padded strftime("%m") codes: "01" .. "12"
month_codes = [f"{month:02d}" for month in range(1, len(months) + 1)]

# Reindex on all twelve codes: a month without messages becomes 0 instead of
# being dropped, which would plot the remaining months under the wrong labels.
moy_counts = data["month_of_year"].astype(str).value_counts().reindex(month_codes, fill_value=0)
moy = list(moy_counts.items())
x, y = list(zip(*moy))

plt.figure(dpi=120)
plt.plot(y, '-go')
plt.xticks(np.arange(len(months)), months)
plt.title("Number of messages per month of year")

plt.savefig(f"output/{group}/{group} - Number of messages per month of year.png", dpi=500)
plt.show()

In [None]:
# Kir, in both languages
# Messages per sender containing the word in Latin script (case-insensitive) or Persian script
kir_en_dict = dict(data.loc[data["text"].str.contains('kir', na=False, case=False), "sender"].value_counts())
kir_fa_dict = dict(data.loc[data["text"].str.contains('کیر', na=False), "sender"].value_counts())

# Sort names first so ties in the total always come out in the same order
# (iterating a set of strings is not stable between kernel sessions).
members = sorted(set(kir_en_dict) | set(kir_fa_dict))
members = sorted(members, key=lambda member: kir_en_dict.get(member, 0) + kir_fa_dict.get(member, 0))

# Senders missing from one language count as 0 there
kir_en = [kir_en_dict.get(member, 0) for member in members]
kir_fa = [kir_fa_dict.get(member, 0) for member in members]

if members:
    plt.figure(dpi=120, tight_layout=True)

    # Persian counts are stacked to the right of the English ones
    plt.barh(members, kir_fa, label="Persian", color='g', left=kir_en)
    plt.barh(members, kir_en, label="English", color='b')

    plt.yticks(np.arange(len(members)), members)
    plt.legend(loc="lower right")
    plt.title("Kir, in both languages")
    plt.savefig(f"output/{group}/{group} - Kir, in both languages.png", dpi=500)
    plt.show()
else:
    # Nothing matched in either language, so there is nothing to plot
    print("No matching messages.")

In [None]:
# Pie chart of percentage of messages for each member of the chat
nom = data["sender"].value_counts() # number of messages per sender
tnom = len(data.index) # total number of messages
message_count = list(zip(nom.index, nom))
# Senders with more than 3% of all messages get their own wedge
mtop = [(x, y) for x, y in message_count if (y/tnom > 0.03)]

# Everything else is lumped into "Other". Skip it when it would be empty, and
# sum with a generator so an empty `mtop` does not raise.
other = tnom - sum(count for _, count in mtop)
if other > 0:
    mtop.append(("Other", other))

mtop = sorted(mtop, key=lambda x: x[1], reverse=True)

if mtop:
    labels, sizes = list(zip(*mtop))

    plt.figure(dpi=120, tight_layout=True)
    plt.pie(sizes, labels=labels, autopct="%1.1f%%")
    plt.title("Percentage of Messages")
    plt.axis("equal")
    plt.savefig(f"output/{group}/{group} - Percentage of Messages.png")
    plt.show()
else:
    print("No messages to plot.")

In [None]:
# Pie chart of whom Hossein replies to
hossein_replies = data.loc[
    (data["sender"] == "Hossein") & (data["reply_to_sender"].notnull()), "reply_to_sender"
]
hr = hossein_replies.value_counts() # hossein replies, per replied-to member
htr = len(hossein_replies.index) # hossein total replies

message_count = list(zip(hr.index, hr))

# Members receiving more than 3% of the replies get their own wedge
mttp = [(x, y) for x, y in message_count if (y/htr > 0.03)]

# Everything else is lumped into "Other". Skip it when it would be empty, and
# sum with a generator so an empty `mttp` does not raise.
other = htr - sum(count for _, count in mttp)
if other > 0:
    mttp.append(("Other", other))

mttp = sorted(mttp, key=lambda x: x[1], reverse=True)

if mttp:
    labels, sizes = list(zip(*mttp))

    plt.figure(dpi=120, tight_layout=True)
    plt.pie(sizes, labels=labels, autopct="%1.1f%%")
    plt.title("Hossein's Replies")
    plt.axis("equal")
    plt.savefig(f"output/{group}/{group} - Hossein's Replies.png")
    plt.show()
else:
    print("No replies to plot.")

In [None]:
# Export all text into a file, one message per line
# Each message goes through emex's demex first (presumably emoji removal — confirm against emex docs)
text_messages = data.loc[data["message_type"] == "text", "text"]
text = "".join(f"{demex(t)}\n" for t in text_messages)

with open(f"output/{group}.txt", 'w', encoding="utf-8") as file:
    file.write(text)