In [None]:
import datetime
from typing import Mapping
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from utils import load_data
from visualizers.breakdown import render_barplot

In [None]:
class Args(Mapping):
    """Option bundle handed to ``load_data`` and ``render_barplot`` in this notebook.

    Every option is stored as an instance attribute and is also readable
    through the read-only mapping protocol (``args["top_n"]``, ``dict(args)``,
    ``"top_n" in args``, ``**args``).

    List-valued options default to ``None`` in the signature and are replaced
    by a fresh list per instance, so one ``Args`` object can never leak
    mutations into another through a shared default. A list passed in
    explicitly is stored as-is (not copied).

    NOTE(review): the meaning of each option is defined by the consumers
    (``load_data`` / ``render_barplot``), which are not visible from here.
    """

    def __init__(
        self,
        platforms=None,
        start_date=None,
        filter_conversation=None,
        filter_sender=None,
        remove_conversation=None,
        remove_sender=None,
        outgoing_only=False,
        incoming_only=False,
        lang=None,
        contains_keyword=None,
        as_density=False,
        by_words=False,
        top_n=10,
        bin_size="1M",
        include_others=False,
    ):
        # None is the "use the default" sentinel for list options; building
        # the list here gives every instance its own object.
        self.platforms = ["telegram_json"] if platforms is None else platforms
        self.start_date = start_date
        self.filter_conversation = [] if filter_conversation is None else filter_conversation
        self.filter_sender = [] if filter_sender is None else filter_sender
        self.remove_conversation = [] if remove_conversation is None else remove_conversation
        self.remove_sender = [] if remove_sender is None else remove_sender
        self.outgoing_only = outgoing_only
        self.incoming_only = incoming_only
        self.lang = [] if lang is None else lang
        self.contains_keyword = [] if contains_keyword is None else contains_keyword
        self.as_density = as_density
        self.by_words = by_words
        self.top_n = top_n
        self.bin_size = bin_size
        self.include_others = include_others

    def __getitem__(self, key):
        """Return the option stored under ``key``; raise ``KeyError`` if absent."""
        # Look the key up in the same dict that __iter__/__len__ expose. A
        # missing key must raise KeyError (not AttributeError) because the
        # inherited Mapping.__contains__ and Mapping.get only catch KeyError.
        return self.__dict__[key]

    def __iter__(self):
        """Iterate over the option names."""
        return iter(self.__dict__)

    def __len__(self):
        """Return the number of stored options."""
        return len(self.__dict__)



In [None]:
def pltfy(ax, title=None, x_label=None, y_label=None):
    """Apply the notebook's common styling to ``ax`` and return it.

    Sets the figure size to 15x7 inches, rotates the x tick labels by 40
    degrees (right-aligned) and, when given, sets the title and axis labels.

    Args:
        ax: Matplotlib axes to style (as returned by the seaborn plotters).
        title: Optional axes title; skipped when falsy.
        x_label: Optional x-axis label; skipped when falsy.
        y_label: Optional y-axis label; skipped when falsy.

    Returns:
        The same ``ax`` object, for chaining or display.
    """
    figure_size = (15, 7)
    # Kept from the original: sets seaborn's defaults and records the size in
    # rcParams for figures created from now on.
    sns.set(rc={'figure.figsize': figure_size})
    # rcParams do not touch figures that already exist, so resize the figure
    # that owns `ax` explicitly; otherwise the first plot after a kernel
    # restart comes out at a different size than later ones.
    ax.figure.set_size_inches(*figure_size)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    if title:
        ax.set_title(title)
    if x_label:
        ax.set_xlabel(x_label)
    if y_label:
        ax.set_ylabel(y_label)
    return ax

In [None]:
# Options for the all-time chart: no start_date is given, so the Args default
# (None) applies. How each option is interpreted is up to load_data and
# render_barplot.
args = Args(
    by_words=True,
    include_others=True,
    top_n=20,
    bin_size="1M",
)
df = load_data(args)
fig = render_barplot(df, args)
fig.suptitle("Total words by chat (all time)")
plt.show()

In [None]:
# Same chart restricted to the last 90 days: start_date is the ISO-formatted
# (YYYY-MM-DD) date 90 days before today.
args = Args(
    start_date=(datetime.date.today() - datetime.timedelta(days=90)).isoformat(),
    by_words=True,
    include_others=True,
    top_n=10,
    bin_size="1W",
)
df = load_data(args)
fig = render_barplot(df, args)
fig.suptitle("Total words by chat (90 days)")
plt.show()

In [None]:
def autopct_format(values):
    """Build a pie-chart ``autopct`` callback showing percent and absolute value.

    Args:
        values: Iterable of the numbers the pie is drawn from.

    Returns:
        A function taking a wedge's percentage and returning a label such as
        ``'25.0% (12)'``, where the number in parentheses is the percentage
        converted back to an absolute value and rounded to an integer.
    """
    # Sum once up front: the total is the same for every wedge, and a one-shot
    # iterable would yield 0 on every call after the first if summed lazily.
    total = sum(values)

    def my_format(pct):
        # int(...) keeps the value compatible with the 'd' format code even
        # when round() hands back a non-builtin numeric type.
        val = int(round(pct * total / 100.0))
        return '{:.1f}% ({v:d})'.format(pct, v=val)

    return my_format

# Rows per chat in whichever frame `df` currently holds, drawn as a pie whose
# wedges are labelled with both the percentage and the absolute count.
counts = df["conversationWithName"].value_counts()
counts.plot.pie(
    autopct=autopct_format(counts),
    figsize=(8, 8),
    title="Total messages by chat (90 days)",
    ylabel="",
)

In [None]:
args = Args(by_words=False, include_others=True, top_n=25)
df = load_data(args)
# Words per message: number of whitespace-separated tokens in the text.
df["word_count"] = df["text"].map(lambda text: len(text.split()))
# Interpret the raw timestamp values as seconds since the Unix epoch.
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
df.head()

In [None]:
# Distinct chat names found in df, sorted in place. Later cells pass this as
# `order=` to sns.barplot so their bars share one fixed chat ordering.
order = df["conversationWithName"].unique()
order.sort()

In [None]:
# Bars follow value_counts() order, i.e. the chat with the most rows comes first.
chats_by_volume = df["conversationWithName"].value_counts().index
pltfy(
    sns.countplot(data=df, x="conversationWithName", order=chats_by_volume),
    title="Total messages by chat",
    x_label="Chat",
    y_label="Messages",
)

In [None]:
# One bar per chat (in the shared `order`), aggregating word_count over that
# chat's messages with barplot's default estimator.
words_per_message_ax = sns.barplot(
    data=df,
    x="conversationWithName",
    y="word_count",
    order=order,
)
pltfy(
    words_per_message_ax,
    title="Average words per message by chat",
    x_label="Chat",
    y_label="Words per message",
)

In [None]:
# Same per-chat aggregation of word_count, with each chat split into one bar
# per value of the `outgoing` column.
words_by_direction_ax = sns.barplot(
    data=df,
    x="conversationWithName",
    y="word_count",
    hue="outgoing",
    order=order,
)
pltfy(
    words_by_direction_ax,
    title="Average words per message by chat (split by incoming/ outgoing)",
    x_label="Chat",
    y_label="Words per message",
)

In [None]:
# Messages (non-null `text`) per chat and direction, one column per value of
# `outgoing`. Only the `text` column is counted instead of every column.
# fill_value=0 plus the reindex guarantee both a False and a True column with
# zeros where a chat has traffic in only one direction, so such chats show up
# as -100 % / +100 % instead of NaN (or a KeyError when a direction is absent
# from the whole frame).
# NOTE(review): assumes `outgoing` holds real booleans, as the original
# df_balance[True] / df_balance[False] lookups already did.
df_balance = (
    df.groupby(["conversationWithName", "outgoing"])["text"]
    .count()
    .unstack(fill_value=0)
    .reindex(columns=[False, True], fill_value=0)
)
# (outgoing - incoming) / (outgoing + incoming), expressed in percent.
df_balance = (df_balance[True] - df_balance[False]) / (df_balance[True] + df_balance[False]) * 100
pltfy(
    sns.barplot(x=df_balance.index, y=df_balance.values),
    title="Relative difference between outgoing (+) and incoming (-) messages by chat",
    x_label="Chat", y_label="%"
)

In [None]:
# Total words per chat and direction, one column per value of `outgoing`.
# Only `word_count` is summed: summing the whole frame would also try to add
# up the datetime `timestamp` column (a TypeError on pandas 2.x) and
# concatenate every message text.
# fill_value=0 plus the reindex guarantee both a False and a True column with
# zeros where a chat has traffic in only one direction, so such chats show up
# as -100 % / +100 % instead of NaN (or a KeyError when a direction is absent
# from the whole frame).
# NOTE(review): assumes `outgoing` holds real booleans, as the original
# df_balance[True] / df_balance[False] lookups already did.
df_balance = (
    df.groupby(["conversationWithName", "outgoing"])["word_count"]
    .sum()
    .unstack(fill_value=0)
    .reindex(columns=[False, True], fill_value=0)
)
# (outgoing - incoming) / (outgoing + incoming), expressed in percent.
df_balance = (df_balance[True] - df_balance[False]) / (df_balance[True] + df_balance[False]) * 100
pltfy(
    sns.barplot(x=df_balance.index, y=df_balance.values),
    title="Relative difference between outgoing (+) and incoming (-) total words by chat",
    x_label="Chat", y_label="%"
)

In [None]:
# Messages (non-null `text`) per chat and hour of day: chats as rows, hours as
# columns. Only the `text` column is counted instead of every column.
df_dof = (
    df.groupby(["conversationWithName", df["timestamp"].dt.hour])["text"]
    .count()
    .unstack(fill_value=0)
    # Keep all 24 hours on the axis, in order, even when no message at all
    # falls into some hour; otherwise those columns silently disappear.
    .reindex(columns=range(24), fill_value=0)
)
# Normalise each row so a chat's cells add up to 1 (share of its messages).
df_dof = df_dof.div(df_dof.sum(axis=1), axis=0)
pltfy(
    sns.heatmap(df_dof, cmap="YlGnBu"),
    title="Distribution of messages per chat by hour of day",
    x_label="Hour of day", y_label="Chat"
)

In [None]:
# Messages (non-null `text`) per chat and weekday: chats as rows, weekdays as
# columns (pandas numbering: 0 = Monday ... 6 = Sunday). Only the `text`
# column is counted instead of every column.
df_dof = (
    df.groupby(["conversationWithName", df["timestamp"].dt.weekday])["text"]
    .count()
    .unstack(fill_value=0)
    # Keep all seven weekdays on the axis, in order, even when no message at
    # all falls on one of them; otherwise that column silently disappears.
    .reindex(columns=range(7), fill_value=0)
)
# Normalise each row so a chat's cells add up to 1 (share of its messages).
df_dof = df_dof.div(df_dof.sum(axis=1), axis=0)
pltfy(
    sns.heatmap(df_dof, cmap="YlGnBu"),
    title="Distribution of messages per chat by day of week",
    x_label="Day of week", y_label="Chat"
)