# Imports

In [None]:
import os
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import wa_parser
import sc_parser
from os import path
from wordcloud import WordCloud, STOPWORDS

# Input chat export and the shared colors / window sizes used by the plots below.
FILEPATH = r'./chats/chat.txt'
BACKGROUND_COLOR = 'white'
PRIMARY_COLOR = '#e76f51'
LOOKBACK_WEEKS = 104
ROLLING_INTERVAL = 14  # window size of the rolling mean over the daily message counts

# Figure and axes share one background color; all four axis spines are hidden.
plt.rcParams.update({
    "figure.facecolor": BACKGROUND_COLOR,
    "axes.facecolor": BACKGROUND_COLOR,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "axes.spines.left": False,
    "axes.spines.bottom": False,
})

# spaCy pipeline used further down for tokenizing, lemmatizing and stop-word flags.
nlp = spacy.load('de_core_news_sm')

# Data Loading

In [None]:
from visualizations_setup import DATEFORMAT, TIMEFORMAT

# Parse the chat export at FILEPATH with the configured date and time formats.
df_wa = wa_parser.get_df_from_chatlog(
    FILEPATH,
    dateformat = DATEFORMAT,
    timeformat = TIMEFORMAT,
)
df_wa

In [None]:
# Load the second chat source; sc_parser is called without arguments, so it
# locates its own input (presumably a fixed path inside sc_parser — confirm there).
df_sc = sc_parser.get_df_from_chatlog()
# Bare expression so the notebook displays the frame.
df_sc

In [None]:
# Stack both chat sources into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
df = pd.concat([df_wa, df_sc], ignore_index = True)
df

# Visualizations

## Lineplot - Messages per Day

In [None]:
# Messages per calendar day, indexed by datetime.date.
daily_counts = df.groupby(by = df['datetime'].dt.date)['message'].count()

# Days without any message are absent from the groupby result. Fill them in with
# a count of 0 so the rolling window below really spans ROLLING_INTERVAL calendar
# days rather than ROLLING_INTERVAL rows that happen to contain messages.
if not daily_counts.empty:
    all_days = pd.date_range(daily_counts.index.min(), daily_counts.index.max(), freq = 'D').date
    daily_counts = daily_counts.reindex(all_days, fill_value = 0)

df_daily_mess = daily_counts.rename_axis('datetime').reset_index()

# Centered rolling mean; at least 3 observations are required per window.
df_daily_mess["messsagecount_rm"] = df_daily_mess["message"].rolling(ROLLING_INTERVAL,
                                                                     min_periods = 3, center = True).mean()

In [None]:
# Line plot of the smoothed daily message count.
fig, ax = plt.subplots(figsize = (8, 3.5))
sns.lineplot(data = df_daily_mess, x = 'datetime', y = 'messsagecount_rm',
             ax = ax, color = PRIMARY_COLOR, lw = 2)

# Series.max() skips NaN and returns NaN for empty or all-NaN data, whereas the
# builtin max() raises on an empty array. set_ylim rejects NaN, so the upper
# limit is only set when a finite peak exists.
peak = df_daily_mess['messsagecount_rm'].max()
if pd.notna(peak):
    ax.set_ylim(0, np.ceil(peak))

ax.tick_params(axis = 'x', rotation = 45)
ax.set_title(f"Messages per Day ({ROLLING_INTERVAL} Days Rolling Mean)", fontdict = {"fontsize": 15})
# The title already says what is plotted, so both axis labels are removed.
ax.set_xlabel(None)
ax.set_ylabel(None)
plt.tight_layout()

## Sunburst Chart - Messages per Daytime

In [None]:
hours_in_day = 24
circle_length = 2 * np.pi
bar_width = circle_length / hours_in_day

# Message count per hour. Hours without any message are missing from the groupby
# result, which would shift every later bar away from its tick label, so the
# result is reindexed to all 24 hours with a count of 0 for the gaps.
# NOTE(review): assumes df['hour'] holds numeric hours 0-23 — confirm against the parsers.
df_circle = (df.groupby(by = 'hour')['message'].count()
               .reindex(range(hours_in_day), fill_value = 0)
               .rename_axis('hour')
               .reset_index())
count = df_circle['message'].to_numpy()

fig = plt.figure(figsize = (8, 4))
ax = plt.subplot(111, projection = "polar")

# One slot per hour; bars are centered inside their slot, ticks sit at slot starts.
slot_starts = np.linspace(0, circle_length, hours_in_day, endpoint = False)
x = slot_starts + bar_width / 2

# All hours, semi-transparent.
bars = ax.bar(x, count, width = bar_width,
              alpha = 0.4, color = PRIMARY_COLOR, bottom = 0)

# Busiest hour drawn again fully opaque as a highlight.
max_ind = np.argmax(count)
ax.bar(x[max_ind], count[max_ind], bottom = 0,
       width = bar_width, alpha = 1, color = PRIMARY_COLOR)

# Faint full-height outlines so every hour slot stays visible.
ax.bar(x, np.max(count) * np.ones(len(count)), width = bar_width,
    alpha = 0.15, bottom = 0, color = BACKGROUND_COLOR, edgecolor = "black")

# Clock-like layout: hour 0 at the top, increasing clockwise.
ax.set_theta_direction(-1)
ax.set_theta_offset(np.pi / 2)
ax.grid(False)
ax.spines["polar"].set_visible(False)
ax.set_xticks(slot_starts)
ticks = [str(i) for i in range(hours_in_day)]
ax.set_xticklabels(ticks)
ax.set_title("Messages per Daytime", fontdict = {"fontsize": 15})
plt.setp(ax.get_yticklabels(), visible = False)
plt.tight_layout()

## WordCloud

In [None]:
from visualizations_setup import MEDIA_FLAG

# Chat-export boilerplate and link fragments that should never show up in the cloud.
custom_stopwords = ['media', 'omitted', 'missed', 'voice', 'call', 'http', 'https']

# Split every message that does not contain MEDIA_FLAG into lower-cased words.
messages = [message.split() for message in df["message"].values if MEDIA_FLAG not in message]
words = [word.lower() for sublist in messages for word in sublist]

# set.update() mutates in place and returns None, so its result must not be
# assigned: extend the shared wordcloud set first, then take a copy as `stopwords`.
STOPWORDS.update(custom_stopwords)
stopwords = set(STOPWORDS)

for word in custom_stopwords:
    nlp.Defaults.stop_words.add(word)
    # Also flag the vocabulary entry itself, so the word counts as a stop word
    # even if its lexeme was created before the defaults were extended.
    nlp.vocab[word].is_stop = True

In [None]:
def valid_token(token):
    """Return True for tokens that should be kept for the word cloud.

    A token is rejected when it is flagged as a stop word or as punctuation,
    or when its lemma starts with "http" (which also covers "https").
    """
    lemma = token.lemma_
    if token.is_stop or token.is_punct:
        return False
    return not lemma.startswith(("https", "http"))

In [None]:
# Lemmatize in batches instead of one nlp() call on the whole corpus: spaCy
# raises a ValueError once a single text exceeds nlp.max_length (1,000,000
# characters by default), which a long chat history can reach.
# Batches are cut at word boundaries, so no word is split.
WORDS_PER_BATCH = 10_000
word_batches = (" ".join(words[start:start + WORDS_PER_BATCH])
                for start in range(0, len(words), WORDS_PER_BATCH))

# Keep the lemma of every token that passes valid_token.
lemmatized_words = [token.lemma_
                    for doc in nlp.pipe(word_batches)
                    for token in doc
                    if valid_token(token)]

In [None]:
# Rendering options for the word cloud, collected in one place.
cloud_options = dict(
    stopwords = stopwords,
    max_font_size = 90,
    width = 800,
    height = 400,
    background_color = BACKGROUND_COLOR,
    colormap = 'autumn',
    min_word_length = 3,
    max_words = 400,
    min_font_size = 12,
)

# The cloud is generated from the lemmatized words joined into one string.
cloud_text = ' '.join(lemmatized_words)
wordcloud = WordCloud(**cloud_options).generate(cloud_text)

# Show the rendered cloud as an image without axes.
plt.figure(figsize = (8, 4))
plt.imshow(wordcloud, interpolation = "bilinear")
plt.axis("off")
plt.tight_layout()

In [None]:
from mask import create_masked
# Pass the lemmatized corpus as one string to the project's masked-cloud helper.
# "wc_heart" is its second positional argument (presumably a mask or output
# name — confirm in mask.py).
create_masked(" ".join(lemmatized_words), "wc_heart", contour_width = 0, colormap = "autumn")

In [None]:
from visualizations_setup import is_emoji, split_count, replacements

# Keep only the words that contain at least one emoji character.
emoji_list = [candidate for candidate in words if any(map(is_emoji, candidate))]

# split_count returns a mapping of emoji -> number of occurrences.
emoji_count = split_count(" ".join(emoji_list))

# Flatten the mapping: each emoji is repeated as often as it was counted.
em = [symbol for symbol, occurrences in emoji_count.items() for _ in range(occurrences)]

# Join into one string and apply every substitution from `replacements`.
emoji_text = " ".join(em)
for old, new in replacements.items():
    emoji_text = emoji_text.replace(old, new)

In [None]:
from visualizations_setup import EmojiCloud, FONT_PATH
# Build the project's emoji cloud with the font at FONT_PATH
# (presumably an emoji-capable font — confirm in visualizations_setup).
emoji_cloud = EmojiCloud(font_path = FONT_PATH)
# Last expression of the cell, so whatever generate() returns is displayed.
emoji_cloud.generate(emoji_text)

# Statistics

Total Words

In [None]:
# Sum of the "words" column over all rows of the combined frame
# (presumably one word count per message — confirm in the parsers).
df["words"].sum()

Total Messages

In [None]:
# Number of messages as a single value. df.count() would instead return one
# non-null count per column, which is not a message total.
df["message"].count()

Avg. Words per Message

In [None]:
# Total words divided by the number of messages. Dividing by df.count() would
# produce a per-column Series instead of one average.
df["words"].sum() / df["message"].count()

Unique Words

In [None]:
# Rebuild the word list from every message (no media filtering here),
# lower-cased so that case variants count as the same word.
messages = [text.split() for text in df["message"].values]
words = [piece.lower() for pieces in messages for piece in pieces]

# Size of the vocabulary: number of distinct lower-cased words.
len(set(words))