### Importing necessary libraries

In [None]:
# for data managing
import pandas as pd 
import numpy as np

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for custom fonts
import matplotlib.font_manager as font_manager

# for data preprocessing
import re 
import nltk
from nltk.corpus import stopwords

# for wordcloud
import stylecloud

# calendar heatmap
# Seeds NumPy's global RNG with the sum of the character codes of 'calmap'.
# NOTE(review): no NumPy random draws are visible in this file -- confirm
# whether this seed is still needed.
np.random.seed(sum(map(ord, 'calmap')))
import calmap

### Cleaning up the data

In [None]:
from helpers import format_chat, set_datatypes

In [None]:
# Load the raw chat export with `format_chat`, then hand the result to
# `set_datatypes` (both come from the local `helpers` module).
df = set_datatypes(format_chat('_chat.txt'))

### Creating useful columns


In [None]:
from helpers import get_emoji

In [None]:
# Calendar features, all derived from the `time` column.
time_parts = df['time'].dt
df['hour'] = time_parts.hour
df['day'] = time_parts.day.astype('category')
df['month'] = time_parts.month.astype('category')
df['year'] = time_parts.year.astype('category')
df['weekday'] = time_parts.day_name().astype('category')

# Size of every message: characters, then whitespace-separated words.
messages = df['message']
df['message_length'] = messages.apply(len)
df['word_count'] = messages.apply(lambda body: len(body.split()))

# Emojis found in each message: INSERT YOUR LANGUAGE HERE (e.g. 'en')
df['emojis'] = messages.apply(lambda body: get_emoji(body, language='es'))

# Every http(s) URL in a message, joined into one space-separated string.
url_pattern = re.compile(r'(https?://\S+)')
df['links'] = messages.apply(lambda body: ' '.join(url_pattern.findall(body)))

### EDA

#### Visualization Configuration

In [None]:
# import font, color palette and style for plots
from config import VisualConfig

#### Number of messages per user

In [None]:
# Pie chart of each user's share of the messages (matplotlib).
messages_per_user = df['user'].value_counts()
plt.pie(messages_per_user, labels=messages_per_user.index, autopct='%1.1f%%')
plt.title('Number of messages per user')
plt.show()

#### Message length per user

In [None]:
# Two side-by-side bar plots per user: message length (left) and word count
# (right). seaborn's barplot aggregates each user's rows (mean by default).
fig, (length_ax, words_ax) = plt.subplots(1, 2)
fig.set_size_inches(12, 6)
panels = (
    (length_ax, 'message_length', 'Average message length'),
    (words_ax, 'word_count', 'Average word count'),
)
for axis, column, heading in panels:
    sns.barplot(x='user', y=column, data=df, ax=axis)
    axis.set_title(heading)
plt.show()

#### Number of messages per hour (per user)

In [None]:
# Hourly message counts, one bar group per hour split by user (seaborn).
hourly_ax = sns.countplot(data=df, x='hour', hue='user')
hourly_ax.set_title('Number of messages per hour')
plt.show()

#### Messages per day of the week

In [None]:
# Message counts per day of the week, with bars ordered Monday to Sunday.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
sns.countplot(data=df, x='weekday', order=weekday_order)
# Rotate the x-axis labels so they read vertically.
plt.xticks(rotation=90)
plt.title('Number of messages per day of the week')
plt.show()

### Message heatmap

In [None]:
# Calendar heatmap of message activity for the year 2022.
fig = plt.figure(figsize=(20, 8))
# Number of rows per distinct timestamp; groupby(...).size() already returns
# a Series indexed by `time`, which is what yearplot receives.
messages_per_timestamp = df.groupby(df.time).size()
calmap.yearplot(messages_per_timestamp, year=2022, cmap='Greens')
plt.suptitle('Calendar Heatmap', y=.7, fontsize=20)
plt.show()

#### Weekday Histogram

In [None]:
# Constant weight of 1 per message; the 'q' column is passed as the weights
# argument when the per-weekday densities are mapped onto the grid.
df['q'] = 1
# Colour palette taken from the project's visual configuration.
palette = VisualConfig.palette

In [None]:
def weighted_hist(x, weights, **kwargs):
    """Draw a weighted kernel density estimate of ``x`` on the current axes.

    Written as a ``FacetGrid.map`` callback: it is invoked with the two mapped
    columns as positional arguments, and any further keyword arguments are
    forwarded untouched to ``sns.kdeplot``.

    Args:
        x: Values whose density is drawn.
        weights: Per-observation weights, forwarded to ``sns.kdeplot``.
        **kwargs: Extra keyword arguments for ``sns.kdeplot``.
    """
    # Pass ``x`` by keyword. seaborn 0.12+ reserves kdeplot's only positional
    # parameter for ``data``; a positional vector there is treated as
    # wide-form data, which cannot be combined with ``weights``.
    sns.kdeplot(x=x, weights=weights, **kwargs)

# Transparent axes background so the overlapping rows do not hide each other.
sns.set(style='white', rc={'axes.facecolor': (0, 0, 0, 0)})
# One row (and one hue) per weekday; very wide, short facets.
g = sns.FacetGrid(df, row='weekday', hue='weekday', aspect=20, height=1.5, palette=palette)

# Draw the densities: first the filled curve, then a white outline on top.
# `fill=True` replaces the deprecated `shade=True`; it is available in every
# seaborn release that accepts the `weights` / `bw_method` arguments used here.
g.map(weighted_hist, 'hour', 'q', clip_on=False, fill=True, alpha=1, lw=1.5, bw_method=.2)
g.map(weighted_hist, 'hour', 'q', clip_on=False, color='w', lw=3, bw_method=.2)
# Baseline at y=0 for every facet.
g.map(plt.axhline, y=0, lw=1, clip_on=False)

def label(x, color, label):
    """Write ``label`` in bold near the lower-left corner of the current axes.

    The parameter names match the ``color`` and ``label`` keywords used when
    this function is mapped over a ``FacetGrid``.

    Args:
        x: Mapped data column; not used by this function.
        color: Text colour.
        label: Text to draw.
    """
    text_style = dict(fontweight='bold', color=color,
                      ha='left', va='center', size=30)
    current_axes = plt.gca()
    # transAxes: (0, 0.1) is expressed in axes-fraction coordinates.
    current_axes.text(0, 0.1, label, transform=current_axes.transAxes,
                      **text_style)

# Label every facet through the `label` helper (mapped over the 'hour' column).
g.map(label, 'hour')
g.set_xlabels('Time of day', fontsize=35)
g.set_xticklabels(fontsize=25)

# set the subplots to overlap: a negative hspace pulls consecutive rows on top
# of each other
g.fig.subplots_adjust(hspace=-0.5)
g.fig.suptitle('Weekday Histogram', fontsize=35)   
# Blank the per-facet titles, then remove y ticks, y labels and the left and
# bottom spines.
g.set_titles('')
g.set(yticks=[])
g.set(ylabel=None)
g.despine(bottom=True, left=True)
# One x tick per hour, 0 through 23.
g.set(xticks=np.arange(0, 24, 1))
plt.show()

### WordCloud

In [None]:
from helpers import clean_text

In [None]:
# Concatenate every message into one space-separated string.
text = ' '.join(df['message'])

In [None]:
# Run the joined chat text through `clean_text` from the local `helpers` module.
# NOTE(review): the result is iterated word by word further down, so it is
# presumably a sequence of tokens -- confirm against `helpers.clean_text`.
tokens = clean_text(text)

In [None]:
# Keep the stop-word set under its own name. Rebinding `stopwords` would shadow
# the imported nltk corpus module, so re-running this cell would fail with
# "'set' object has no attribute 'words'".
stop_words = set(stopwords.words('spanish'))
# Chat-specific filler words and media placeholders to drop as well.
newStopWords = ['audio','omitido','sticker','omitida','imagen', 'u200esticker', 'u200eaudio', 
                'si', 'jaja', 'dale', 'jajaja', 'voy', 'dalee', 'bien', 'ahi', 'va', 'igual', 
                'eu', 'sisi', 'q', 'videollamada', 'perdida', 'ok', 'oka']
stop_words.update(newStopWords)

# remove stopwords from tokens
tokens = [word for word in tokens if word not in stop_words]

In [None]:
# Frequency distribution of the remaining tokens.
fdist = nltk.FreqDist(tokens)
# Select the Lato font BEFORE any figure text exists. matplotlib resolves
# `font.family` when text objects are created, and `plt.title()` creates the
# axes (with its title and axis-label texts) if none exists yet. Setting the
# font afterwards would leave those texts in the previous font.
plt.rcParams['font.family'] = 'Lato'
plt.title('Top 20 words')
# Plot the 20 most frequent tokens (non-cumulative) in the last palette colour.
fdist.plot(20, cumulative=False, color=palette[-1])
plt.show()

In [None]:
# Render the word cloud from the filtered tokens.
# The font path uses a forward slash: the former 'fonts\Lato-Regular.ttf'
# relied on the invalid escape sequence '\L' (a warning today, an error in
# future Python versions) and on a Windows-only separator. Forward slashes are
# accepted on Windows, Linux and macOS alike.
stylecloud.gen_stylecloud(text=' '.join(tokens),
                          icon_name='fas fa-cloud',
                          background_color='white',
                          colors=palette.as_hex(),
                          gradient='horizontal',
                          font_path='fonts/Lato-Regular.ttf')

# Remove axis and borders, then display the image file.
# NOTE(review): no output name is passed above, so this assumes stylecloud
# writes to its default 'stylecloud.png' -- confirm if the default changes.
plt.axis('off')
plt.imshow(plt.imread('stylecloud.png'))
plt.show()

# Dataframe to desired output

In [None]:
# Export the enriched dataframe as a semicolon-delimited CSV file.
df.to_csv('hola_ktal.csv', sep=';')