# Packages

In [6]:
import pandas as pd
import os
import re
import datetime as dt
pd.options.plotting.backend = "plotly"
import plotly.graph_objects as go
# import plotly.express as px
from plotly.subplots import make_subplots

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

#  Parameters

In [7]:
# Folder and file name of the input chat export.
# NOTE(review): neither input constant is used by the cells visible in this
# review — presumably consumed by a separate parsing step; confirm.
PATH_INPUT_DATA = os.path.join('..','data','input')
INPUT_DATA_NAME = 'Chat de WhatsApp con C7.txt'

# Folder and file name of the CSV that the "Import data" cell reads.
PATH_OUTPUT_DATA = os.path.join('..','data','output')
OUTPUT_DATA_NAME = 'chat_data.csv'

#  Import data

In [8]:
# Load the chat table from <PATH_OUTPUT_DATA>/<OUTPUT_DATA_NAME>.
# The cells below read the columns 'date', 'time', 'author' and 'message' from it.
chat_data = pd.read_csv(os.path.join(PATH_OUTPUT_DATA, OUTPUT_DATA_NAME))

# Preprocess data

In [9]:
def process_data(data):
    """Derive time features from the chat table and list its authors.

    The input frame is left untouched: a new DataFrame carrying the extra
    columns is returned, so re-running the cell (or keeping the raw frame
    around for comparison) never sees a half-processed table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain string columns 'date' and 'time' (joined with a space
        and parsed by ``pd.to_datetime``) and an 'author' column.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        ``(processed, list_authors)`` where ``processed`` is ``data`` plus
        the columns 'datetime', 'hour' (0-23) and 'weekday' (0 = Monday),
        and ``list_authors`` holds the distinct authors in order of first
        appearance.
    """
    timestamps = pd.to_datetime(data['date'] + ' ' + data['time'])

    # assign() returns a new frame instead of writing into the caller's object.
    processed = data.assign(
        datetime=timestamps,
        hour=timestamps.dt.hour,
        weekday=timestamps.dt.weekday,
    )
    list_authors = processed['author'].unique().tolist()

    return processed, list_authors

In [11]:
# Add the 'datetime', 'hour' and 'weekday' columns and collect the distinct authors.
# Note: this rebinds `chat_data` to the processed frame used by every plot below.
chat_data, list_authors = process_data(chat_data)

## Authors

In [13]:
def plot_messages_by_author(n_authors=1.0):
    """Build a horizontal bar trace with the message count of the most active authors.

    Reads the module-level ``chat_data`` frame.

    Parameters
    ----------
    n_authors : int or float, default 1.0
        How many of the most active authors to include. The value is
        truncated to an integer because ``Series.head`` only accepts
        integer positions (the float default would otherwise raise).

    Returns
    -------
    plotly.graph_objects.Bar
        One bar per author, sorted ascending so that the most active
        author is drawn last.
    """
    n_top = int(n_authors)

    # value_counts() is already sorted descending, so head() keeps the top n.
    messages_per_author = chat_data['author'].value_counts()
    top_authors = messages_per_author.head(n_top).sort_values(ascending=True)

    return go.Bar(
        x=top_authors.values,
        y=top_authors.index.values,
        orientation='h',
        showlegend=False,
    )

In [14]:
def plot_avg_line():
    """Build a vertical line trace at the mean number of messages per author.

    Reads the module-level ``chat_data`` frame. The mean is taken over every
    author's message count; counts shared by several authors are each
    included (de-duplicating the counts first would skew the average).

    Returns
    -------
    plotly.graph_objects.Scatter
        A two-point line from y=0 to y=1 at x=mean, meant to be drawn on an
        axis whose range is [0, 1] so that it spans the full plot height.
    """
    mean_counts = chat_data['author'].value_counts().mean()

    line = go.Scatter(
        x=[mean_counts, mean_counts],
        y=[0, 1],
        mode='lines',
        showlegend=True,
        hoverinfo='none',
        name='Average # Messages/Author',
    )

    return line

In [15]:
def plot_messages(n_authors):
    """Display the per-author message bars with the average line overlaid.

    Parameters
    ----------
    n_authors : int
        Number of most active authors to show; forwarded to
        ``plot_messages_by_author``.

    The bars use the primary y axis; the average line is placed on a hidden
    secondary y axis fixed to [0, 1] so it always spans the full height.
    The figure is shown directly and nothing is returned.
    """
    bars = plot_messages_by_author(n_authors)
    average = plot_avg_line()

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(bars, row=1, col=1, secondary_y=False)
    fig.add_trace(average, row=1, col=1, secondary_y=True)

    hidden_unit_axis = dict(fixedrange=True, range=[0, 1], visible=False)
    legend_below_plot = dict(x=0, y=-0.1)
    fig.update_layout(
        width=800,
        height=800,
        legend=legend_below_plot,
        yaxis2=hidden_unit_axis,
    )

    fig.show()

In [16]:
# Slider over the number of authors to display, from 1 up to every author in the chat.
interact(
    plot_messages,
    n_authors=widgets.IntSlider(value=10, min=1, max=len(list_authors), step=1),
)

interactive(children=(IntSlider(value=10, description='n_authors', max=71, min=1), Output()), _dom_classes=('w…

<function __main__.plot_messages(n_authors)>

## Messages

In [17]:
def plot_messages_by_date(author='Everyone'):
    """Display a line chart of the number of messages per date.

    Reads the module-level ``chat_data`` frame.

    Parameters
    ----------
    author : str, default 'Everyone'
        Author whose messages are counted; the sentinel 'Everyone' keeps
        all rows.

    The figure is shown directly and nothing is returned.
    """
    # NOTE(review): grouping uses the 'date' column as stored; if it holds
    # non-ISO strings the groups are ordered lexicographically — confirm.
    if author == 'Everyone':
        rows = slice(None)
    else:
        rows = chat_data.loc[:, 'author'] == author

    daily_counts = (
        chat_data.loc[rows, ['date', 'message']]
        .groupby('date')
        .count()
        .reset_index()
    )

    trace = go.Scatter(
        x=daily_counts.loc[:, 'date'],
        y=daily_counts.loc[:, 'message'],
        mode='lines+markers',
    )
    go.Figure(data=trace).show()

In [18]:
# Dropdown with 'Everyone' first, followed by the authors in sorted order.
interactive(
    plot_messages_by_date,
    author=['Everyone'] + chat_data['author'].sort_values().unique().tolist(),
)

interactive(children=(Dropdown(description='author', options=('Everyone', '+54 9 11 3174-1032', '+54 9 11 5485…

## Time

In [19]:
def plot_active_hours(author='Everyone'):
    """Build a bar trace of the average number of messages per hour of day.

    Reads the module-level ``chat_data`` frame. Messages are first counted
    per (date, hour) pair and those counts are then averaged per hour, so
    only pairs that actually occur in the data contribute to the mean.

    Parameters
    ----------
    author : str, default 'Everyone'
        Author to restrict to; the sentinel 'Everyone' keeps all rows.

    Returns
    -------
    plotly.graph_objects.Bar
        Vertical bars with the hour on x and the mean count on y.
    """
    rows = slice(None) if author == 'Everyone' else (chat_data.loc[:, 'author'] == author)
    subset = chat_data.loc[rows, ['date', 'hour', 'message']]

    per_date_hour = subset.groupby(['date', 'hour'])['message'].count()
    hourly_mean = per_date_hour.groupby('hour').mean().reset_index()

    return go.Bar(
        x=hourly_mean.hour,
        y=hourly_mean.message,
        orientation='v',
    )

In [20]:
def plot_active_weekdays(author='Everyone'):
    """Build a bar trace of the average number of messages per weekday.

    Reads the module-level ``chat_data`` frame. Messages are counted per
    (date, weekday) pair and those counts are averaged per weekday.

    Parameters
    ----------
    author : str, default 'Everyone'
        Author to restrict to; the sentinel 'Everyone' keeps all rows.

    Returns
    -------
    plotly.graph_objects.Bar
        Horizontal bars labelled with English day names. The weekday index
        is sorted descending (Sunday first) before being handed to plotly.
    """
    rows = slice(None) if author == 'Everyone' else (chat_data.loc[:, 'author'] == author)
    subset = chat_data.loc[rows, ['date', 'weekday', 'message']]

    per_date_weekday = subset.groupby(['date', 'weekday'])['message'].count()
    weekday_mean = per_date_weekday.groupby('weekday').mean().sort_index(ascending=False)

    # pandas weekday numbering: 0 = Monday ... 6 = Sunday.
    day_names = {
        0: 'Monday',
        1: 'Tuesday',
        2: 'Wednesday',
        3: 'Thursday',
        4: 'Friday',
        5: 'Saturday',
        6: 'Sunday',
    }
    weekday_mean.index = weekday_mean.index.map(day_names)

    return go.Bar(
        x=weekday_mean.values,
        y=weekday_mean.index.values,
        orientation='h',
    )

In [21]:
def plot_active_hours_days(author='Everyone'):
    """Display the hourly and weekday activity charts side by side.

    Parameters
    ----------
    author : str, default 'Everyone'
        Forwarded to ``plot_active_hours`` and ``plot_active_weekdays``.

    The left panel (70% of the width) holds the per-hour bars, the right
    panel (30%) the per-weekday bars with its y axis drawn on the right.
    The figure is shown directly and nothing is returned.
    """
    panel_titles = ['Avg # Messages/hour', 'Avg # Messages/day']
    fig = make_subplots(
        rows=1,
        cols=2,
        column_widths=[0.7, 0.3],
        subplot_titles=panel_titles,
    )

    fig.add_trace(plot_active_hours(author=author), row=1, col=1)
    fig.add_trace(plot_active_weekdays(author=author), row=1, col=2)

    # Label the hour axis; move the weekday labels to the outer edge.
    fig.update_xaxes(title_text='Hour', row=1, col=1)
    fig.update_yaxes(side='right', row=1, col=2)

    fig.update_layout(height=400, width=900, showlegend=False)
    fig.show()

In [22]:
# Dropdown with 'Everyone' first, followed by the authors in sorted order.
interactive(
    plot_active_hours_days,
    author=['Everyone'] + chat_data['author'].sort_values().unique().tolist(),
)

interactive(children=(Dropdown(description='author', options=('Everyone', '+54 9 11 3174-1032', '+54 9 11 5485…