# Packages

In [2]:
import pandas as pd
import os
import re
import datetime as dt
pd.options.plotting.backend = "plotly"

#  Parameters

In [20]:
# Folder and file name of the raw chat export read in the "Data Import" section
PATH_INPUT_DATA = os.path.join('.', 'data', 'input')
INPUT_DATA_NAME = 'Chat de WhatsApp con C7.txt'

# Folder and file name of the parsed chat written in the "Export" section
PATH_OUTPUT_DATA = os.path.join('.', 'data', 'output')
OUTPUT_DATA_NAME = 'chat_data.csv'

#  Functions

In [4]:
def starts_with_date(s):
    """Check whether a chat line opens with a ``d/m/yy hh:mm -`` header.

    Parameters
    ----------
    s : str
        One line of the exported chat.

    Returns
    -------
    re.Match or None
        The match object (truthy) when the line begins with the header,
        ``None`` otherwise. Its groups are, in order: day including the
        trailing '/', month including the trailing '/', two-digit year,
        hour and minute.
    """
    # Date pattern. Written as a raw string: '\d' and '\/' are not valid
    # *string* escapes, so a plain literal triggers an invalid-escape warning
    # on current Python versions. The regex itself is unchanged.
    pattern = r'([0-3]?\d/)([01]?\d/)([12]\d) ([0-9][0-9]):([0-9][0-9]) -'

    # re.match only matches at the very beginning of the string
    return re.match(pattern, s)

In [5]:
def split_data_message(s):
    """Split a dated chat line into date, time, author and message tokens.

    Parameters
    ----------
    s : str
        A line of the form ``'<date> <time> - <author>: <message>'``.

    Returns
    -------
    tuple
        ``(date, time, author, message)``. ``author`` is ``None`` when the
        text after ``' - '`` contains no ``': '`` separator (a line that
        carries no author); ``message`` then holds that whole text.
    """
    # Plain string partitioning instead of re.split(' - | |', s, 2): that
    # pattern ends in an empty alternative, and since Python 3.7 re.split also
    # splits on empty matches, so the line was cut after its first character
    # instead of at the separators.
    date_time, _, author_message = s.partition(' - ')
    date, _, time = date_time.partition(' ')

    # Only the first ': ' separates the author from the text; any later
    # occurrence belongs to the message itself.
    author, separator, message = author_message.partition(': ')
    if not separator:
        # No author prefix: keep the text instead of failing on unpacking
        author, message = None, author_message

    return date, time, author, message

In [6]:
def process_chat_data(chat_data):
    """Group raw chat lines into one row per message.

    A line that starts with a date/time header (see ``starts_with_date``)
    opens a new message; every other line is treated as a continuation of
    the message currently being assembled and is joined to it with a space.

    Parameters
    ----------
    chat_data : iterable of str
        The chat lines, in file order.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``time``, ``author`` and ``message``, one row per
        message. Continuation lines that appear before the first dated line
        are kept as a row whose ``date``, ``time`` and ``author`` are ``None``.
    """
    parsed_data = []     # One [date, time, author, message] row per finished message
    message_buffer = []  # Text lines of the message currently being assembled

    # Header tokens of the message being assembled. Initialised so that lines
    # preceding the first dated line do not hit unbound names when flushed.
    date = time = author = None

    for line in chat_data:

        if starts_with_date(line):
            if message_buffer:
                # A new header closes the previous message: store it and reset
                parsed_data.append([date, time, author, ' '.join(message_buffer)])
                message_buffer.clear()

            # Extract the tokens of the message that starts on this line
            date, time, author, message = split_data_message(line)
            message_buffer.append(message)

        else:
            # No header: this line continues the current multi-line message
            message_buffer.append(line)

    # Flush the message still sitting in the buffer. Inside the loop a message
    # is only stored when the *next* header arrives, so without this step the
    # last message of the chat would be lost.
    if message_buffer:
        parsed_data.append([date, time, author, ' '.join(message_buffer)])

    # Create dataframe
    chat_data_proc = pd.DataFrame(parsed_data, columns=['date', 'time', 'author', 'message'])

    return chat_data_proc

In [7]:
def clean_processed_chat(chat_proc):
    """Convert the ``date`` column of a parsed chat to datetime values.

    Parameters
    ----------
    chat_proc : pandas.DataFrame
        Parsed chat with a ``date`` column holding day-first strings with a
        two-digit year, such as ``'30/5/20'``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: the column is replaced in place
        and the frame is also returned so the call can be chained.
    """
    # pd.to_datetime is the parser; pd.datetime was only an alias of the
    # datetime class (removed in pandas 2.0) and cannot convert a column.
    # The explicit format keeps the parsing day-first and unambiguous.
    chat_proc['date'] = pd.to_datetime(chat_proc['date'], format='%d/%m/%y')

    return chat_proc
    

# Data Import

In [8]:
# Read the export as plain text, one row per non-blank line. A delimited-file
# parser is the wrong tool for free text: it treats '"' at the start of a line
# as a quote character, tabs as column separators and tokens such as "NA" or
# "null" as missing values. 'utf-8-sig' drops a byte-order mark if present.
with open(os.path.join(PATH_INPUT_DATA, INPUT_DATA_NAME), encoding='utf-8-sig') as chat_file:
    chat_raw = pd.DataFrame(
        {'chat_line': [line.rstrip('\n') for line in chat_file if line.strip()]}
    )

In [9]:
# Preview the raw lines: one row per line of the export, in column 'chat_line'
chat_raw

Unnamed: 0,chat_line
0,30/5/20 00:21 - Bruno Pascucci: Genial
1,30/5/20 11:58 - Agus Figueroa C7: https://twit...
2,30/5/20 11:58 - Agus Figueroa C7: ?
3,30/5/20 12:01 - Franco Bona: Supuestamente
4,30/5/20 12:01 - Nick Teperman: A qué hora acá??
...,...
827,Feliz día!!🔧
828,6/6/20 18:53 - Tomás Bianchi: Fantastico
829,6/6/20 18:55 - Mauricio Petaccia: Jajajajajaja
830,6/6/20 18:55 - Mauricio Petaccia: Jajajajajaja


In [15]:
# Group the raw lines into one row per message (date, time, author, message)
chat_processed = process_chat_data(chat_raw['chat_line'])

In [18]:
# Sanity check: list the distinct values of the parsed 'author' column
chat_processed.author.unique()

array(['Bruno Pascucci', 'Agus Figueroa C7', 'Franco Bona',
       'Nick Teperman', 'Pato C7', 'Fede Masciliano', 'Tomás Bianchi',
       'Baco C7', 'Cecilia García L M', 'Fede Font', 'Mauricio Petaccia',
       'Joaquin Del Prieto', 'Pauli Mazza Acn', 'Camila Rodriguez C7',
       'Camilo Leonel Amadio', 'Albi C7', 'Juli Scipioni', 'Ger C7',
       'Nico García Aramouni', 'Felipe De Feo', 'Ini C7', 'Neidym C7',
       'Fran Lonardi', 'Paula C7', 'Vari C7', 'Clara Graham',
       'Caro Colombo', 'Nano C7', 'Facundo Scasso', 'Juani Acn',
       'Johi C7', 'Pili Aenlle', 'Agus Casas', 'Ignacio Brottier',
       'Lesley Reiderman', 'Gabi C7', 'Mateo Dennehy ITBA',
       'Lucia Monchiero', 'Tomy C7', 'Juan Juan Ferraro', 'Carla C7',
       'Martín Turjanski', 'Lola Ferraro', 'Sebastian Gutman',
       '+54 9 11 3174-1032', 'Beian Bohe Usa'], dtype=object)

#  Export

In [21]:
# Create the output folder first: to_csv does not create missing directories
os.makedirs(PATH_OUTPUT_DATA, exist_ok=True)
chat_processed.to_csv(os.path.join(PATH_OUTPUT_DATA, OUTPUT_DATA_NAME))