# Packages

In [1]:
import pandas as pd
import numpy as np
import os
import re
import datetime as dt
pd.options.plotting.backend = "plotly"

#  Parameters

In [2]:
PATH_INPUT_DATA = os.path.join('..','data','input')
INPUT_DATA_NAME = 'Chat de WhatsApp con C7.txt'

PATH_OUTPUT_DATA = os.path.join('..','data','output')
OUTPUT_DATA_NAME = 'chat_data.csv'

#  Functions

In [3]:
def starts_with_date(s):
    
    # Check if string is not nan
    if not pd.isnull(s):

        # Date patters
        pattern = '([0-3]?\d\/{1})([01]?\d\/{1})([12]{1}\d{1}) ([0-9][0-9]):([0-9][0-9]) -*'

        # Does it match
        result = re.match(pattern, s)
    
    else:
        result = False

    if result and result.start() == 0: # Matches and starts with patters
        return True
    else:
        return False

In [4]:
def has_message(s):
    # Checks if there's a :
    result = ':' in s
    
    return result

In [5]:
def split_data_message(s):
    
    # Split tokes of message
    date, time, author_message = re.split(' - | |', s, 2) 
    
    if has_message(author_message):
        author, message = re.split(': ', author_message, 1)
    else:
        author, message = 'Whatsapp messages', author_message
    
    return date, time, author, message

In [6]:
def process_chat_data(chat_data):
    
    parsed_data = [] # List to keep track of data so it can be used by a Pandas dataframe
    
    message_buffer = [] # Buffer to capture intermediate output for multi-line messages
    
    for line in chat_data:
        
        if starts_with_date(line):

            if len(message_buffer) > 0: # Check if the message buffer contains characters from previous iterations
                message_buffer_clean = [mes for mes in message_buffer if not pd.isnull(mes)] # Clean message from nan values
                parsed_data.append([date, time, author, ' '.join(message_buffer_clean)]) # Save the tokens from the previous message in parsedData
                message_buffer.clear() # Clear the message buffer so that it can be used for the next message
                
            date, time, author, message = split_data_message(line) # Identify and extract tokens from the line
            message_buffer.append(message)
        
        else: # If a line doesn't start with a Date Time pattern, 
            message_buffer.append(line) # then it is part of a multi-line message. So, just append to buffer
        
    # Create dataframe
    chat_data_proc = pd.DataFrame(parsed_data, columns=['date', 'time', 'author', 'message'])
    chat_data_proc.set_index(['date', 'time'])
    
    return chat_data_proc        

In [7]:
def clean_processed_chat(chat_proc):
    
    chat_proc.date = chat_proc.date.apply(lambda date: dt.datetime.strptime(date, "%d/%m/%y").strftime("%Y-%m-%d"))
    

# Data Import

In [8]:
chat_raw = pd.read_table(os.path.join(PATH_INPUT_DATA, INPUT_DATA_NAME), header=None, names=['chat_line'], dtype='str')

In [9]:
chat_raw

Unnamed: 0,chat_line
0,"9/5/19 11:46 - Nick Teperman: Bueno, estas son..."
1,4 Goulash Cris Bat칤 matt Ger
2,2 lentejas
3,2 pastel de papa
4,4 canelones de pollo y verdura
...,...
50951,Feliz d칤a!!游댢
50952,6/6/20 18:53 - Tom치s Bianchi: Fantastico
50953,6/6/20 18:55 - Mauricio Petaccia: Jajajajajaja
50954,6/6/20 18:55 - Mauricio Petaccia: Jajajajajaja


In [10]:
chat_processed = process_chat_data(chat_raw['chat_line'])


split() requires a non-empty pattern match.



In [11]:
chat_processed

Unnamed: 0,date,time,author,message
0,9/5/19,11:46,Nick Teperman,"Bueno, estas son las cantidades: anoten nombre..."
1,9/5/19,11:48,Agus Figueroa C7,*pido sale pepe*
2,9/5/19,11:50,Pauli Mazza Acn,"Bueno, estas son las cantidades: anoten nombre..."
3,9/5/19,11:53,Cristian Kubrak,Se pide
4,9/5/19,12:00,Tom치s Bianchi,Ya pidieron todo?
...,...,...,...,...
39993,6/6/20,17:28,Agus Figueroa C7,;)
39994,6/6/20,18:51,Juan Juan Ferraro,https://www.youtube.com/watch?v=TGG4NLzT4Po Fe...
39995,6/6/20,18:53,Tom치s Bianchi,Fantastico
39996,6/6/20,18:55,Mauricio Petaccia,Jajajajajaja


In [12]:
chat_processed.author.unique()

array(['Nick Teperman', 'Agus Figueroa C7', 'Pauli Mazza Acn',
       'Cristian Kubrak', 'Tom치s Bianchi', 'Nico Garc칤a Aramouni',
       'Facundo Scasso', 'Baco C7', 'Albi C7', 'Pato C7', 'Blas Leiro',
       'Joaquin Del Prieto', 'Nano C7', 'Tomy C7', 'Ger C7',
       'Lola Ferraro', 'Gonzalo Berasaluce', 'Fran Catania', 'Juani Acn',
       'Rama C7', 'Camila Rodriguez C7', 'Fede Masciliano', 'Pili Aenlle',
       'Cecilia Garc칤a L M', 'Vari C7', 'Tomas Irazoqui Acn', 'Paula C7',
       'Johi C7', 'Fran Lonardi', 'Felipe De Feo', 'Ini C7', 'Eva Acn',
       'Mateo Dennehy ITBA', 'Juan Juan Ferraro', 'Juli Scipioni',
       'Agus Casas', 'Beian Bohe Usa', 'Negro C7', 'Ale Bessel Acn',
       'Sebastian Gutman', 'Carla C7', 'Nico Acn', 'Whatsapp messages',
       'Federico Iglesias', 'Lali Acc', 'Fede Font', 'Ignacio Brottier',
       'Meli Acn', 'Neidym C7', '+54 9 11 5907-8127', 'Mauricio Petaccia',
       'Ianina Hutler', 'Bruno Pascucci', 'Camilo Leonel Amadio',
       '+54 9 11 548

In [13]:
clean_processed_chat(chat_processed)

In [14]:
chat_processed

Unnamed: 0,date,time,author,message
0,2019-05-09,11:46,Nick Teperman,"Bueno, estas son las cantidades: anoten nombre..."
1,2019-05-09,11:48,Agus Figueroa C7,*pido sale pepe*
2,2019-05-09,11:50,Pauli Mazza Acn,"Bueno, estas son las cantidades: anoten nombre..."
3,2019-05-09,11:53,Cristian Kubrak,Se pide
4,2019-05-09,12:00,Tom치s Bianchi,Ya pidieron todo?
...,...,...,...,...
39993,2020-06-06,17:28,Agus Figueroa C7,;)
39994,2020-06-06,18:51,Juan Juan Ferraro,https://www.youtube.com/watch?v=TGG4NLzT4Po Fe...
39995,2020-06-06,18:53,Tom치s Bianchi,Fantastico
39996,2020-06-06,18:55,Mauricio Petaccia,Jajajajajaja


#  Export

In [15]:
chat_processed.to_csv(os.path.join(PATH_OUTPUT_DATA, OUTPUT_DATA_NAME), index=False)