# Preprocessing

In [90]:
# Imports
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

# Matches the header that precedes every message in the exported chat,
# e.g. "12/22/18, 2:45 AM - Prati: ".
# Raw strings keep the regex escapes (\d, \s, \/) away from Python's own
# string-escape processing, which otherwise warns about invalid escape sequences.
regex = re.compile(r'\d*\/\d*\/\d*,\s\d*:\d*\s[AP][M]\s-\s[A-Za-z\d\s]*:\s')
# This is the same as the previous except for a capturing group for the name labels in the chat txt file
regex_name = re.compile(r'\d*\/\d*\/\d*,\s\d*:\d*\s[AP][M]\s-\s([A-Za-z\d\s]*):\s')

# CSV file the cleaned, tagged messages are written to
out_file = 'messages.csv'

## Read and clean data

The code looks for a *data.txt* file exported directly from WhatsApp. Open a chat with a contact, go to the menu on the top right (three dots). Expand *More* and click *Export Chat*. You can export it without media since the code only needs the text file.

### Some notes:

* Replace all new lines in the text file so it is basically uninterrupted text. We can pattern match the timestamps to identify new messages
* We can also replace the matched timestamps with a separator of our choice
* Multi-line messages in the original chat will be represented on a single line
* We need to iterate through the dataset initially and store the names of the people in the chat so we can label messages later on

----

* Why parse timestamps into a set/dictionary? Timestamps are not unique - WhatsApp uses mm/dd/yy, hh:mm AM/PM timestamps, so several messages can share the same timestamp, BUT the sender names might differ
* Having a dictionary lets us rewrite the massive exported chat with a simple, though time-consuming and inefficient, 'replace key with value' operation.

In [81]:
datafile = 'data.txt'

# The chat contains emoji and other non-ASCII text; read it as UTF-8 explicitly
# (assumed export encoding) instead of relying on the platform default.
with open(datafile, 'r', encoding='utf-8') as text:
    # Drop the line breaks so the chat is one uninterrupted string; message
    # boundaries are recovered from the timestamp headers instead.
    data = text.read().replace('\n', '')

# Every "<date>, <time> AM/PM - <name>: " header, in order of appearance
matches = re.findall(regex, data)
# Distinct sender names captured from those headers
namelist = set(re.findall(regex_name, data))

# Set tags for names to replace and build dataset.
# Sorting makes the name -> tag mapping identical on every run; iterating the
# raw set could hand out different tags after a kernel restart.
names = {name: tag for tag, name in enumerate(sorted(namelist), start=1)}

# Set timestamps to be replaced: header text -> "\n<tag>, " separator
timestamps = {}
for match in matches:
    # Read the sender from the capture group rather than testing
    # `name in match`: a substring test picks the wrong tag when one name is
    # contained in another (or in the date digits).
    header = regex_name.fullmatch(match)
    tag = names.get(header.group(1), 0) if header else 0  # 0 = tag not found
    timestamps[match] = '\n{}, '.format(tag)

timestamps

{'12/22/18, 2:45 AM - Prati: ': '\n2, ',
 '12/22/18, 2:57 AM - Aadith: ': '\n1, ',
 '12/22/18, 3:20 AM - Prati: ': '\n2, ',
 '12/22/18, 3:22 AM - Prati: ': '\n2, ',
 '12/22/18, 3:26 AM - Prati: ': '\n2, ',
 '12/22/18, 4:04 AM - Aadith: ': '\n1, ',
 '12/22/18, 4:05 AM - Prati: ': '\n2, ',
 '12/22/18, 4:06 AM - Prati: ': '\n2, ',
 '12/22/18, 6:38 AM - Aadith: ': '\n1, ',
 '12/22/18, 8:27 AM - Prati: ': '\n2, ',
 '12/22/18, 8:42 AM - Prati: ': '\n2, ',
 '12/22/18, 9:15 AM - Prati: ': '\n2, ',
 '12/22/18, 10:05 AM - Prati: ': '\n2, ',
 '12/22/18, 10:06 AM - Prati: ': '\n2, ',
 '12/22/18, 10:07 AM - Prati: ': '\n2, ',
 '12/22/18, 10:21 AM - Prati: ': '\n2, ',
 '12/23/18, 2:53 AM - Prati: ': '\n2, ',
 '12/23/18, 2:54 AM - Prati: ': '\n2, ',
 '12/23/18, 2:59 AM - Aadith: ': '\n1, ',
 '12/23/18, 3:01 AM - Prati: ': '\n2, ',
 '12/23/18, 3:37 AM - Aadith: ': '\n1, ',
 '12/23/18, 5:02 AM - Aadith: ': '\n1, ',
 '12/23/18, 6:02 AM - Aadith: ': '\n1, ',
 '12/23/18, 6:02 AM - Prati: ': '\n2, ',
 '12/

In [83]:
# Swap every message header for its "\n<tag>, " separator in one regex pass.
# Replacing key by key with str.replace rescans the whole chat once per header
# and can damage text when one header is the tail of another
# (e.g. "1/2/18, ..." inside "11/2/18, ...").
# Headers without an entry in `timestamps` are left unchanged.
data = regex.sub(lambda header: timestamps.get(header.group(0), header.group(0)), data)

# Preview only: the full chat is far too large to display usefully
data[:500]



In [118]:
# Clean up, remove encryption notification at start of chat, remove media messages
# Save as CSV

# Each line after the first looks like "<tag>, <message>". The first segment is
# whatever preceded the first header (the encryption notice), so it is skipped.
# Splitting on the first ", " keeps multi-digit tags intact and drops the
# separator's space from the message text (fixed-width slicing did neither).
rows = [line.partition(', ')[::2] for line in data.split('\n')[1:]]

# Passing the column names up front also works when the chat has no messages
messages = pd.DataFrame(rows, columns=['tag', 'message'])

# Drop placeholders left for attachments; match the text literally, not as a regex
is_media = messages['message'].str.contains('Media omitted', regex=False)
messages = messages[~is_media].reset_index(drop=True)

messages.to_csv(out_file)

messages

Unnamed: 0,tag,message
0,2,You're the love of my life😚
1,1,So are you
2,1,Seat numberrrr
3,2,Ulla vantu mela paaru
4,2,Screen 7 baby
...,...,...
61290,2,Enna padipa
61291,1,There is a Spark course
61292,1,That I bought
61293,1,For $10


# Analysis

In [122]:
# Reload the cleaned messages written by the preprocessing step.
# keep_default_na=False keeps chat messages such as "NA", "None", "null" or an
# empty string as text; by default read_csv would turn them into NaN.
data = pd.read_csv(out_file, index_col=0, keep_default_na=False)
data.head()

Unnamed: 0,tag,message
0,2,You're the love of my life😚
1,1,So are you
2,1,Seat numberrrr
3,2,Ulla vantu mela paaru
4,2,Screen 7 baby
