# Preprocessing

In [None]:
# Imports
import pandas as pd
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from collections import Counter

# Default figure size and seaborn theme applied to every chart in this notebook
plt.rcParams['figure.figsize'] = [15, 10]
sns.set(style='white', context='talk')

# Font used for all plot text. The bar labels are words taken from the chat,
# which may include emoji, hence Symbola; swap in any installed font you like.
matplotlib.rcParams['font.family'] = 'Symbola'

# Number of most-frequent words shown in each bar chart
top_word_n = 15

# Palette derived from '#eb348c' with one shade per bar (top_word_n shades)
custom_palette = sns.dark_palette('#eb348c', n_colors=top_word_n, reverse=True)

# English stopwords from NLTK, kept as a plain list so that custom words can
# be appended with .extend() in the next cell.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally - confirm in your environment.
stop = stopwords.words('english')
# Pattern for the prefix WhatsApp writes in front of every exported message:
#   "<m>/<d>/<y>, <h>:<mm> AM|PM - <sender>: "
# Raw strings are used so that \d, \s and \/ reach the regex engine untouched
# instead of being treated as (invalid) Python string escapes.
regex = re.compile(r'\d*\/\d*\/\d*,\s\d*:\d*\s[AP][M]\s-\s[A-Za-z\d\s]*:\s')
# Same pattern, with a capturing group around the sender name so the name
# labels in the chat txt file can be extracted
regex_name = re.compile(r'\d*\/\d*\/\d*,\s\d*:\d*\s[AP][M]\s-\s([A-Za-z\d\s]*):\s')

# CSV file the cleaned messages are written to and later read back from
out_file = 'messages.csv'

In [None]:
# Add chat-specific words to ignore here, one string per entry.
# Use lowercase: the analysis cells lower-case every word before checking it
# against this list.
stop.extend([

])

## Read and clean data

The code looks for a *data.txt* file exported directly from WhatsApp. Open a chat with a contact, go to the menu on the top right (three dots). Expand *More* and click *Export Chat*. You can export it without media since the code only needs the text file.

### Some notes:

* Replace all new lines in the text file so it is basically uninterrupted text. We can pattern match the timestamps to identify new messages
* We can also replace the matched timestamps with a separator of our choice
* Multi-line messages in the original chat will be represented on a single line
* We need to iterate through the dataset initially and store the names of the people in the chat so we can label messages later on

----

* Why parse timestamps into a set/dictionary? Timestamps are not unique - WhatsApp writes them as mm/dd/yy, hh:mm AM/PM, so several messages can share the same timestamp, BUT the sender names attached to them might be different
* Having a dictionary lets us rewrite the massive exported chat with a simple 'replace key with value' operation, even though that operation is time-consuming and inefficient.

In [None]:
datafile = 'data.txt'

timestamps = {}  # message prefix "date, time - name: " -> replacement "\n<tag>, "
names = {}       # sender name -> integer tag (1, 2, ...)

# Decode explicitly instead of relying on the platform default encoding.
# NOTE(review): WhatsApp exports are expected to be UTF-8 - adjust if yours is not.
with open(datafile, 'r', encoding='utf-8') as text:
    # Join lines with a space (not an empty string): otherwise the last word
    # of one line is glued to the first word of the next, and a message ending
    # in a digit is glued onto the date of the following message.
    data = text.read().replace('\n', ' ')

matches = re.findall(regex, data)
namelist = set(re.findall(regex_name, data))

# Set tags for names to replace and build dataset.
# Sorting makes the name -> tag assignment reproducible between runs
# (iteration order of a set of strings is not).
for tag, name in enumerate(sorted(namelist), start=1):
    names[name] = tag

# Set timestamps to be replaced.
# The sender is extracted with the capturing pattern rather than by substring
# tests, so a name contained in another name (or in the date/time part of the
# prefix) cannot select the wrong tag.
for match in matches:
    sender = regex_name.match(match)
    tag = names.get(sender.group(1), 0) if sender else 0  # 0 = tag not found
    timestamps[match] = '\n{}, '.format(tag)

# Lookup for tag -> name
tags = {tag: name for name, tag in names.items()}

timestamps

In [None]:
# Swap every message prefix for its "\n<tag>, " marker in a single pass over
# the text. The previous one-str.replace-per-key loop rescanned the whole chat
# for every distinct prefix, and could rewrite the tail of a longer prefix
# (e.g. "1/2/20, ..." is also the end of "11/2/20, ..."), leaving a stray digit
# behind. Prefixes without an entry in `timestamps` are left untouched.
data = regex.sub(lambda found: timestamps.get(found.group(0), found.group(0)), data)

data

In [None]:
# Clean up and save as CSV.
# Every chunk after a "\n" marker has the form "<tag>,<space><message>".
# [1:] drops whatever precedes the first recognised message (e.g. the
# encryption notification at the start of the chat).
# Splitting on the first comma, instead of slicing fixed character positions,
# keeps tags with more than one digit intact; partition() returns
# (tag, ',', rest), so [::2] keeps the tag and the rest. The message keeps its
# leading space, exactly as before.
messages = pd.DataFrame(
    [chunk.partition(',')[::2] for chunk in data.split('\n')[1:]],
    columns=['tag', 'message'],
)
# Remove media placeholders; plain substring match, no regex needed
messages = messages[~messages['message'].str.contains('Media omitted', regex=False)].reset_index(drop=True)

messages.to_csv(out_file)

messages

# Analysis

In [None]:
# Reload the cleaned messages written by the preprocessing section.
# index_col=0 uses the first CSV column (the row index saved by to_csv) as the
# index. From here on `data` is a DataFrame, not the raw chat text.
data = pd.read_csv(out_file, index_col=0)
data.head()

In [None]:
# Define names for the tags here: fill in the display name for each tag.
# This overrides any `tags` / `names` built during preprocessing.
tags = {1: '', 2: ''}
# Reverse lookup (name -> tag), derived from `tags` so the two cannot drift
# apart and no duplicate-key literal is needed.
names = {name: tag for tag, name in tags.items()}

In [None]:
# One-line summary: number of messages sent by tag 1 and by tag 2
"{} sent {} messages and {} sent {} messages".format(
    tags[1], len(data[data.tag == 1]),
    tags[2], len(data[data.tag == 2]),
)

In [None]:
# Per-participant subsets. Explicit copies, because a 'clean_message' column
# is assigned to each of them below; assigning to a filtered slice of `data`
# triggers pandas' SettingWithCopyWarning.
user_a = data[data.tag == 1].copy()
user_b = data[data.tag == 2].copy()

### User A

In [None]:
# Tokenise each message on whitespace, lower-case the tokens and drop
# stopwords. The stopword list is turned into a set once so each membership
# test is a hash lookup instead of a scan of the whole list, and every token
# is lower-cased only once.
stop_words = set(stop)
user_a['clean_message'] = user_a['message'].apply(
    lambda text: [word for word in map(str.lower, text.split()) if word not in stop_words])
user_a

In [None]:
# Tally every remaining word across all of User A's messages
word_freqs = Counter()
for message_words in user_a['clean_message']:
    word_freqs.update(message_words)

# (word, count) pairs, most frequent first
final_counts = word_freqs.most_common()

In [None]:
# Inspect the full (word, count) list, sorted by count in descending order
final_counts

In [None]:
# Keep User A's ranking under its own name: `final_counts` is reassigned by
# the User B cells further down.
user_a_counts = final_counts

In [None]:
# The top_word_n most frequent words of User A and their counts
top_pairs = user_a_counts[:top_word_n]
top_words = [word for word, _ in top_pairs]
top_counts = [count for _, count in top_pairs]

# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=top_words, y=top_counts, palette=custom_palette)
plt.yticks([], [])  # hide the count axis ticks
plt.xticks(fontsize=10)
plt.gcf().subplots_adjust(bottom=0.35)  # extra room below the axes for the word labels
plt.box(False)
plt.savefig('output_user_a.png', dpi=200, bbox_inches='tight')

### User B

In [None]:
# Tokenise each message on whitespace, lower-case the tokens and drop
# stopwords. The stopword list is turned into a set once so each membership
# test is a hash lookup instead of a scan of the whole list, and every token
# is lower-cased only once.
stop_words = set(stop)
user_b['clean_message'] = user_b['message'].apply(
    lambda text: [word for word in map(str.lower, text.split()) if word not in stop_words])
user_b

In [None]:
# Tally every remaining word across all of User B's messages
word_freqs = Counter()
for message_words in user_b['clean_message']:
    word_freqs.update(message_words)

# (word, count) pairs, most frequent first
final_counts = word_freqs.most_common()

In [None]:
# Inspect the full (word, count) list, sorted by count in descending order
final_counts

In [None]:
# Keep User B's ranking under its own name: `final_counts` is reassigned by
# the Combined cells further down.
user_b_counts = final_counts

In [None]:
# The top_word_n most frequent words of User B and their counts
top_pairs = user_b_counts[:top_word_n]
top_words = [word for word, _ in top_pairs]
top_counts = [count for _, count in top_pairs]

# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=top_words, y=top_counts, palette=custom_palette)
plt.yticks([], [])  # hide the count axis ticks
plt.xticks(fontsize=10)
plt.gcf().subplots_adjust(bottom=0.35)  # extra room below the axes for the word labels
plt.box(False)
plt.savefig('output_user_b.png', dpi=200, bbox_inches='tight')

### Combined

In [None]:
# Tokenise each message on whitespace, lower-case the tokens and drop
# stopwords. The stopword list is turned into a set once so each membership
# test is a hash lookup instead of a scan of the whole list, and every token
# is lower-cased only once.
stop_words = set(stop)
data['clean_message'] = data['message'].apply(
    lambda text: [word for word in map(str.lower, text.split()) if word not in stop_words])
data

In [None]:
# Tally every remaining word across the messages of all participants
word_freqs = Counter()
for message_words in data['clean_message']:
    word_freqs.update(message_words)

# (word, count) pairs, most frequent first
final_counts = word_freqs.most_common()
total_counts = final_counts

In [None]:
# The top_word_n most frequent words over the whole chat and their counts
top_pairs = total_counts[:top_word_n]
top_words = [word for word, _ in top_pairs]
top_counts = [count for _, count in top_pairs]

# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x=top_words, y=top_counts, palette=custom_palette)
plt.yticks([], [])  # hide the count axis ticks
plt.xticks(fontsize=10)
plt.gcf().subplots_adjust(bottom=0.35)  # extra room below the axes for the word labels
plt.box(False)
plt.savefig('output_user_combined.png', dpi=200, bbox_inches='tight')

# To Do

* Clean up pre-processing logic and remove the 'replace and reassign' step
* DRY - abstract the graph generation into a function