In [None]:
import os
import pickle

# check what data sources we have
# Supported data sources:
#  - Facebook data dump in JSON format (the whole folder named 'facebook-YourName12345' in data/)
# Planned, not yet supported:
#  - Discord GDPR request data (TODO: implement this)
# `dirs` holds the name of every entry directly under data/; os.listdir does not
# distinguish files from folders, so consumers must filter the names themselves.
dirs = os.listdir('data')

In [None]:
import json

# load in saved aliases
# `alias` maps a name as it appears in a data source to the name to use instead.
# The encoding is given explicitly so that non-ASCII names are read the same way
# on every platform instead of depending on the locale's default encoding.
with open('aliases.json', encoding='utf-8') as f:
    alias = json.load(f)

# set who "me" is for all this data
me = 'cephcyn'

# translate from datasource name to unaliased name (if available)
def un_alias(name):
    """Return the alias registered for `name`, or `name` itself if there is none.

    The registered alias is returned as-is, so a name mapped to None in the
    alias table comes back as None.
    """
    return alias.get(name, name)

In [None]:
# initialize data formats
from collections import namedtuple

# One record per message: the other person in the chat, whether I was the sender,
# the message's character count, and the time it was sent.
_MESSAGE_FIELDS = ('person', 'sent_by_me', 'char_count', 'timestamp')
Message = namedtuple('Message', _MESSAGE_FIELDS)

In [None]:
# read Facebook Messenger data into custom format
import re
import json
from datetime import datetime

# Matches the name of a Facebook export folder, e.g. 'facebook-YourName12345'.
fb_dir_name_rx = re.compile(r"^facebook-.*$")
# Matches file names ending in '.json'. Written as a raw string so that `\.` is a
# regex escape rather than an invalid Python string escape (which newer Python
# versions warn about).
json_rx = re.compile(r"^.*\.json$")

def parse_msg(msg_data):
    """Convert one parsed Facebook Messenger chat file into `Message` tuples.

    Args:
        msg_data: Parsed JSON of a single chat file. Reads
            ``msg_data['participants']`` (dicts with a ``'name'``) and
            ``msg_data['messages']`` (dicts with ``'sender_name'``,
            ``'timestamp_ms'`` and, optionally, ``'content'``).
            The input is not modified.

    Returns:
        A list with one `Message` per entry of ``msg_data['messages']``, or an
        empty list when the chat does not have exactly two participants, when
        `me` is not one of them, or when the other participant is aliased to
        None.

    Raises:
        KeyError: if a message lacks ``'sender_name'`` or ``'timestamp_ms'``.
    """
    if len(msg_data['participants']) != 2:
        # TODO add support for group chat data someday?
        return []

    # Sanity check that I'm in the chat
    # also get the name of the person I'm talking with
    participants = [un_alias(p['name']) for p in msg_data['participants']]
    if participants[0] == me:
        conversant = participants[1]
    elif participants[1] == me:
        conversant = participants[0]
    else:
        print('\'me\' not found in chat, add alias?')
        print('Participants:', msg_data['participants'])
        return []

    # Exclude the conversant if null (alias file says to remove them)
    if conversant is None:
        return []

    # Parse the chat log into a list of tuples
    log = []
    for msg in msg_data['messages']:
        # Resolve the alias into a local instead of writing it back into the
        # caller's dict, so parsing the same data twice cannot alias a name twice.
        sender = un_alias(msg['sender_name'])

        # note that Facebook saves timestamp_MS not S
        ms = msg['timestamp_ms']
        # Whole seconds and the millisecond remainder are handled separately to
        # avoid float rounding; the result is a naive datetime in UTC.
        sent_at = datetime.utcfromtimestamp(ms // 1000).replace(microsecond=ms % 1000 * 1000)

        # if there's no (textual) content, assume it's a sticker or photo or
        # video or whatever and count it as a single character
        content = msg.get('content')
        char_count = len(content) if isinstance(content, str) else 1

        log.append(Message(conversant, sender == me, char_count, sent_at))
    return log

def parse_dir(fb_dir_name, msg_type_dir):
    """Parse every chat under ``data/<fb_dir_name>/messages/<msg_type_dir>``.

    Each sub-folder is treated as one chat; every file in it whose name matches
    `json_rx` is loaded as JSON and passed to `parse_msg`.

    Args:
        fb_dir_name: Name of the Facebook export folder inside ``data/``.
        msg_type_dir: Name of the message category folder (e.g. ``'inbox'``).

    Returns:
        A single list containing everything `parse_msg` returned for the
        matching files, visited in sorted name order.
    """
    type_dir = os.path.join('data', fb_dir_name, 'messages', msg_type_dir)
    log = []
    for chat_name in sorted(os.listdir(type_dir)):
        chat_dir = os.path.join(type_dir, chat_name)
        # Stray files next to the chat folders are not chats; skip them
        # rather than failing when trying to list them as directories.
        if not os.path.isdir(chat_dir):
            continue
        for chat_fname in sorted(os.listdir(chat_dir)):
            if not json_rx.match(chat_fname):
                continue
            with open(os.path.join(chat_dir, chat_fname), encoding='utf-8') as f:
                msg_data = json.load(f)
            # extend in place: repeated `log = log + ...` copies the whole list every time
            log.extend(parse_msg(msg_data))
    return log

# Collect the messages of every Facebook export found in data/
log = []

for fb_dir_name in dirs:
    if not fb_dir_name_rx.match(fb_dir_name):
        continue
    messages_root = os.path.join('data', fb_dir_name, 'messages')
    # A matching name can also be a plain file (for example a downloaded archive
    # left next to the extracted folder) or an export without messages; skip those.
    if not os.path.isdir(messages_root):
        continue
    for msg_type_dir in ['inbox', 'archived_threads']:
        if os.path.isdir(os.path.join(messages_root, msg_type_dir)):
            log.extend(parse_dir(fb_dir_name, msg_type_dir))

# Save log with Facebook data to disk
# (create the output folder first so a fresh checkout does not fail here)
os.makedirs('dumps', exist_ok=True)
with open('dumps/msgs.pickle', 'wb') as handle:
    pickle.dump(log, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# read Discord data into custom format
# TODO

In [None]:
import pickle

# Load the data!
# NOTE: unpickling can execute arbitrary code; only load a dump this notebook wrote itself.
with open('dumps/msgs.pickle', 'rb') as pickled_log:
    messages = pickle.load(pickled_log)
print("Total Messages:", len(messages))

In [None]:
from datetime import datetime
import numpy as np

# msgs_by_day[name] is a list with one slot per day of the analysis window,
# holding the number of messages exchanged with `name` on that day
msgs_by_day = {}

# Analysis window; only messages strictly between these two instants are counted
start_date = datetime.strptime('1/1/13', "%m/%d/%y")
end_date = datetime.strptime('06/15/20', "%m/%d/%y")
delta = (end_date - start_date).days

for msg in messages:
    # Create the per-person counter on first sight, then fall through and count
    # this message too (previously the first message of each person was dropped).
    if msg.person not in msgs_by_day:
        msgs_by_day[msg.person] = [0] * delta
    if start_date < msg.timestamp < end_date:
        idx = (msg.timestamp - start_date).days
        msgs_by_day[msg.person][idx] += 1

In [None]:
from collections import OrderedDict
import pandas as pd

# Order people from most to fewest total messages (ties keep their existing order),
# then build a frame with one column per person and one row per day of the window.
msgs_by_day = OrderedDict(
    sorted(
        msgs_by_day.items(),
        key=lambda person_counts: sum(person_counts[1]),
        reverse=True,
    )
)
data = pd.DataFrame(msgs_by_day, index=pd.date_range(start_date, periods=delta))

In [None]:
from matplotlib import pyplot as plt

def numTalkedToPlot(data, min_messages=1, rolling_window=1):
    """Count, per row, how many columns reach `min_messages`, then smooth.

    Args:
        data: DataFrame of message counts (one column per person, one row per day).
        min_messages: Minimum number of messages on a day for a person to count
            as "talked to". The bound is inclusive, so the default of 1 counts
            any day with at least one message.
        rolling_window: Number of consecutive rows averaged by the rolling mean;
            1 leaves the counts unsmoothed.

    Returns:
        A Series indexed like `data` holding the (rolling mean of the) number of
        people talked to per row.
    """
    talkedTo = data >= min_messages
    return talkedTo.sum(axis=1).rolling(rolling_window).mean()

# Plot how many people were talked to on each day.
# Start from a cleared current pyplot figure.
plt.clf()
# Defaults are used here: min_messages=1 and rolling_window=1 (no smoothing)
toPlot = numTalkedToPlot(data)
ax = toPlot.plot(title="Number of People Talked to", figsize=(10,3))
ax.set_ylabel("Number of People")
# Keep a handle on the figure the axes belong to
fig = ax.get_figure()

In [None]:
from matplotlib import pyplot as plt

def everyonePlot(data, rolling_window=1):
    """Return the per-row total across all columns, smoothed by a rolling mean.

    Args:
        data: DataFrame of message counts (one column per person).
        rolling_window: Number of consecutive rows averaged together;
            1 leaves the totals unsmoothed.
    """
    totals_per_row = data.sum(axis=1)
    smoothed = totals_per_row.rolling(rolling_window).mean()
    return smoothed

# Plot the total number of messages per day across everyone.
# Start from a cleared current pyplot figure.
plt.clf()
toPlot = everyonePlot(data, rolling_window=1)
ax = toPlot.plot(title="Messaging Data")
ax.set_ylabel("Number of Messages")
ax.set_xlabel("Date")
# Anchor the y axis at zero and leave 10% headroom above the highest value
ax.set_ylim((0, toPlot.max()*1.1))
# Keep a handle on the figure the axes belong to
fig = ax.get_figure()

In [None]:
def cumMsgPlot(data, start, end):
    """Return running totals down the rows for the columns in ``[start, end)``.

    Args:
        data: DataFrame of message counts (one column per person); left unmodified.
        start: Position of the first column to keep.
        end: Position one past the last column to keep.
    """
    # cumsum builds a new frame, so the caller's data is never touched
    running_totals = data.cumsum(axis=0)
    return running_totals.iloc[:, start:end]

# Plot cumulative message counts for the first six columns of `data`.
# Start from a cleared current pyplot figure.
plt.clf()
# Columns 0..5; the title assumes `data` is ordered most-talked-to first -- confirm
# the cell that builds `data` has been run before this one.
toPlot = cumMsgPlot(data, start=0, end=6)
ax = toPlot.plot(title="Cumulative Messaging Data (Top 6 Most Talked To)", legend=True, figsize=(10,7))
ax.set_ylabel("Cumulative Number of Messages")
ax.set_xlabel("Date")
# Place the legend outside the axes, to the right of the plot area
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# Keep a handle on the figure the axes belong to
fig = ax.get_figure()

In [None]:
# Keep only the messages inside the analysis window; any tzinfo is dropped so the
# timestamps compare against the naive start_date / end_date
messages_in_range = [
    msg for msg in messages
    if start_date < msg.timestamp.replace(tzinfo=None) < end_date
]

df = pd.DataFrame(messages_in_range, columns=Message._fields)
df = df[["person", "sent_by_me"]].rename(columns={"sent_by_me": "sent"})
# Force a boolean dtype so `~` and `|` below are logical operations even when
# the frame is empty
df = df.astype({"sent": bool})

# Every message is either sent or received, so `total` is True on every row
df['received'] = ~df['sent']
df['total'] = df['sent'] | df['received']

# Summing the boolean columns per person yields message counts
grouped = df.groupby('person')
sent_received = grouped.sum().sort_values('total', ascending=False)
# First 15 rows by position (`.ix` no longer exists in current pandas)
toPlot = sent_received[["sent", "received"]].iloc[:15]

ax = toPlot.plot.bar(title="Total Messages Sent/Received (Top 15 Most Talked To)", stacked=True, color=['b', 'r'], figsize=(10, 5))
ax.set_ylabel("Number of Messages")
ax.set_xlabel("Person")

fig = ax.get_figure()