In [None]:
import os
import pickle

# check what data sources we have
# Every entry of data/ is listed here; the parsing cells below pick out the
# folders they understand by matching a name prefix (facebook-, discord-, google-).
# Working data sources:
#  - Facebook data dump in JSON format (the whole folder named 'facebook-YourName12345' in data/)
#  - Discord GDPR request data (the whole folder renamed to 'discord-YourUsername#1234' in data/)
#  - Google Takeout dump of Hangouts (the whole folder renamed to 'google-YourUsername' in data/)
dirs = os.listdir('data')

In [None]:
import json

# Read the saved alias table from aliases.json.
# Keys are names as they appear in a data source; values are the name to use
# instead (a null value is treated further down as "drop this person").
with open('aliases.json') as f:
    alias = json.loads(f.read())

# The (un-aliased) name that identifies "me" in every data source
me = 'cephcyn'

# translate from datasource name to unaliased name (if available)
def un_alias(name):
    """Look ``name`` up in the loaded alias table.

    Returns the mapped value when ``name`` is a key of ``alias`` (this may be
    ``None``), otherwise returns ``name`` unchanged.
    """
    return alias.get(name, name)

In [None]:
# initialize data formats
from collections import namedtuple

# One record per message: who the conversation is with, whether I sent it,
# how many characters it had, and when it was sent.
Message = namedtuple('Message', 'person sent_by_me char_count timestamp')

In [None]:
# Clear the log
# Overwrites dumps/msgs.pickle with an empty list so the parsing cells below
# start from a clean slate.

log = []
with open('dumps/msgs.pickle', 'wb') as handle:
    handle.write(pickle.dumps(log, protocol=pickle.HIGHEST_PROTOCOL))

In [None]:
# read Facebook Messenger data into custom format
import re
import json
from datetime import datetime, timezone

# Matches Facebook export folders in data/ (names beginning with 'facebook-')
fb_dir_name_rx = re.compile(r"^facebook-.*$")
# Matches file names ending in '.json'. Raw string: '\.' is an invalid escape
# sequence in a normal string literal and triggers a warning on recent Pythons.
json_rx = re.compile(r"^.*\.json$")

def parse_msg(msg_data):
    """Convert one Facebook Messenger chat (already-loaded JSON) into Message tuples.

    Args:
        msg_data: dict with a 'participants' list (each entry has a 'name') and a
            'messages' list (each entry has 'sender_name', 'timestamp_ms' and,
            for text messages, 'content').

    Returns:
        A list of Message tuples, one per entry of msg_data['messages']. An empty
        list is returned when the chat does not have exactly two participants,
        when `me` is not one of them, or when the other participant is aliased
        to None.

    The input dict is not modified.
    """
    if len(msg_data['participants']) != 2:
        return []

    # Sanity check that I'm in the chat
    # also get the name of the person I'm talking with
    participants = [un_alias(p['name']) for p in msg_data['participants']]
    if participants[0] == me:
        conversant = participants[1]
    elif participants[1] == me:
        conversant = participants[0]
    else:
        print('\'me\' not found in chat, add alias?')
        print('Participants:', msg_data['participants'])
        return []

    # Exclude the conversant if null (alias file says to remove them)
    if conversant is None:
        return []

    # Parse the chat log into a list of tuples
    log = []
    for msg in msg_data['messages']:
        sent_by_me = un_alias(msg['sender_name']) == me
        # note that Facebook saves timestamp_mS not S
        ms = msg['timestamp_ms']
        timestamp = datetime.fromtimestamp(ms // 1000, tz=timezone.utc).replace(
            microsecond=ms % 1000 * 1000
        )
        try:
            char_count = len(msg['content'])
        except (KeyError, TypeError):
            # if there's no content, assume it's a sticker or photo or video or whatever
            char_count = 1
        log.append(Message(conversant, sent_by_me, char_count, timestamp))
    return log

def parse_dir(fb_dir_name, msg_type_dir):
    """Parse every chat under data/<fb_dir_name>/messages/<msg_type_dir>.

    Args:
        fb_dir_name: name of the Facebook export folder inside data/.
        msg_type_dir: sub-folder of 'messages' to read (the caller passes
            'inbox' or 'archived_threads').

    Returns:
        A single list with the Message tuples from every '*.json' file found
        in each chat folder.
    """
    log = []
    type_path = f'data/{fb_dir_name}/messages/{msg_type_dir}'
    for chat_name in os.listdir(type_path):
        chat_path = f'{type_path}/{chat_name}'
        # Stray files next to the chat folders (e.g. .DS_Store) would make
        # os.listdir raise, so skip anything that is not a directory.
        if not os.path.isdir(chat_path):
            continue
        for chat_fname in os.listdir(chat_path):
            if json_rx.match(chat_fname):
                with open(f'{chat_path}/{chat_fname}') as f:
                    msg_data = json.load(f)
                # extend in place instead of rebuilding the list for every file
                log.extend(parse_msg(msg_data))
    return log

# Load already-existing logs!
with open('dumps/msgs.pickle', 'rb') as handle:
    log = pickle.load(handle)

# Walk every Facebook export folder and append whatever its inbox and
# archived threads contain.
for fb_dir_name in dirs:
    if not fb_dir_name_rx.match(fb_dir_name):
        continue
    msg_dir = os.listdir(f'data/{fb_dir_name}/messages')
    for msg_type_dir in ('inbox', 'archived_threads'):
        if msg_type_dir not in msg_dir:
            continue
        log = log + parse_dir(fb_dir_name, msg_type_dir)

# Save log with Facebook data to disk
with open('dumps/msgs.pickle', 'wb') as handle:
    pickle.dump(log, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# read Discord data dump (directly obtained from Discord) into custom format
# note that this parse will not give us names for people who aren't added as friends on Discord
import re
import json
import csv
from datetime import datetime

# Matches Discord export folders in data/ (names beginning with 'discord-')
dc_dir_name_rx = re.compile('^discord-.*$')
# Matches per-channel folders inside 'messages' whose names consist only of digits
msg_log_dir_rx = re.compile('^[0-9]+$')

def parse_msg(id_to_dc_uname, msg_log_dir):
    """Parse one Discord channel folder into a list of Message tuples.

    Args:
        id_to_dc_uname: dict mapping a Discord user id to 'username#discriminator'.
        msg_log_dir: name of the channel folder inside the export's 'messages' folder.

    Returns:
        A list of Message tuples. For every row of messages.csv two tuples are
        produced: the real message (sent_by_me=True) and a made-up reply of the
        same length and time (sent_by_me=False). An empty list is returned when
        channel.json's 'type' is not 1, when a recipient id is unknown, when
        `me` is not a recipient, or when the other recipient is aliased to None.

    NOTE: the export folder is taken from the module-level loop variable
    `dc_dir_name`, not from a parameter, so this only works while the driver
    loop below is running.
    """
    with open(f'data/{dc_dir_name}/messages/{msg_log_dir}/channel.json') as f:
        channel_info = json.load(f)

    # skip over chats that are group chats or server channels
    if not channel_info['type'] == 1:
        return []

    # Sanity check that I'm in the chat
    # also get the name of the person that I'm chatting with
    try:
        participants = [un_alias(id_to_dc_uname[p]) for p in channel_info['recipients']]
    except KeyError:
        # a recipient id has no entry in id_to_dc_uname
        print('at least one of the Discord DM participants is unknown (not a friend?)')
        print('Participants:', channel_info['recipients'])
        return []
    if participants[0] == me:
        conversant = participants[1]
    elif participants[1] == me:
        conversant = participants[0]
    else:
        print('\'me\' not found in chat, add alias?')
        print('Participants:', channel_info['recipients'])
        return []

    # Exclude the conversant if null (alias file says to remove them)
    if conversant is None:
        return []

    # Parse the chat log into a list of tuples
    log = []
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted message contents reach the reader untouched.
    with open(f'data/{dc_dir_name}/messages/{msg_log_dir}/messages.csv', newline='') as f:
        # note that Discord data dump only includes messages that I sent, not both-party messages
        for row in csv.DictReader(f):
            try:
                msg_time = datetime.strptime(row['Timestamp'], '%Y-%m-%d %H:%M:%S.%f%z')
            except ValueError:
                # Discord exports data format without ms sometimes??? not sure why
                msg_time = datetime.strptime(row['Timestamp'], '%Y-%m-%d %H:%M:%S%z')
            char_count = len(row['Contents'])
            # Discord logs only save messages sent by me
            log.append(Message(conversant, True, char_count, msg_time))
            # For basic graph sanity's sake, assume that for each message I sent on Discord, I got one back
            # this definitely isn't accurate, but it's the best I can do with the official data dump
            # (this is a BS message, we're pretending ppl always reply and same-day)
            log.append(Message(conversant, False, char_count, msg_time))
    return log

# Load already-existing logs!
with open('dumps/msgs.pickle', 'rb') as handle:
    log = pickle.load(handle)

for dc_dir_name in dirs:
    if not dc_dir_name_rx.match(dc_dir_name):
        continue

    # Get the friend username data
    with open(f'data/{dc_dir_name}/account/user.json') as f:
        dc_user_data = json.load(f)

    # maps {id: username#discriminator} i.e. {3243728943728:"Username#0001"}
    # start with my own ID ...
    my_username = dc_user_data['username']
    my_discriminator = dc_user_data['discriminator']
    id_to_dc_uname = {dc_user_data['id']: f'{my_username}#{my_discriminator}'}
    # ... then add one entry per friend
    for friend in dc_user_data['relationships']:
        friend_username = friend['user']['username']
        friend_discriminator = friend['user']['discriminator']
        id_to_dc_uname[friend['id']] = f'{friend_username}#{friend_discriminator}'

    # Get the sent-messages data (only the all-digit channel folders)
    msg_dir = os.listdir(f'data/{dc_dir_name}/messages')
    for msg_log_dir in msg_dir:
        if not msg_log_dir_rx.match(msg_log_dir):
            continue
        log = log + parse_msg(id_to_dc_uname, msg_log_dir)

# Save log with Discord dump data to disk
with open('dumps/msgs.pickle', 'wb') as handle:
    pickle.dump(log, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# read Discord data scrape (using DiscordChatExporter) into custom format
# TODO

In [None]:
# read Google Takeout data export of Hangouts chat logs into custom format
import re
import json
import csv
from datetime import datetime, timezone

# Matches Google Takeout folders in data/ (names beginning with 'google-')
gg_dir_name_rx = re.compile('^google-.*$')

# Load already-existing logs!
# (dumps/msgs.pickle is the file the earlier cells of this notebook write)
with open('dumps/msgs.pickle', 'rb') as handle:
    log = pickle.load(handle)

def parse_msg(msg_data, id_to_gg_uname):
    """Convert one Hangouts conversation (already-loaded JSON) into Message tuples.

    Args:
        msg_data: one entry of the export's 'conversations' list; read here are
            msg_data['conversation']['conversation'] ('type', 'participant_data')
            and msg_data['events'].
        id_to_gg_uname: dict mapping a gaia_id to a display name (or None).

    Returns:
        A list of Message tuples, one per event. An empty list is returned when
        the conversation type is not 'STICKY_ONE_TO_ONE', when it does not have
        exactly two participants, when `me` is not one of them, or when the
        other participant resolves to None.

    The input dict is not modified.
    """
    conversation = msg_data['conversation']['conversation']

    # TODO add support for 2-person group chat data?
    if not conversation['type'] == 'STICKY_ONE_TO_ONE':
        return []
    # Exactly two participants are needed below; fewer would make the
    # participants[1] lookup fail.
    if len(conversation['participant_data']) != 2:
        return []

    # Sanity check that I'm in the chat
    participants = [
        un_alias(id_to_gg_uname[p['id']['gaia_id']])
        for p in conversation['participant_data']
    ]
    if participants[0] == me:
        conversant = participants[1]
    elif participants[1] == me:
        conversant = participants[0]
    else:
        print('\'me\' not found in chat, add alias?')
        print('Participants:', conversation['participant_data'])
        return []

    # Exclude the conversant if null (alias file says to remove them)
    if conversant is None:
        return []

    # Parse the chat log into a list of tuples
    log = []
    for msg in msg_data['events']:
        sent_by_me = un_alias(id_to_gg_uname[msg['sender_id']['gaia_id']]) == me
        # note that Google saves timestamp_uS, not S
        us = int(msg['timestamp'])
        timestamp = datetime.fromtimestamp(us // 1000000, tz=timezone.utc).replace(
            microsecond=us % 1000000
        )
        try:
            segments = msg['chat_message']['message_content']['segment']
            char_count = sum(len(s['text']) for s in segments)
        except (KeyError, TypeError):
            # if there's no content, assume it's a photo or video or Hangouts call or whatever
            char_count = 1
        log.append(Message(conversant, sent_by_me, char_count, timestamp))
    return log

for gg_dir_name in dirs:
    if not gg_dir_name_rx.match(gg_dir_name):
        continue

    with open(f'data/{gg_dir_name}/Hangouts/Hangouts.json') as f:
        gg_user_data = json.load(f)

    # Build up the mapping from gaia_id->name
    id_to_gg_uname = {}
    for msg_data in gg_user_data['conversations']:
        participants = msg_data['conversation']['conversation']['participant_data']
        # TODO double check if we should use chat_id or gaia_id?
        for p in participants:
            gaia_id = p['id']['gaia_id']
            # only add a mapping if we do not have a proper mapping yet
            # (missing entry and entry stored as None are treated alike)
            if id_to_gg_uname.get(gaia_id) is None:
                id_to_gg_uname[gaia_id] = p.get('fallback_name')

    # Parse messages
    for conversation in gg_user_data['conversations']:
        log = log + parse_msg(conversation, id_to_gg_uname)

# Save log with Google dump data to disk
with open('dumps/msgs.pickle', 'wb') as handle:
    pickle.dump(log, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# Load the data!
# NOTE: unpickling can execute arbitrary code, so only load a msgs.pickle
# that was produced by the parsing cells of this notebook.
with open('dumps/msgs.pickle', 'rb') as handle:
    messages = pickle.load(handle)
print("Total Messages:", len(messages))

In [None]:
# See the list of people included
# print(set([m[0] for m in messages]))

In [None]:
from datetime import datetime, timedelta
import pytz

# Set bounds for data structures for the graphs
# start_date / end_date are timezone-aware (UTC) and delimit the analysed window;
# n_days is the number of whole days between them.
start_date = datetime.strptime('4/01/20 +00:00', "%m/%d/%y %z")
end_date = datetime.strptime('4/21/20 +00:00', "%m/%d/%y %z")
n_days = (end_date - start_date).days
# number of slots allocated for each person's message-length histogram
n_lengths = 2000

# Set the granularity for time-based graphs
# (n_timechunks = how many delta_timechunks fit in one day)
delta_timechunks = timedelta(minutes=5)
n_timechunks = timedelta(days=1) // delta_timechunks

# Set your local timezone
# NOTE(review): the tz database's 'Etc/GMT±N' names use the POSIX sign
# convention, so 'Etc/GMT-7' is 7 hours AHEAD of UTC (UTC+7) — confirm this is
# the offset you intend.
local_tz = pytz.timezone('Etc/GMT-7')

In [None]:
from datetime import datetime
import numpy as np
from collections import OrderedDict
import pandas as pd

person_start_date = {} # person_start_date[name] = datetime of first contact
msgs_by_day = {} # msgs_by_day[name] = [] * n_days, populated with message counts
msgs_by_time = {} # msgs_by_time[name] = [] * n_timechunks, populated with message counts
msgs_by_length = {} # msgs_by_length[name] = [] * n_lengths, populated with message counts

for msg in messages:
    person = msg.person

    # fill in person_start_date (earliest timestamp seen for this person)
    if person not in person_start_date or msg.timestamp < person_start_date[person]:
        person_start_date[person] = msg.timestamp

    # Give every person a zeroed row in each histogram, even if none of their
    # messages fall inside the window. Allocation is kept separate from counting
    # so that the message that triggers the allocation is counted as well.
    if person not in msgs_by_day:
        msgs_by_day[person] = [0] * n_days
        msgs_by_time[person] = [0] * n_timechunks
        msgs_by_length[person] = [0] * n_lengths

    # Only messages inside [start_date, end_date) contribute to the histograms
    if not (start_date <= msg.timestamp < end_date):
        continue

    # fill in msgs_by_day
    msgs_by_day[person][(msg.timestamp - start_date).days] += 1

    # fill in msgs_by_time, using the configured local_tz rather than whatever
    # timezone the machine running the notebook happens to be in
    localtime = msg.timestamp.astimezone(local_tz)
    midnight = localtime.replace(hour=0, minute=0, second=0, microsecond=0)
    msgs_by_time[person][(localtime - midnight) // delta_timechunks] += 1

    # fill in msgs_by_length; anything at or beyond n_lengths characters goes in
    # the last slot instead of raising IndexError
    msgs_by_length[person][min(msg.char_count, n_lengths - 1)] += 1

# Order people by total message count, most-talked-to first
msgs_by_day = OrderedDict(sorted(msgs_by_day.items(), key=lambda i: -sum(i[1])))
msgs_by_time = OrderedDict(sorted(msgs_by_time.items(), key=lambda i: -sum(i[1])))
msgs_by_length = OrderedDict(sorted(msgs_by_length.items(), key=lambda i: -sum(i[1])))

msg_count_data = pd.DataFrame(msgs_by_day, index=pd.date_range(start_date, periods=n_days))
msg_time_data = pd.DataFrame(msgs_by_time, index=pd.timedelta_range(start='0', periods=n_timechunks, freq=delta_timechunks))
msg_length_data = pd.DataFrame(msgs_by_length)

In [None]:
# List everyone in order of first contact, earliest first
by_first_contact = sorted(person_start_date.items(), key=lambda item: item[1])
for key, value in by_first_contact:
    print(key, '\t: ', value)

In [None]:
# dropout any people we don't want to include in the results
# del msg_count_data['SomeName']
# del msg_time_data['SomeName']

In [None]:
from matplotlib import pyplot as plt

def numTalkedToPlot(data, min_messages=1, rolling_window=1):
    """Count, per row, how many columns (people) reach at least `min_messages`.

    Args:
        data: DataFrame with one column per person and one row per time bucket.
        min_messages: smallest count that makes a person count as "talked to"
            (inclusive, so the default of 1 counts anyone with a message).
        rolling_window: window size for the rolling mean applied to the counts.

    Returns:
        A Series indexed like `data` with the rolling mean of the per-row counts.
    """
    talked_to = data >= min_messages
    return talked_to.sum(axis=1).rolling(rolling_window).mean()

# People talked to per day
plt.clf()
toPlot = numTalkedToPlot(msg_count_data, rolling_window=1)
ax = toPlot.plot(title="Number of People Talked to", figsize=(10,3))
ax.set(ylabel="Number of People")
fig = ax.get_figure()

In [None]:
# People talked to per time-of-day chunk
plt.clf()
toPlot = numTalkedToPlot(msg_time_data, rolling_window=1)
ax = toPlot.plot(title="Number of People ever Talked To At Time", figsize=(10,3))
ax.set(ylabel="Number of People")
fig = ax.get_figure()

In [None]:
from matplotlib import pyplot as plt

def everyonePlot(data, rolling_window=1):
    """Sum all columns of `data` row by row and smooth with a rolling mean.

    Args:
        data: DataFrame with one column per person.
        rolling_window: window size for the rolling mean.

    Returns:
        A Series indexed like `data`.
    """
    row_totals = data.sum(axis=1)
    smoothed = row_totals.rolling(rolling_window).mean()
    return smoothed

# Total messages per day, everyone combined
plt.clf()
toPlot = everyonePlot(msg_count_data, rolling_window=1)
ax = toPlot.plot(title="Message Count Data", figsize=(10,3))
ax.set(ylabel="Number of Messages", xlabel="Date")
# leave 10% headroom above the tallest point
ax.set_ylim(0, toPlot.max()*1.1)
fig = ax.get_figure()

In [None]:
# Total messages per time-of-day chunk, everyone combined
plt.clf()
toPlot = everyonePlot(msg_time_data, rolling_window=1)
ax = toPlot.plot(title="Message Timestamp Data", figsize=(10,3))
ax.set(ylabel="Number of Messages", xlabel="Time")
# leave 10% headroom above the tallest point
ax.set_ylim(0, toPlot.max()*1.1)
fig = ax.get_figure()

In [None]:
# Total messages per message length, everyone combined
plt.clf()
toPlot = everyonePlot(msg_length_data, rolling_window=1)
ax = toPlot.plot(title="Message Length Data", figsize=(10,3))
ax.set_ylabel("Number of Messages")
# the rows of msg_length_data are message lengths, not times
ax.set_xlabel("Message Length")
ax.set_ylim((0, toPlot.max()*1.1))
fig = ax.get_figure()

In [None]:
def cumMsgPlot(data, start, end):
    """Return the running total down each column for columns [start:end).

    Args:
        data: DataFrame with one column per person; it is left unmodified.
        start: first column position to keep.
        end: column position to stop at (exclusive).

    Returns:
        A new DataFrame holding the cumulative sums of the selected columns.
    """
    running_totals = data.cumsum(axis=0)
    return running_totals.iloc[:, start:end]

# Cumulative message counts for the first top_n columns
plt.clf()
top_n = 6
toPlot = cumMsgPlot(msg_count_data, start=0, end=top_n)
ax = toPlot.plot(title=f"Cumulative Messaging Data (Top {top_n} Most Talked To)", legend=True, figsize=(10,7))
ax.set(ylabel="Cumulative Number of Messages", xlabel="Date")
# put the legend outside the axes, to the right
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = ax.get_figure()

In [None]:
def avgMsgPlot(data, start, end, rolling_window=1):
    """Rolling mean of columns [start:end) of `data`.

    Args:
        data: DataFrame with one column per person.
        start: first column position to keep.
        end: column position to stop at (exclusive).
        rolling_window: window size for the rolling mean.

    Returns:
        A DataFrame with the selected columns smoothed row-wise.
    """
    selected = data.iloc[:, start:end]
    smoothed = selected.rolling(rolling_window).mean()
    return smoothed

# 30-row rolling average of daily message counts for the first top_n columns
plt.clf()
top_n = 6
toPlot = avgMsgPlot(msg_count_data, start=0, end=top_n, rolling_window=30)
ax = toPlot.plot(title=f"Active Messaging Data (Top {top_n} Most Talked To)", legend=True, figsize=(10,7))
ax.set(ylabel="Avg Number of Messages per Day", xlabel="Date")
# put the legend outside the axes, to the right
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = ax.get_figure()

In [None]:
def indivMsgPlotTopN(data, start, end, rolling_window=12):
    """Rolling mean of columns [start:end) of `data` (default window: 12 rows).

    Args:
        data: DataFrame with one column per person.
        start: first column position to keep.
        end: column position to stop at (exclusive).
        rolling_window: window size for the rolling mean.

    Returns:
        A DataFrame with the selected columns smoothed row-wise.
    """
    selected = data.iloc[:, start:end]
    smoothed = selected.rolling(rolling_window).mean()
    return smoothed

# Time-of-day profile for the first top_n columns
plt.clf()
top_n = 6
toPlot = indivMsgPlotTopN(msg_time_data, start=0, end=top_n)
ax = toPlot.plot(title=f"Message Time Data (Top {top_n} Most Talked To)", legend=True, figsize=(10,7))
# the plotted values are a rolling mean of per-chunk counts, not a running total
ax.set_ylabel("Number of Messages (rolling mean)")
ax.set_xlabel("Time")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = ax.get_figure()

In [None]:
def indivMsgPlot(data, select, rolling_window=12):
    """Rolling mean of the named columns of `data`, in the order given.

    Args:
        data: DataFrame with one column per person.
        select: iterable of column names to keep; an unknown name raises KeyError.
        rolling_window: window size for the rolling mean.

    Returns:
        A DataFrame with only the selected columns, smoothed row-wise.
    """
    positions = list(map(data.columns.get_loc, select))
    chosen = data.iloc[:, positions]
    return chosen.rolling(rolling_window).mean()

# Time-of-day profile for a hand-picked set of people
plt.clf()
# replace these placeholders with names that appear as columns of msg_time_data
select = ['APerson', 'BPerson', 'CPerson', 'DPerson']
toPlot = indivMsgPlot(msg_time_data, select=select)
ax = toPlot.plot(title=f"Message Time Data", legend=True, figsize=(10,7))
# the plotted values are a rolling mean of per-chunk counts, not a running total
ax.set_ylabel("Number of Messages (rolling mean)")
ax.set_xlabel("Time")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = ax.get_figure()

In [None]:
def indivMsgLengthPlot(data, select, rolling_window=1):
    """Rolling mean of the named columns of `data`, truncated to its first rows.

    Args:
        data: DataFrame with one column per person.
        select: iterable of column names to keep; an unknown name raises KeyError.
        rolling_window: window size for the rolling mean.

    Returns:
        A DataFrame with the selected columns, cut to the first N rows and
        smoothed, where N is the largest value found in the selected columns.
    """
    positions = list(map(data.columns.get_loc, select))
    chosen = data.iloc[:, positions]
    # NOTE(review): the row cut-off is the largest *count* in the selection, not
    # the largest row position that holds data — confirm this is intended.
    row_limit = chosen.max().max()
    trimmed = chosen.iloc[:row_limit, :]
    return trimmed.rolling(rolling_window).mean()

# Message-length profile for a hand-picked set of people
plt.clf()
# replace these placeholders with names that appear as columns of msg_length_data
select = ['APerson', 'BPerson', 'CPerson', 'DPerson']
toPlot = indivMsgLengthPlot(msg_length_data, select=select, rolling_window=10)
ax = toPlot.plot(title=f"Message Length Data", legend=True, figsize=(10,7))
# the plotted values are a rolling mean of per-length counts, not a running total
ax.set_ylabel("Number of Messages (rolling mean)")
ax.set_xlabel("Message Length")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = ax.get_figure()

In [None]:
def normalizedIndivMsgPlot(data, select, rolling_window=12):
    """Rolling mean of the named columns, each rescaled to sum to 1.

    Args:
        data: DataFrame with one column per person.
        select: iterable of column names to keep; an unknown name raises KeyError.
        rolling_window: window size for the rolling mean.

    Returns:
        A DataFrame in which every selected column has been smoothed and then
        divided by its own (post-smoothing) total.
    """
    positions = list(map(data.columns.get_loc, select))
    smoothed = data.iloc[:, positions].rolling(rolling_window).mean()

    def _as_share(column):
        # divide a column by its own total
        return column / column.sum()

    return smoothed.apply(_as_share, axis=0)

# Normalised time-of-day profile for a hand-picked set of people
plt.clf()
# replace these placeholders with names that appear as columns of msg_time_data
select = ['APerson', 'BPerson', 'CPerson', 'DPerson']
toPlot = normalizedIndivMsgPlot(msg_time_data, select=select)
ax = toPlot.plot(title=f"Message Time Data (Normalized)", legend=True, figsize=(10,7))
ax.set(ylabel="Message Count (Normalized)", xlabel="Time")
# put the legend outside the axes, to the right
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = ax.get_figure()

In [None]:
# Keep only the messages strictly between start_date and end_date
messages_in_range = [m for m in messages if start_date < m.timestamp < end_date]

# One row per message: who it was with and whether I sent it
df = pd.DataFrame(messages_in_range, columns=Message._fields)
df = df[["person", "sent_by_me"]].rename(columns={"sent_by_me": "sent"})

# 'received' is the complement of 'sent'; 'total' is True for every row, so the
# per-person sum of 'total' is that person's message count
df['received'] = ~df["sent"]
df['total'] = df['sent'] | df['received']

grouped = df.groupby('person')
sent_received = grouped.sum().sort_values('total', ascending=False)
# .ix was removed from pandas; .iloc takes the first 15 rows by position
toPlot = sent_received[["sent", "received"]].iloc[:15, :]

ax = toPlot.plot.bar(title="Total Messages Sent/Received (Top 15 Most Talked To)", stacked=True, color=['b', 'r'], figsize=(10, 5))
ax.set_ylabel("Number of Messages")
ax.set_xlabel("Person")

fig = ax.get_figure()