# Analysis and visualizations of my Facebook Messenger chat data

In [None]:
import pandas as pd
from numpy import int64, polyfit, nan
from math import floor, ceil
import matplotlib.pyplot as plt
from wordcloud import WordCloud
%matplotlib inline
# Name of the account owner as it appears in the 'sender' column;
# used throughout the notebook to select the messages I sent
my_name = 'Andreas Vija'

In [None]:
# Read the exported messages and replace missing values with empty strings
csv_msgs = pd.read_csv('data.csv')
csv_msgs = csv_msgs.fillna('')

# Quick look at the columns and the first rows
csv_msgs.info()
csv_msgs.head()

In [None]:
# Working copy of the raw data with 'time' converted to datetimes (second resolution)
all_msgs = csv_msgs.assign(time=csv_msgs['time'].astype('datetime64[s]'))

In [None]:
# Number of messages per week over the whole history
df = (
    all_msgs[['time', 'text']]
    .set_index('time')
    .resample('w')
    .count()
)

df.plot();

There seems to be a large unnatural dip in early 2017 and two peaks in mid-2016 and Jan 2017

Only <5% of my messages were sent before 2015, thus leaving out 2010-2014 helps make some graphs easier to read while losing very little data

In [None]:
# Keep only the messages sent after the start of 2015.
# The raw 'time' column is numeric, so the cutoff is expressed as an epoch timestamp.
# (pd.to_datetime's infer_datetime_format argument is deprecated and has no effect
# on a single string, so it is dropped here.)
_2015 = pd.to_datetime('2015-01-01 00:00:00').timestamp()

# .copy() makes the filtered frame independent of csv_msgs, so assigning the
# converted column below does not trigger a SettingWithCopyWarning
most_msgs = csv_msgs[csv_msgs['time'] > int(_2015)].copy()
most_msgs['time'] = most_msgs['time'].astype('datetime64[s]')

## Chats with the most messages
As evident later in certain "chat activity over time" graphs, big chunks of messages are actually missing from multiple chats, making these message counts not accurate, although they paint a fair enough picture of the most active chats

In [None]:
# One group per chat, identified by its thread title and participant list
chats = all_msgs[['thread', 'names', 'groupchat']].copy()
chats = chats.groupby(['thread', 'names'], as_index=False)

# 'types' holds the groupchat flag of every chat, 'counts' its number of messages
types = chats.max()[['groupchat']]
counts = chats.count()
counts = counts.rename(columns={'groupchat': 'message count'})

# Put flag and count side by side and order the chats from most to least active
chats = pd.concat([types, counts], axis=1).sort_values('message count', ascending=False)
chats = chats.reset_index()[['thread', 'groupchat', 'names', 'message count']]

chats[['thread', 'groupchat', 'message count']].head(20)

5 chats with exactly/almost exactly 10,000 messages is suspicious

## People with the most messages in this dataset

In [None]:
# Number of available messages per sender, most active senders first
df = (
    all_msgs[['sender', 'groupchat']]
    .copy()
    .groupby('sender')
    .count()
    .rename(columns={'groupchat': 'messages_available'})
    .sort_values('messages_available', ascending=False)
)
df.head(20)

## The activity I have recorded in various chat types through time

In [None]:
# Monthly message counts since 2015: overall, per chat type, and sent by me
df1 = most_msgs.copy().set_index('time')

# Each subset keeps only the 'text' column, renamed to the label shown in the legend
df2 = df1.loc[df1['groupchat'] == True, ['text']].rename(columns={'text': 'Groupchat messages'})
df3 = df1.loc[df1['groupchat'] == False, ['text']].rename(columns={'text': 'Private messages'})
df4 = df1.loc[df1['sender'] == my_name, ['text']].rename(columns={'text': 'My sent messages'})
df1 = df1[['text']].rename(columns={'text': 'All messages'})

# Stacking the frames leaves every row with a value in exactly one column,
# so counting per month yields one series per category
df1 = pd.concat([df1, df2, df3, df4]).resample('M').count()

df1.plot();

* The peak in late 2015 is a new groupchat with two of my friends, the following dip and the continued low of groupchat messages is roughly 10k missing messages from the same chat and many messages missing from another groupchat.
* The peak in Jul-Aug 2016 is the groupchat of a high school summer program in USA
* The general low in Apr-Aug 2017 is roughly 10k missing messages from a single private conversation

## My activity by the time of day

In [None]:
# Messages I sent, per 5-minute slot of the day, summed over all days.
# 'groupchat' only serves as a column to count rows with.
df = all_msgs.loc[all_msgs['sender'] == my_name, ['time', 'groupchat']]

df = df.set_index('time')
# Count the messages in every 5-minute bin. The previous sum()-then-count()
# combination counted bins rather than messages: since pandas 0.22 empty bins
# sum to 0 instead of NaN, which made every slot of the day come out equal.
df = df.resample('5min').count()
df.index = df.index.time
df = df.reset_index()

# Add up the per-bin counts of all days that share the same time of day
df = df.groupby('index').sum()
df = df.rename(columns={'groupchat': 'sent_messages'})

df.plot();

Facebook provided all messages in either UTC+02 or UTC+03. ```format.py``` saved time without timezone information and pandas by default also does not concern itself with timezones. Despite that, without shifting the time forward about 3 hours, the following graph makes no sense as it would show an initial sharp rise in activity at around 4-5 in the morning while I never wake up that early. The same applies to the following graph. 

## Messages by time of day every day

In [None]:
# One faint line per calendar day showing how many messages I sent in each hour
df = all_msgs.loc[all_msgs['sender'] == my_name, ['time', 'groupchat']]
df = df.set_index('time')
# count() rather than sum(): summing the 'groupchat' flag would only count
# the messages sent to group chats instead of all sent messages
df = df.resample('H').count()
# Rows are the hours of the day, columns the individual dates
pivot = df.pivot_table('groupchat', index=df.index.time, columns=df.index.date)
pivot.plot(legend=False, alpha=0.10);

There does not seem to be any consistent baseline, most activity seems to come in spikes

## My activity by the day of the week

In [None]:
# Messages I sent, grouped by day of the week
df1 = all_msgs[all_msgs['sender'] == my_name]
df1 = df1[['time', 'groupchat']]
df1 = df1.rename(columns={'groupchat': 'sent_messages', 'time': 'weekday'})
df1 = df1.set_index('weekday')

# df2 carries the numeric weekday, used only to put the bars in calendar order
df2 = df1.copy()
df2['weekday_number'] = df2.index.weekday
df2 = df2[['weekday_number']]

# day_name() replaces the weekday_name attribute that was removed in pandas 1.0.
# The index name is set explicitly because the groupby calls below rely on it.
df1.index = df1.index.day_name().rename('weekday')
df2.index = df2.index.day_name().rename('weekday')

df1 = df1.groupby('weekday').count()
df2 = df2.groupby('weekday').max()

df1 = pd.concat([df1, df2], axis=1)

df1 = df1.sort_values('weekday_number')
df1 = df1[['sent_messages']]

# Leave some headroom above the tallest bar
maximum = df1['sent_messages'].values.max()
df1.plot(kind='bar', ylim=(0, maximum + 3000));

Surprisingly, my activity is not the highest on the weekends, but on Mondays and Sundays (although the difference is not big).

## Words per message I send over time

In [None]:
# Weekly average of words per message I sent, with a linear trend line
df = most_msgs.loc[most_msgs['sender'] == my_name, ['time', 'text']].copy()
# Words are separated by single spaces; an empty message counts as one (empty) word
df['word_average'] = df['text'].str.strip().str.replace('  ', ' ').str.split(' ').apply(len)
df = df[['time', 'word_average']]

df = df.set_index('time')
df = df.resample('w').mean()
df = df.reset_index()

# Epoch seconds of every week. Converting through datetime64[s] is independent of
# the column's resolution; dividing raw integers by 10 ** 9 is only correct for
# nanosecond data and yields 0 for a datetime64[s] column.
timestamps = df['time'].values.astype('datetime64[s]').astype(int64)

# Weeks without any message have a NaN average and must not enter the fit
has_data = df['word_average'].notna().values
a, b = polyfit(timestamps[has_data], df['word_average'].values[has_data], 1)

# Evaluate the fitted line at exactly the plotted weeks so the lengths always match
x = timestamps
trend = pd.Series(a * x + b, name='trend')

df = pd.concat([df, trend], axis=1)
df = df.set_index('time')

# Series.max() skips NaN, unlike .values.max()
maximum = df['word_average'].max()
df.plot(ylim=(0, maximum + 1));

## Relationship between messages in a chat and the average number of words in one message

In [None]:
# For every thread: number of messages ('word_count' column) against the
# average number of space-separated words per message
df1 = all_msgs[['thread', 'text']]
df2 = df1.copy()

df1 = df1.rename(columns={'text': 'word_count'}).groupby('thread').count()

df2['text'] = df2['text'].str.split(' ').apply(len)
df2 = df2.rename(columns={'text': 'word_average'}).groupby('thread').mean()

df1 = pd.concat([df1, df2], axis=1)

# Same scatter twice: linear axes on the left, log-log axes on the right
f, axarr = plt.subplots(1, 2, figsize=(13,4))
axarr[0].scatter(x=df1['word_count'], y=df1['word_average'])
axarr[1].scatter(x=df1['word_count'], y=df1['word_average'])
axarr[1].set_xscale('log')
axarr[1].set_yscale('log')

As the number of messages sent increases, the average number of words per message converges to somewhere between 4 and 5.

## Most common words for all messages and for me

In [None]:
# Running word tally shared by read_words and get_word_counts
words = dict()
# Characters stripped from the end of a word
punctuation = ['.', ',', ';', '!', '?', ')']


def read_words(message):
    """Add the words of one message to the module-level ``words`` tally.

    The message is lower-cased, line breaks become spaces and the text is split
    on single spaces. Trailing punctuation and leading '(' are removed from each
    word, but a word is never shortened below one character. An empty message
    contributes one empty-string word.
    """
    global words, punctuation
    cleaned = message.lower().strip().replace('\r', ' ').replace('\n', ' ').replace('  ', ' ')

    for word in cleaned.split(' '):

        while len(word) > 1 and word[-1] in punctuation:
            word = word[:-1]
        while len(word) > 1 and word[0] == '(':
            word = word[1:]

        words[word] = words.get(word, 0) + 1


def get_word_counts(df):
    """Return the word counts of all messages in ``df['text']``.

    The result is a DataFrame indexed by word with a single 'count' column,
    sorted by descending count. An input without messages yields an empty
    frame that still has the 'count' column.
    """
    global words, punctuation

    try:
        for message in df['text']:
            read_words(message)
        counts = pd.Series(words, dtype='int64', name='count')
    finally:
        # Always start the next call from an empty tally, even after an error
        words = dict()

    return counts.to_frame().sort_values('count', ascending=False)

In [None]:
# Most common words across all messages ...
df1 = get_word_counts(all_msgs[['text']])
print(df1.head(20))
print()

# ... and across the messages I sent
df2 = get_word_counts(all_msgs.loc[all_msgs['sender'] == my_name, ['text']])
print(df2.head(20))

# Keep the global counts for the later "characteristic words" analysis
all_word_counts = df1.copy()

Only significant differences are my fondness of ":D" and that I send non-text data (images, stickers, etc., represented by empty string) somewhat less.

## Activity of the biggest chats over time

In [None]:
chats = chats.copy()

# Split the chat table by type; 'chats' keeps only the private conversations
groupchats = chats[chats['groupchat'] == True]
chats = chats[chats['groupchat'] == False]

# [thread, names] pairs of the 10 biggest private chats followed by the 10 biggest groupchats
important_chats = [
    pair
    for frame in (chats, groupchats)
    for pair in frame[['thread', 'names']][:10].values.tolist()
]

In [None]:
# Figures waiting to be drawn: tuples of (image or plot data, title, optional plot type)
queue = []

# helper function to arrange images more compactly
def display_queue_as_grid(figure_type, columns=3):
    """Draw and consume everything in the module-level ``queue`` as a grid.

    Each queue entry is a tuple ``(data, title)`` for images or
    ``(data, title, plot_kind)`` for plots. Entries are removed from the queue
    as they are drawn; unused cells of the last row are hidden.

    figure_type -- 'image' shows ``data`` with imshow, anything else calls
                   ``data.plot`` with the entry's plot kind
    columns     -- number of figures per row
    """
    rows = ceil(len(queue) / columns)

    for _ in range(rows):
        # squeeze=False keeps the axes two-dimensional, so indexing also works for columns=1
        _, axarr = plt.subplots(1, columns, figsize=(18, 5), squeeze=False)

        for ax in axarr[0]:

            if not queue:
                # nothing left to draw: hide the empty cell
                ax.set_axis_off()
                continue

            data = queue.pop(0)

            if figure_type == 'image':
                ax.imshow(data[0], interpolation='bilinear')
                ax.set_axis_off()
            else:
                # use the entry's own title instead of an unrelated global variable
                data[0].plot(kind=data[2], ax=ax, title=data[1])

            ax.set_title(data[1])

        plt.show()

In [None]:
def create_timeseries(messages, title, people):
    """Queue an activity-over-time plot for one chat.

    messages -- the chat's messages ('time', 'sender' and 'groupchat' columns are used)
    title    -- title stored with the queued plot
    people   -- comma-separated names of the participants, not including me

    Chats with fewer than 12 listed participants get a monthly area plot with one
    series per person (me included); bigger chats get a weekly line plot of all
    sent messages.
    """
    participants = people.split(',')

    if len(participants) >= 12:
        # too many people for separate series: total number of messages per week
        weekly = messages.set_index('time')[['groupchat']].sort_index()
        weekly = weekly.rename(columns={'groupchat': 'Sent messages'})
        weekly = weekly.resample('w').count()
        queue.append((weekly, title, 'line'))
        return

    # one single-column frame per person, the column being named after that person
    per_person = [
        messages[messages['sender'] == person]
        .set_index('time')[['groupchat']]
        .sort_index()
        .rename(columns={'groupchat': person})
        for person in participants + [my_name]
    ]

    monthly = pd.concat(per_person).resample('M').count()
    queue.append((monthly, title, 'area'))

In [None]:
for data in important_chats:
    thread, names = data
    # A chat is identified by its thread title together with its participant list.
    # Previously the thread filter was overwritten by the names filter and had no effect.
    chat = all_msgs[(all_msgs['thread'] == thread) & (all_msgs['names'] == names)]

    # Shorten very long thread titles so they fit above a plot
    title = thread[:50] + '...' if len(thread) > 50 else thread

    create_timeseries(chat, title, names)

display_queue_as_grid('plot')

* Missing chunks of messages can be observed in many chats here.
* Some chat data also starts later than the chats actually did.

## Average words per message by people in different chats

In [None]:
def print_average_wpm(messages, title, people):
    """Print the average number of words per message for everyone in a chat.

    messages -- the chat's messages ('sender' and 'text' columns are used)
    title    -- heading printed above the table
    people   -- comma-separated names of the participants, not including me

    Chats with more than 12 listed participants are skipped because there would
    be too many people to display.
    """
    participants = people.split(',')

    if len(participants) > 12:
        return

    participants.append(my_name)
    averages = dict()

    for person in participants:
        sent = messages[messages['sender'] == person]
        # words are separated by single spaces; an empty message counts as one word
        word_counts = sent['text'].str.strip().str.replace('  ', ' ').str.split(' ').apply(len)
        # the mean is taken on a one-column frame so the printed table keeps
        # its 'word_average' column header
        averages[person] = round(word_counts.to_frame('word_average').mean(), 2)

    df = pd.DataFrame.from_dict(averages, orient='index')

    print('###' + title + ':' + '\n')
    print(df)
    print()

In [None]:
for data in important_chats:
    thread, names = data
    # A chat is identified by its thread title together with its participant list.
    # Previously the thread filter was overwritten by the names filter and had no effect.
    chat = all_msgs[(all_msgs['thread'] == thread) & (all_msgs['names'] == names)]

    # Shorten very long thread titles for the printed heading
    title = thread[:50] + '...' if len(thread) > 50 else thread

    print_average_wpm(chat, title, names)

No significant data exists for other people, but as one might expect, my wordiness seems to depend greatly on the chat

## Most characteristic words of the biggest chats
(Calculated as words occurring much more often in the chat as compared to the entire dataset)

In [None]:
# Words that occur at least 6 times in the whole dataset
important_word_counts = all_word_counts.loc[all_word_counts['count'] >= 6]
# Note: the total is taken over ALL words, not only the frequent ones
total_important_words = all_word_counts['count'].sum()
# Share of every frequent word among all words in the dataset
word_frequencies = (important_word_counts['count'] / total_important_words).rename('global_frequency')

def remove_longs(word):
    """Return ``word`` unchanged if it is shorter than 18 characters, otherwise NaN.

    Very long "words" such as URLs ruin word clouds; returning NaN lets the
    caller discard them with dropna.
    """
    return word if len(word) < 18 else nan

def plot_most_characteristic_words(messages, title):
    """Queue a word cloud of the words that are characteristic of ``messages``.

    A word's weight is its frequency within ``messages`` divided by its frequency
    in the whole dataset. Only words with at least 6 occurrences here and an entry
    in the global frequency table are considered, and the 80 highest-weighted
    ones are drawn.

    messages -- DataFrame with a 'text' column
    title    -- title stored with the queued image
    """
    counts = get_word_counts(messages)

    # frequency is relative to all words of these messages, computed before filtering
    counts['frequency'] = counts['count'] / counts['count'].sum()
    counts = counts[counts['count'] >= 6]

    # drop all rows (words) that lack a local or a global frequency
    table = pd.concat([counts, word_frequencies], axis=1).dropna(axis=0, how='any')

    table['quotient'] = table['frequency'] / table['global_frequency']
    # overly long words become NaN here and are removed by the dropna that follows
    table['word'] = table.index.to_series().apply(remove_longs)
    table = table.dropna(axis=0, how='any').set_index('word')

    table = table.sort_values(['quotient', 'count'], ascending=[False, False])
    top_words = table[['quotient']].head(80)

    weights = top_words['quotient'].to_dict()
    cloud = WordCloud(height=300, width=400, max_font_size=55)
    wordcloud = cloud.generate_from_frequencies(weights)

    queue.append((wordcloud, title))

In [None]:
for data in important_chats:
    thread, names = data
    # A chat is identified by its thread title together with its participant list.
    # Previously the thread filter was overwritten by the names filter and had no effect.
    chat = all_msgs[(all_msgs['thread'] == thread) & (all_msgs['names'] == names)]

    # Shorten very long thread titles so they fit above a word cloud
    title = thread[:50] + '...' if len(thread) > 50 else thread

    plot_most_characteristic_words(chat, title)

display_queue_as_grid('image')

## Most characteristic words of the most occurring people

In [None]:
# Senders ordered by the number of messages available from them
df = (
    all_msgs[['sender', 'groupchat']]
    .copy()
    .groupby('sender')
    .count()
    .rename(columns={'groupchat': 'messages_available'})
    .sort_values('messages_available', ascending=False)
)
# Only people with at least 500 messages are considered
most_occurring_people = df[df['messages_available'] >= 500].index.tolist()

# One word cloud for each of the (at most) 20 most active of them
for person in most_occurring_people[:20]:
    messages = all_msgs[all_msgs['sender'] == person]
    plot_most_characteristic_words(messages, person)

display_queue_as_grid('image')