### Modules

Make sure to download all modules first using 'conda/pip install {module_name}' in Anaconda.

In [1]:
import os
import re
import json
import nltk
import pytz
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Helper functions

In [35]:
def json_to_df(path):
    """
    Load and combine all JSON files in the inbox folder into one Pandas dataframe.
    Assumes that all JSON files in the inbox folder have similar formats.
    
    Parameters
    ---------------
    path (str): Path of inbox folder containing Facebook messages
    
    Output
    ---------------
    Pandas dataframe 
    """
    ## Initialize empty lists and tables
    folder_lst = []
    json_lst = []
    message_tbl = pd.DataFrame()
    
    ## Get all JSON files in each folder
    for folder in os.listdir(path):
        for file in os.listdir(f'{path}/{folder}'):
            if file.endswith('.json'):
                json_lst += [f'{path}/{folder}/{file}']
            
    ## Populate table
    for json_file in json_lst:
        thread = re.search('//(.+?)_', json_file).group(1)
        with open(json_file) as json_data:
             data = json.load(json_data)
        tmp_df = pd.DataFrame(data['messages'])
        tmp_df['thread'] = thread
        message_tbl = message_tbl.append(tmp_df, ignore_index = True)
        
    ## Convert timestamp to datetime
    message_tbl['date'] = pd.to_datetime(message_tbl['timestamp_ms'], unit = 'ms', utc = True)
    message_tbl['date'] = message_tbl['date'].dt.tz_convert('Asia/Singapore')
    message_tbl['year'] = message_tbl['date'].dt.year
    message_tbl = message_tbl[message_tbl['year'] == 2021] ## Only get 2021 messages. Remove if desired.
    message_tbl['hour'] = message_tbl['date'].dt.hour 
    message_tbl['date'] = message_tbl['date'].dt.date ## Get date only (from datetime)
    message_tbl = message_tbl[['thread','sender_name','content','date','hour']]
            
    return message_tbl

In [4]:
def messages_to_words(df, stopword_lst, min_freq):
    """
    Cleans and splits Facebook messages into individual words.
    
    Parameters
    ---------------
    df (Pandas dataframe) : Dataframe of all messages (output of json_to_df)
    stopword_lst (list)   : List of all stopwords (words to remove)
    min_freq (int)        : Minimum frequency of words to include in analysis
    
    Output
    ---------------
    Pandas dataframe
    """
    
    ## Convert messages to lowercase
    df['content'] = df['content'].astype(str).str.lower() 

    ## Remove all punctuations
    df['content'] = df['content'].str.replace(r'[^\w\s]+', '')

    ## Tokenize messages (into words)
    df['content_token'] = df['content'].apply(word_tokenize) 

    ## Remove stopwords
    df['content_token'] = df['content_token'].apply(lambda x: [item for item in x if item not in stopword_lst])
    
    ## Split phrases into separate words, and remove infrequent words
    df = df.explode('content_token')

    count_tbl = pd.DataFrame(df['content_token'].value_counts())
    count_tbl.reset_index(inplace = True)
    count_tbl.columns = ['content_token', 'freq']

    df = df.merge(count_tbl, on = 'content_token')
    df = df[df['freq'] >= min_freq] 
    
    return df

### Data transformation

In [111]:
inbox_dir = 'messages/inbox/' ## Change to file directory of Messenger inbox

#################################
## Compile all messages 
#################################
message_tbl = json_to_df(inbox_dir)

#################################
## Add stopwords
#################################
filipino_stopwords = pd.read_csv('stopwords_tl.csv')
filipino_stopwords = filipino_stopwords['word'].to_list()
all_stopwords = nltk.corpus.stopwords.words('english')

## Append Filipino stopwords
all_stopwords.extend(filipino_stopwords) 

## Add other stopwords
other_stopwords = ['okay', 'sige', 'ung', 'din', 'rin', 'nan',
                  'haha', 'hahaha', 'hahahaha', 'hahahahaha', 'hahahahahaha', 'oh', 'u', '1', '2', '3',
                  'poll','reacted', 'ð', 'ok', 'po'] ## Add stopwords as desired
all_stopwords.extend(other_stopwords)

#################################
## Data transformation
#################################
message_tbl = messages_to_words(message_tbl, all_stopwords, 5)

  df['content'] = df['content'].str.replace(r'[^\w\s]+', '')


### Data visualization

In [112]:
## Change to your Facebook name
my_name = 'test' 

## Indicate whether message was sent or received
message_tbl['message_status'] = np.where(message_tbl['sender_name'] == my_name, 'Sent', 'Received') 

#### Number of messages sent and received (Overall, per day)

In [None]:
#################################
## Count messages per date
#################################
date_tbl = pd.DataFrame(message_tbl.drop_duplicates(['content']).groupby('message_status')['date'].value_counts())
date_tbl = date_tbl.rename(columns = {'date':'count'}).reset_index()
date_tbl = date_tbl.sort_values(['date'])

#################################
## Data visualization
#################################
sns.set(font_scale = 1.2) 
sns.set_style("whitegrid")
plot = sns.FacetGrid(date_tbl, row = 'message_status', hue = 'message_status', height = 5, aspect = 3)
plot.map(plt.plot, 'date', 'count') 
plot.map(plt.fill_between, 'date', 'count', alpha= 0.4)
plot.set(xlabel = 'Date', ylabel = 'Number of messages')
plot.fig.subplots_adjust(top=0.89)
plt.subplots_adjust(hspace = 0.2)
plot.fig.suptitle('Number of messages sent and received in 2021')

date_format = mdates.DateFormatter("%m/%d/%y")
plot.axes[0,0].xaxis.set_major_formatter(date_format)
plot.axes[0,0].xaxis.grid(False)
plot.axes[1,0].xaxis.grid(False)

plt.show()

#### Number of messages sent and received (Overall, per hour)

In [None]:
#################################
## Count messages per hour
#################################
hour_tbl = pd.DataFrame(message_tbl.drop_duplicates(['content']).groupby('message_status')['hour'].value_counts())
hour_tbl = hour_tbl.rename(columns = {'hour':'count'}).reset_index()
hour_tbl = hour_tbl.sort_values(['hour'])
hour_tbl = hour_tbl.pivot(index = 'hour', columns = 'message_status', values = 'count')

#################################
## Data visualization
#################################

sns.set(font_scale = 1.2, 
        rc={'figure.figsize':(11,8)})
sns.set_style("whitegrid")
plot = hour_tbl.plot(kind = 'bar', stacked = 'True')
plot.xaxis.grid(False)
plot.set(xlabel = 'Hour', ylabel = 'Messages received', 
         title = 'Number of messages received and sent per hour')
plt.show()


#### Top n words (Overall)

In [None]:
n = 10

#################################
## Get top words
#################################
word_tbl = pd.DataFrame(message_tbl.groupby('message_status')['content_token'].value_counts())
top_n_words = word_tbl.rename(columns = {'content_token':'count'}).reset_index().groupby('message_status').head(n)

#################################
## Data visualization
#################################
sns.set(font_scale = 1.2) 
sns.set_style("whitegrid")
plot = sns.FacetGrid(top_n_words, col = 'message_status', hue = 'message_status', sharex = False, height = 5, aspect = 1.5)
plot.map_dataframe(sns.barplot, x = 'content_token', y = 'count') 
plot.fig.subplots_adjust(top=0.85)
plot.fig.suptitle(f'Top {n} words used in messages')
plt.show()

#### Top n people most talked to

In [None]:
n = 5

################################
## Get top people
#################################
user_tbl = pd.DataFrame(message_tbl[message_tbl['sender_name'] != my_name].drop_duplicates(['content']).value_counts('sender_name'))
user_tbl = user_tbl.rename(columns = {0:'count'}).reset_index().head(n)

################################
## Data visualization
#################################

sns.set(font_scale = 1.2, rc={'figure.figsize':(12,9)}) 
sns.set_style("whitegrid")
sns.color_palette("husl", 8)
plot = sns.barplot(x = 'count', y = 'sender_name', data = user_tbl)
plot.set(xlabel = 'Number of messages received', ylabel = 'Sender name',
        title = f'Top {n} people talked to in 2021')
plot.bar_label(plot.containers[0], padding = 12)
plt.show()

#### Number of messages received (Specific users, per day)

In [None]:
thread_lst = ['adriancortes'] ## Names of threads to include

#################################
## Count messages per date
#################################
date_tbl_per_thread = message_tbl[message_tbl['thread'].isin(thread_lst)]
date_tbl_per_thread = pd.DataFrame(date_tbl_per_thread.drop_duplicates(['content']).groupby('sender_name')['date'].value_counts())
date_tbl_per_thread = date_tbl_per_thread.rename(columns = {'date':'count'}).reset_index().sort_values(['date'])

#################################
## Data visualization
#################################
sns.set(font_scale = 1.2, 
        rc={'figure.figsize':(11,8)})
sns.set_style("whitegrid")
plot = sns.FacetGrid(date_tbl_per_thread, row = 'sender_name', hue = 'sender_name', height = 5, aspect = 3)
plot = plot.map(plt.plot, 'date', 'count') 
plot = plot.map(plt.fill_between, 'date', 'count', alpha= 0.4)

date_format = mdates.DateFormatter("%m/%d/%y")
plot.axes[0,0].xaxis.set_major_formatter(date_format)
plot.axes[0,0].xaxis.grid(False)
plot.axes[1,0].xaxis.grid(False)

plt.show()

#### Number of messages received (Specific users, per hour)

In [None]:
thread_lst = [''] ## Names of threads to include

#################################
## Count messages per date
#################################
hour_tbl_per_thread = message_tbl[message_tbl['thread'].isin(thread_lst)]
hour_tbl_per_thread = pd.DataFrame(hour_tbl_per_thread.drop_duplicates(['content']).groupby('sender_name')['hour'].value_counts())
hour_tbl_per_thread = hour_tbl_per_thread.rename(columns = {'hour':'count'}).reset_index().sort_values(['hour'])
hour_tbl_per_thread = hour_tbl_per_thread.pivot(index = 'hour', columns = 'sender_name', values = 'count')

#################################
## Data visualization
#################################
sns.set(font_scale = 1.2, 
        rc={'figure.figsize':(11,8)})
sns.set_style("whitegrid")
plot = hour_tbl_per_thread.plot(kind = 'bar', stacked = 'True')
plot.xaxis.grid(False)
plot.set(xlabel = 'Hour', ylabel = 'Messages received', 
         title = 'Number of messages sent per hour by users')
plt.show()

#### Top n words (Specific users)

In [None]:
thread_lst = [''] ## Check thread name in message_tbl
n = 10 

#################################
## Get top words
#################################
thread_tbl = pd.DataFrame(message_tbl[message_tbl['thread'].isin(thread_lst)].groupby('sender_name')['content_token'].value_counts())
top_n_words_per_thread = thread_tbl.groupby('sender_name').head(n)
top_n_words_per_thread = top_n_words_per_thread.rename(columns = {'content_token':'count'}).reset_index()

#################################
## Data visualization
#################################
sns.set(font_scale = 1.2) 
sns.set_style("whitegrid")
plot = sns.FacetGrid(top_n_words_per_thread, row = 'sender_name', hue = 'sender_name', sharex = False, height = 5, aspect = 1.5)
plot.map_dataframe(sns.barplot, x = 'content_token', y = 'count') 
plot.fig.subplots_adjust(top=0.93)
plot = plot.fig.suptitle(f'Top {n} words used by each user')
plt.show()