In [None]:
import pandas as pd
import re
import seaborn as sns
import operator

from datetime import datetime, date, time
from dateutil.parser import parse
from emoji import UNICODE_EMOJI

%matplotlib inline

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Apply seaborn's default theme so that plots drawn through pandas use the seaborn style.
sns.set()

# Small WhatsApp data analysis notebook


This notebook provides the code and functions you can use to produce visualisations and statistics from an exported WhatsApp discussion. The tools developed here work on one-to-one discussions as well as on group discussions. 

## HOW TO USE

Before starting anything, you will need to export your WhatsApp discussion to a .txt file.
To do so, open the WhatsApp discussion of your choice, tap on the name of the group/person to access the metadata menu, and scroll all the way to the bottom, where you should see an "export discussion" button. Tap on it, be careful to choose __NOT__ to include media files, then wait. It will produce a .txt file containing the whole discussion. Transfer this file to your computer, then move it into the folder containing this notebook.

Then make sure to run ALL the cells of this notebook. Take the time to read each text cell, and finally go to the last cell, in which you will type the few commands needed to load the data. Once this is done, add as many cells as you want below the last one and call the functions you want to use. You are not obliged to read the code, but please take the time to read the comments at the beginning of each function so that you understand how to use it and what purpose it serves.

## Preprocessing functions

Functions used to preprocess the raw export into a format on which statistics are easier to compute. Please don't touch anything.

In [None]:
# Which export flavour to parse. True selects the bracketed
# '[DD.MM.YY HH:MM:SS] ' layout, False selects the 'DD.MM.YY à HH:MM - ' layout
# (presumably the Android export with a French locale - confirm against your file).
ios = False

# For the selected flavour:
#   begin_msg_appareance : template of the timestamp prefix (only its length is used)
#   begin_msg_reg        : regex recognising that prefix at the start of a line
#   regex                : captures (date, sender, message) of a regular message
#   alt_reg              : fallback capturing (date, remainder) for lines without '<sender>: '
#   date_format          : strptime format of the captured date
if ios:
    begin_msg_appareance = '[DD.MM.YY HH:MM:SS] '
    begin_msg_reg = r'\[[0-9][0-9].[0-9][0-9].[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]\] '
    regex = r'\[(\w.*)\] (\w.*?): (.*)'
    alt_reg = r'\[(\w.*)\] (.*)?'
    date_format = '%d.%m.%y %H:%M:%S'
else:
    begin_msg_appareance = 'DD.MM.YY à HH:MM - '
    begin_msg_reg = r'[0-9][0-9].[0-9][0-9].[0-9][0-9] à [0-9][0-9]:[0-9][0-9] \- '
    regex = r'([\w\.]* à [\w\:]*) \- ([\w\+\ ]*): (.*)'
    alt_reg = r'(\w.*) \- (.*)?'
    date_format = '%d.%m.%y à %H:%M'

In [None]:


def flatten_text(lines):
    '''
    Merge the physical lines of an export so that each item is one message.

    A line whose first len(begin_msg_appareance) characters match begin_msg_reg
    opens a new message. Any other line is treated as the continuation of the
    previous message and is appended to it after a single space. Continuation
    lines met before the first message are discarded.

    param : lines => iterable of raw text lines
    return : list of strings, one per message
    '''
    messages = []
    for raw_line in lines:
        # U+200E (left-to-right mark) is removed before the prefix is tested.
        cleaned = raw_line.replace('\u200e', '')
        header = cleaned[:len(begin_msg_appareance)]
        starts_message = re.match(begin_msg_reg, header) is not None
        if starts_message:
            messages.append(cleaned)
        elif messages:
            messages[-1] = messages[-1].strip() + " " + cleaned
    return messages
        

def extract_infos(text):
    '''
    Split one flattened message into [date, sender, message].

    The message is first matched against `regex` (regular message written by
    a contact). If that fails it is matched against `alt_reg`, which covers
    lines without a "<name>: <msg>" part (e.g. group system messages); those
    are attributed to the pseudo-sender 'System' with an empty message.

    param : text => one message, as produced by flatten_text
    return : always a list of exactly three strings. ['', '', ''] is returned
             for an empty input or for a line that matches neither pattern
             (the offending line is printed in that case).
    '''
    if not text:
        return ['', '', '']

    match = re.match(regex, text)
    if match is not None:
        # Keep every group, even empty ones, so the three columns stay aligned
        # when the sender or the message body is empty.
        return [group or '' for group in match.groups()]

    # No "<name>: <msg>" structure: only the date is recovered.
    system_match = re.match(alt_reg, text)
    if system_match is not None:
        return [system_match.group(1), 'System', '']

    print('String causing the error : %s'%(text))
    return ['', '', '']
            
            
def wa_data_proc(filename):
    '''Parse and clean the whatsapp data (without media files)
       into a structured pandas dataframe.

        param : filename => relative file path of the
                .txt file containing the messages log
        return : dataframe with the columns 'date' (datetime), 'from', 'msg',
                 'Year', 'Month' and 'Day', one row per message.

       Dates are parsed with the module-level `date_format`, i.e. the format
       that belongs to the export flavour selected in the configuration cell.
       Lines for which no date could be extracted are skipped (extract_infos
       has already printed them).
    '''
    # 'utf-8-sig' decodes UTF-8 with or without a leading byte-order mark and
    # does not depend on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        raw_lines = f.readlines()

    rows = []
    for message in flatten_text(raw_lines):
        infos = extract_infos(message)
        if not infos or not infos[0]:
            continue
        # Pad short results so every row has exactly date / from / msg.
        rows.append((list(infos) + ['', ''])[:3])

    mainDf = pd.DataFrame(rows, columns=['date', 'from', 'msg'])
    parsed_dates = [datetime.strptime(t, date_format) for t in mainDf['date']]
    mainDf['date'] = pd.to_datetime(parsed_dates)
    mainDf['Year'] = mainDf['date'].dt.year
    mainDf['Month'] = mainDf['date'].dt.month
    mainDf['Day'] = mainDf['date'].dt.day
    return mainDf
    
def change_name(data, old_name, new_name):
    '''Rename one contact of the conversation.

       Every value of the 'from' column equal to old_name is replaced by
       new_name. The dataframe is modified in place and also returned.
    '''
    def _renamed(name):
        if name == old_name:
            return new_name
        return name

    data['from'] = data['from'].apply(_renamed)
    return data



### Plotting functions

Functions which visualise things like the distribution of messages over time, or the most talkative person in a discussion. All of these functions take the dataframe of the discussion as an argument (the *data* argument you can see in each function's definition). 


In [None]:
# Month labels used on the x axis of the plots (index 0 is January).
months = ['Jan', 'Fev', 'Mar', 'Avr', 'Mai', 'Jun', 'Jul', 'Aou', 'Sep', 'Oct', 'Nov', 'Dec']

def plot_msg_in_years(data, year_begin=None, year_end=None, years_cutout=[], plot_kind='line'):
    '''
    Plot the number of messages per month, one curve per year, from year
    "year_begin" to year "year_end" included.

    ______
    Params
    ______
    data: pandas dataframe correctly parsed from a whatsapp .txt discussion export
          (needs the 'date', 'Year' and 'Month' columns).

    year_begin: (int) The first year of the plot. Defaults to the earliest
                year of the discussion if no input is given.

    year_end: (int) The last year to plot. Defaults to the most recent year of the
              discussion if no input is given.

    years_cutout: (list(int)) Years at which a new plot is started. With cutouts
                  2012 and 2017, a first plot covers year_begin to 2011, a second
                  one 2012 to 2016 and a last one 2017 to year_end. With no cutout
                  a single plot containing all the years is produced. The list is
                  only read, never modified.

    plot_kind: Passed as is to the "kind" parameter of the pandas "plot" function.
               Defaults to 'line'.

    ______
    Raises
    ______
    ValueError: if a cutout year lies outside [year_begin, year_end].
    '''
    if year_begin is None:
        year_begin = data['Year'].min()
    if year_end is None:
        year_end = data['Year'].max()

    boundaries = sorted(years_cutout)
    for y in boundaries:
        if y > year_end or y < year_begin:
            raise ValueError('The cutout years were not all contained between the starting and ending year.')

    # Count on the 'date' column only: helper columns added to the frame by other
    # functions would otherwise each contribute their own set of month rows.
    counts = data.groupby(by=['Year', 'Month'])['date'].count().unstack(level='Year')
    # Always expose the 12 months, so the x ticks and their labels line up even
    # when some month never occurs in the data (such cells are NaN).
    counts = counts.reindex(index=pd.Index(range(1, len(months) + 1), name='Month'))

    # Each segment is the half-open year range [y1, y2).
    for y1, y2 in zip([year_begin] + boundaries, boundaries + [year_end + 1]):
        if y1 >= y2:
            # Empty segment (cutout equal to year_begin, or a repeated cutout).
            continue
        plot_title = 'Message count by month '
        plot_title += 'in %d'%(y1) if y2 - y1 == 1 else 'from %d to %d'%(y1, y2-1)
        # reindex (rather than column selection) tolerates years without any message.
        segment = counts.reindex(columns=range(y1, y2))
        segment.plot(title=plot_title, grid=True, figsize=(15,8),
                     xticks=range(1,13), kind=plot_kind).set_xticklabels(months)

def get_most_msg_in_day(data):
    '''
    Find the day during which the most messages were exchanged, all senders
    taken together. The day is printed and also returned as a dict with the
    keys 'year', 'month' and 'day'.
    '''
    per_day = data.groupby(by=['Year', 'Month', 'Day']).agg('count')
    # idxmax yields, per column, the (Year, Month, Day) label of the largest count.
    busiest = per_day.idxmax()['date']
    print('Day with the most messages sent : %s'%str(busiest))
    year, month, day = busiest[0], busiest[1], busiest[2]
    return {'year': year, 'month': month, 'day': day}
    
    
def plot_msg_in_month(data, year, month, plot_kind='line'):
    '''
    Plot the number of messages per day for the month given as argument.
    Both the year and the month need to be int.
    Useful for a more fine-grained analysis of the message distribution
    than the one provided by plot_msg_in_years.

    Only days on which at least one message was sent appear in the plot.

    Raises ValueError if the year is outside the years present in the data
    or if the month is not between 1 and 12.
    '''
    if year > data['Year'].max() or year < data['Year'].min():
        raise ValueError('please provide a year argument for which there was some discussion')
    if month > 12 or month < 1:
        raise ValueError('please provide a month argument that is contained between 1 and 12')

    title = "Message number distribution per day on the Month of %s %d"%(months[month-1], year)
    in_month = data[(data.Year == year) & (data.Month == month)]
    # Count a single column: counting every column would draw one identical
    # series per column (several bars per day with plot_kind='bar').
    per_day = in_month.groupby('Day')[['date']].count()
    per_day.plot(kind=plot_kind, figsize=(15,8), legend=False, title=title)
    
def plot_str_occurence(data, string, year_begin=None, year_end=None, years_cutout=[], plot_kind='line'):
    '''
    Plot, in the same fashion as plot_msg_in_years, the distribution over time
    of the messages that contain the given string (case-insensitive).

    The number of occurrences per message is stored in a 'str_occ_count' column
    added to data. The remaining arguments are forwarded unchanged to
    plot_msg_in_years.
    '''
    needle = string.lower()

    def _occurrences(msg):
        return msg.lower().count(needle)

    data['str_occ_count'] = data.msg.apply(_occurrences)
    print('The following plot is about the distribution of the following string\'s occurence: %s'%(string))
    matching = data[data.str_occ_count > 0]
    plot_msg_in_years(matching, year_begin, year_end, years_cutout, plot_kind)


def plot_most_talking_person(data, plot_kind='bar', order_by_number_of_messages=False, plot_by_year=False):
    '''
    Plot the number of messages sent by each member of a discussion.

    plot_kind: forwarded to the "kind" parameter of the pandas "plot" function.
    order_by_number_of_messages: sort the members by ascending total message count.
    plot_by_year: stack one segment per year for each member (and show a legend)
                  instead of a single total.
    '''
    total_col = 'msg'

    if plot_by_year:
        # Rows: members, columns: years, cells: message count (0 when absent).
        counts = data.groupby(by=['from', 'Year'])[total_col].count().unstack(fill_value=0)
        # Temporary total column, only needed as the sort key.
        counts[total_col] = counts.sum(axis=1)
    else:
        counts = data.groupby(by=['from'])[[total_col]].count()

    if order_by_number_of_messages:
        counts = counts.sort_values(by=total_col)

    if plot_by_year:
        counts = counts.drop([total_col], axis=1)

    counts.plot(kind=plot_kind, legend=plot_by_year, title='Number of msg by people in the discussion', figsize=(15,8), stacked=True)
    
def plot_nb_word_by_person(data, plot_kind='bar', order_by_number_of_messages=False,  plot_by_year=False):
    '''
    Plot the number of words written by each member of a discussion.

    A word is a whitespace-separated token of the message. The per-message word
    count is stored in an 'nb_word' column added to data.

    plot_kind: forwarded to the "kind" parameter of the pandas "plot" function.
    order_by_number_of_messages: sort the members by ascending total word count.
    plot_by_year: stack one segment per year for each member (and show a legend)
                  instead of a single total.
    '''
    word_col = "nb_word"

    def _word_count(message):
        return len(message.split())

    data[word_col] = data["msg"].apply(_word_count)

    if plot_by_year:
        # Rows: members, columns: years, cells: summed word count (0 when absent).
        totals = data.groupby(by=['from', 'Year'])[word_col].sum().unstack(fill_value=0)
        # Temporary total column, only needed as the sort key.
        totals[word_col] = totals.sum(axis=1)
    else:
        totals = data.groupby(by=['from'])[[word_col]].sum()

    if order_by_number_of_messages:
        totals = totals.sort_values(by=word_col)

    if plot_by_year:
        totals = totals.drop([word_col], axis=1)

    totals.plot(kind=plot_kind, legend=plot_by_year, title='Number of words by people in the discussion', figsize=(15,8),stacked=True)
    



### Emoji statistics functions



In [None]:
def count_emoji(text):
    '''
    Count the characters of text that are keys of UNICODE_EMOJI.
    The text is scanned one character at a time.
    '''
    # NOTE(review): assumes UNICODE_EMOJI is a flat mapping keyed by emoji
    # character - confirm against the installed version of the emoji package.
    return sum(1 for char in text if char in UNICODE_EMOJI)

def plot_emoji_per_msg_ratio(data):
    '''
    Bar-plot, per person, the average number of emoji per message
    (total emoji sent divided by number of messages sent). People who never
    used an emoji are left out. The per-message emoji count is stored in an
    'emoji_count' column added to data.
    '''
    data['emoji_count'] = data.msg.apply(count_emoji)

    without_dates = data.drop(columns=['date','Year', 'Month', 'Day'])
    per_sender = without_dates.groupby('from').agg({'msg':'count', 'emoji_count': 'sum'})
    per_sender = per_sender.reset_index().rename(columns={'msg':'msg_count'})

    def _ratio(row):
        return row['emoji_count']/row['msg_count']

    per_sender['ratio_emoji_per_msg'] = per_sender.apply(_ratio, axis=1)

    title = 'Average Ratio of emoji in messages'
    emoji_users = per_sender[per_sender['ratio_emoji_per_msg'] > 0].set_index('from')
    emoji_users.drop(columns=['msg_count', 'emoji_count']).plot(kind='bar', title=title)


def plot_msg_with_emoji_ratio(data):
    '''
    Bar-plot, per person, the share of their messages that contain at least
    one emoji. People who never used an emoji are left out. The 'emoji_count'
    column written to data holds 1 for a message with emoji and 0 otherwise.
    '''
    def _has_emoji(text):
        if count_emoji(text) > 0:
            return 1
        return 0

    data['emoji_count'] = data.msg.apply(_has_emoji)

    without_dates = data.drop(columns=['date','Year', 'Month', 'Day'])
    per_sender = without_dates.groupby('from').agg({'msg':'count', 'emoji_count': 'sum'})
    per_sender = per_sender.reset_index().rename(columns={'msg':'msg_count'})

    title = 'Ratio of message containing emoji'

    def _ratio(row):
        return row['emoji_count']/row['msg_count']

    per_sender['msg_with_emoji_ratio'] = per_sender.apply(_ratio, axis=1)

    emoji_users = per_sender[per_sender['msg_with_emoji_ratio'] > 0].set_index('from')
    emoji_users.drop(columns=['msg_count', 'emoji_count']).plot(kind='bar', title=title)

def emoji_and_numb(data):
    '''
    Build a dataframe with one row per person of the discussion, giving the
    number of messages sent ('msg_count') and the total number of emoji used
    ('emoji_count'). The per-message emoji count is stored in an 'emoji_count'
    column added to data.
    '''
    data['emoji_count'] = data.msg.apply(count_emoji)

    without_dates = data.drop(columns=['date','Year', 'Month', 'Day'])
    per_sender = without_dates.groupby('from').agg({'msg':'count', 'emoji_count': 'sum'})
    per_sender = per_sender.reset_index()
    return per_sender.rename(columns={'msg':'msg_count'})

def generate_empty_emoji_dict():
    # Helper of top_n_emojis_used: a fresh dict with one zeroed counter
    # per key of UNICODE_EMOJI.
    return dict.fromkeys(UNICODE_EMOJI.keys(), 0)

def top_n_emojis_used(data, n):
    '''
    Print, for each person of a discussion, the ranking of their n most used
    emoji. People who never used an emoji are skipped, and a person who used
    fewer than n distinct emoji gets a shorter ranking (emoji with a count of
    zero are never listed).
    ______
    Params
    ______
    data : pandas dataframe correctly parsed from a whatsapp .txt discussion export
           (needs the 'from' and 'msg' columns).
    n : the maximum size of the ranking.
    '''
    # user -> {emoji character: number of uses}, users kept in order of first message.
    usage_by_user = {}
    for user, msg in zip(data['from'], data['msg']):
        counts = usage_by_user.setdefault(user, {})
        for char in msg:
            if char in UNICODE_EMOJI:
                counts[char] = counts.get(char, 0) + 1

    for user, counts in usage_by_user.items():
        # Stable sort: emoji with equal counts stay in order of first use.
        by_use = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
        ranking = by_use[:max(n, 0)]
        if not ranking:
            continue
        print('Top %d emoji of %s:'%(len(ranking), user))
        for rank, (emoji_char, count) in enumerate(ranking, start=1):
            print('\t%d: %s, count = %d'%(rank, emoji_char, count))
        print('')
            print('')

## Your turn

Now simply replace the empty string on the right-hand side of the first assignment below with the filename of your discussion .txt export, then run the cell, and there you go sunshine, the data is loaded.

In [None]:
discussion_filename = '' # Put the file name (or relative path) of your .txt export between the two quotes, then run the cell.
# Parse the export into the dataframe expected as the "data" argument of the functions above.
my_dataframe = wa_data_proc(discussion_filename)

In [None]:
# Now you can call any of the functions from the previous cells here, passing "my_dataframe"
# as the argument corresponding to "data". Try it!

plot_msg_in_years(my_dataframe)