## Python statistics essential training - 03_08_email

Standard imports

In [1]:
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
import matplotlib
import matplotlib.pyplot as pp

import pandas.plotting

from IPython import display
from ipywidgets import interact, widgets

%matplotlib inline

In [3]:
# You can download all of your Google email from Google Takeout: https://takeout.google.com/settings/takeout
# The download is a static export of the mailbox, which we convert to a CSV file below.
import re
import mailbox
import csv

### How I converted my mailbox.

In [4]:
# create=False: raise mailbox.NoSuchMailboxError right away if 'Sent.mbox' is
# missing, instead of silently creating (and then reading) an empty mailbox.
mbox = mailbox.mbox('Sent.mbox', create=False)

The resulting object is array-like, with one entry per message. Each entry is dictionary-like, with keys corresponding to the metadata and data of each message.

In [5]:
mbox[0].keys()

KeyError: 'No message with key: 0'

The easiest way to get these data into Pandas is to build a CSV file from them. We use the module `csv` to write out the CSV file as we loop over the mailbox object. We save only subject, from, to, and date, and we write a simple header at the top with the names of columns.

In [None]:
# newline='' lets the csv module control line endings itself; without it,
# blank rows can appear between records on Windows.
# encoding='utf-8' makes the output independent of the platform's default
# encoding and matches what pd.read_csv assumes by default.
with open('mbox.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['subject','from','to','date'])  # header row with the column names

    # One CSV row per message, keeping only the four headers of interest.
    for message in mbox:
        writer.writerow([message['subject'], message['from'], message['to'], message['date']])

All done! Thanks to Justin Ellis for inspiration with https://jellis18.github.io/post/2018-01-17-mail-analysis.

## Moving on!

In [None]:
messages = pd.read_csv('mbox-anonymized.csv')

In [None]:
messages.info()

In [None]:
messages.head()

In [None]:
messages['from'][0]

In [None]:
re.search('<(.+)>',messages['from'][0])

In [None]:
re.search('<(.+)>',messages['from'][0]).group(0)

In [None]:
re.search('<(.+)>',messages['from'][0]).group(1)

In [None]:
re.search('<(.+)>','Michele.Vallisneri@jpl.nasa.gov').group(1)

In [None]:
def clean_address(raw):
    """Extract the bare e-mail address from a header value.

    A value of the form ``Name <user@host>`` is reduced to the text between
    the angle brackets; a string without angle brackets is returned unchanged.

    Non-string values (for example the NaN that pandas uses for a missing
    field, or None) are returned unchanged instead of raising ``TypeError``,
    so the function can be applied directly to a column with missing entries.
    """
    if not isinstance(raw, str):
        return raw

    # Greedy match: captures everything from the first '<' to the last '>'.
    match = re.search('<(.+)>', raw)

    return raw if match is None else match.group(1)

In [None]:
clean_address(messages['from'][0])

In [None]:
messages['from'] = messages['from'].apply(clean_address)

In [None]:
%debug

In [None]:
# Strip the display names from both address columns. dropna() keeps missing
# entries away from clean_address; assigning the result back aligns on the
# index, so the rows that were dropped stay missing.
for column in ('from', 'to'):
    messages[column] = messages[column].dropna().apply(clean_address)

In [None]:
messages.head()

In [None]:
messages['date'][0]

In [None]:
# utc=True interprets a naive date string as UTC and converts a string that
# carries its own offset to UTC; tz_localize('UTC') would raise on the latter.
pd.to_datetime(messages['date'][0], utc=True).tz_convert('America/Los_Angeles')

In [None]:
# Parse every date string into a UTC timestamp, then express it in Pacific time.
# utc=True treats naive strings as UTC (same result as tz_localize('UTC')) and
# converts strings that carry their own offset, which tz_localize would reject.
# It also accepts already-converted timestamps, so the cell can be re-run.
messages['date'] = messages['date'].apply(
    lambda s: pd.to_datetime(s, utc=True).tz_convert('America/Los_Angeles'))

In [None]:
messages.date.head()

In [None]:
messages.date.min(), messages.date.max()

In [None]:
# dt.day_name() replaces the dt.weekday_name attribute, which was removed in pandas 1.0.
messages.date.dt.day_name().head()

In [None]:
# Weekday name as an ordered categorical, so sorting and plotting follow
# calendar order (Monday..Sunday) rather than alphabetical order.
# dt.day_name() replaces the dt.weekday_name attribute removed in pandas 1.0.
messages['dayofweek'] = pd.Categorical(messages['date'].dt.day_name(),
                                       ordered=True,
                                       categories=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])

In [None]:
# Time of day as decimal hours (e.g. 14:30 -> 14.5); seconds are ignored.
messages['timeofday'] = messages['date'].dt.minute / 60 + messages['date'].dt.hour

In [None]:
# Decimal year: the calendar year plus the fraction of that year already elapsed.
# dayofyear is 1-based, so subtract 1 to make January 1st 00:00 map to the
# year itself; dividing by the actual length of the year (365 or 366 days)
# keeps late-December values from spilling over into the following year.
messages['nyear'] = messages['date'].dt.year + (
    (messages['date'].dt.dayofyear - 1 + messages['timeofday'] / 24)
    / (365 + messages['date'].dt.is_leap_year)
)

In [None]:
messages.plot.scatter('nyear','timeofday',s=2)

In [None]:
messages.nyear.hist()

In [None]:
messages.timeofday.hist()

In [None]:
messages.dayofweek.value_counts()

In [None]:
# sort=False: do not reorder the counts by frequency before plotting.
counts = messages['dayofweek'].value_counts(sort=False)
counts.plot.bar()