In [16]:
import os, inspect
import pandas as pd
import numpy as np
import datetime as dt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.io as pio

# Plotly credentials are taken from the environment so that no secret is
# stored in the notebook.  PLOTLY_API_KEY must be set before running (a
# KeyError is raised otherwise); PLOTLY_USERNAME is optional and falls back
# to the account name this notebook has always used.
# NOTE(review): the API key that used to be hardcoded here should be rotated.
plotly.tools.set_credentials_file(
    username=os.environ.get("PLOTLY_USERNAME", "aidinraad"),
    api_key=os.environ["PLOTLY_API_KEY"],
)

In [2]:
# How many of the most prolific senders are plotted.
topn = 5
# Directory of the code being executed and its parent; the data paths are
# built from the parent.
# NOTE(review): inside a notebook kernel the frame has no real source file, so
# this presumably resolves relative to the working directory -- confirm.
CURR_DIR = os.path.split(inspect.getabsfile(inspect.currentframe()))[0]
ROOT_DIR = os.path.split(CURR_DIR)[0]

### Load senders/recipients data

In [3]:
# Folder that holds the extracted Enron CSV files.
path = os.path.join(ROOT_DIR, "enron", "data", "ext")

# Recipients: parse the timestamps and expose the name column as "person".
ffname = os.path.join(path, "enron-recipients.csv")
df_recipients = pd.read_csv(ffname, parse_dates=["datetime"]).rename(
    columns={"recipient": "person"}
)

# Senders: same treatment, so both frames share the "person" column name.
ffname = os.path.join(path, "enron-senders.csv")
df_senders = pd.read_csv(ffname, parse_dates=["datetime"]).rename(
    columns={"sender": "person"}
)

# Shapes are reported before the derived column is added.
print(df_recipients.shape, df_senders.shape)

# Derive the year-month period of every timestamp as the "date" column.
for frame in (df_senders, df_recipients):
    frame["date"] = frame["datetime"].dt.to_period("M")

(435816, 2) (205661, 2)


In [4]:
# Preview the first five sender rows.
df_senders.head(n=5)

Unnamed: 0,datetime,person,date
0,1998-05-27 17:31:00,christopher behney,1998-05
1,1998-10-30 17:43:00,mark legal taylor,1998-10
2,1998-10-30 17:56:00,mark legal taylor,1998-10
3,1998-10-30 18:02:00,mark legal taylor,1998-10
4,1998-10-30 19:06:00,mark legal taylor,1998-10


In [5]:
# Preview the first five recipient rows.
df_recipients.head(n=5)

Unnamed: 0,datetime,person,date
0,1998-05-27 17:31:00,toni p schulenburg,1998-05
1,1998-05-27 17:31:00,mary hain,1998-05
2,1998-10-30 17:43:00,marc.r.cutler@bankamerica.com,1998-10
3,1998-10-30 17:56:00,marc.r.cutler@bankamerica.com,1998-10
4,1998-10-30 18:02:00,shari stack,1998-10


### Get sender counts data frame

In [6]:
# Number of emails each person sent in each year-month; the result is a
# one-column frame ("sent") indexed by person and date.
sender_keys = [df_senders["person"], df_senders["date"]]
df_sender_counts = (
    df_senders.groupby(sender_keys)["person"]
    .count()
    .to_frame(name="sent")
)
df_sender_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent
person,date,Unnamed: 2_level_1
*misc exchange customer activities\tcustomeractivities@gulfsouthpl.com@enron,2001-11,1
.,2001-11,2
.,2001-12,1
a devries,2000-10,1
aa wayne caa,2002-01,1


### Get recipient counts data frame

In [7]:
# Number of emails each person received in each year-month; the result is a
# one-column frame ("received") indexed by person and date.
recipient_keys = [df_recipients["person"], df_recipients["date"]]
df_recipient_counts = (
    df_recipients.groupby(recipient_keys)["person"]
    .count()
    .to_frame(name="received")
)
df_recipient_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,received
person,date,Unnamed: 2_level_1
- *joan.veselack@enron.com,2000-02,1
- *murex@manado.wasantara.net.id,2000-06,1
"- *porter, jeffrey",2000-02,2
"- *stiles, marianne",2000-02,1
"'andrea.v.reed@enron.com' <andrea.v.reed, 'anne.c.koehler@enron.com'<anne.c.koehler,>",2000-08,1


### Get counts data frame

In [8]:
# Combine sent and received counts per (person, month).  The outer join keeps
# people who only sent or only received in a given month; the missing side is
# NaN and is replaced with 0.  astype(int) turns the float columns produced by
# the NaN fill back into integers in one vectorised step (the element-wise
# DataFrame.applymap it replaces is slow and deprecated in recent pandas).
# Rows are ordered by the number of emails sent, largest first.
df_counts = (
    df_sender_counts
    .join(df_recipient_counts, how="outer")
    .fillna(0)
    .astype(int)
    .sort_values(by="sent", ascending=False)
)
df_counts.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,sent,received
person,date,Unnamed: 2_level_1,Unnamed: 3_level_1
pete davis,2002-01,1508,696
notes,2001-10,955,241
blank,2002-11,850,0
outlook,2001-04,754,6
notes,2001-11,721,120
schedule,2002-01,696,0
pete davis,2001-04,680,2
pete davis,2001-10,603,0
jeff dasovich,2001-10,586,193
jeff dasovich,2001-04,518,11


### Create person-sent data frame

In [9]:
# Total number of emails sent per person over all months, largest first.
# NOTE(review): the final reset_index() keeps the pre-sort row position as an
# "index" column, which is also written to the CSV -- confirm whether any
# consumer relies on it before dropping it.
df_person_sent = (
    df_counts
    .reset_index()
    .groupby("person")[["sent"]]
    .sum()
    .reset_index()
    .sort_values(by="sent", ascending=False)
    .reset_index()
)

# Save the person-sent summary alongside the input data.
ffname = os.path.join(
    ROOT_DIR, "enron", "data", "ext", "enron-person-sent-summary.csv"
)
df_person_sent.to_csv(ffname, index=False)
df_person_sent.head()


Unnamed: 0,index,person,sent
0,15824,jeff dasovich,5232
1,30170,sara shackleton,4591
2,26694,pete davis,3898
3,5909,chris germany,3703
4,25585,notes,3314


### Compute relative contact

In [10]:
# Relative contact = |received - sent| / max(received, sent), per person-month.
count_gap = df_counts["received"].sub(df_counts["sent"]).abs()
larger_count = df_counts[["received", "sent"]].max(axis=1)
df_counts["relcontact"] = count_gap / larger_count
df_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sent,received,relcontact
person,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pete davis,2002-01,1508,696,0.538462
notes,2001-10,955,241,0.747644
blank,2002-11,850,0,1.0
outlook,2001-04,754,6,0.992042
notes,2001-11,721,120,0.833564


### Get the most prolific senders

In [11]:
# Names of the first `topn` people in df_person_sent, which is ordered by
# emails sent (largest first).
topn_senders = df_person_sent["person"].iloc[:topn].tolist()

### Plot trends

In [12]:
# NOTE(review): title, mode_size and line_size are not used by the visible
# cells -- presumably leftovers; confirm before removing.
title = 'Main Source for News'
mode_size = [8, 8, 12, 8]
line_size = [2, 2, 4, 2]

# One line colour per plotted sender (red through to blue).
colors = ["#ff0000", "#b2004c", "#8c0073", "#5900a6", "#0000ff"]

# Month axis shared by all traces.  The name shadows the builtin id(); it is
# kept because the trace-building cell refers to it.
# NOTE(review): the axis starts at 1998-08 although the data shown above has
# rows from 1998-05; months outside this range are dropped when the traces are
# reindexed -- confirm this is intended.
id = pd.date_range(start='1998-08', end="2002-12", freq='MS').to_period("M")

In [13]:
# Build one line trace per top sender showing the monthly relative-contact
# ratio.
data = []

# Iterating over the list itself (rather than range(topn)) keeps the loop safe
# when fewer than `topn` senders are available.
for i, sender in enumerate(topn_senders):
    # Rows of this sender, aligned to the shared month axis; months without a
    # row are filled with 0.
    df_i = df_counts.loc[sender, :].reindex(id, fill_value=0)

    trace = go.Scatter(
        x=list(df_i.index.astype(str).values),
        y=df_i.relcontact,
        name=sender.title(),
        # Cycle through the palette so that plotting more senders than there
        # are colours cannot raise an IndexError.
        line=dict(color=colors[i % len(colors)]),
        opacity=0.9,
        mode="lines",
    )

    data.append(trace)

In [14]:
# Figure layout: titles, a date x-axis with quick range buttons (1 month,
# 6 months, all) and a range slider.
layout = dict(
    # The data and the month axis end in 2002-12, so the title states 12.2002.
    title=f"Contact Relative Ratio by top-{topn} senders between '05.1998' and '12.2002'",
    yaxis=dict(title='Contact Relative Ratio'),
    xaxis=dict(
        title="Time",
        rangeselector=dict(
            buttons=[
                dict(count=1, label='1m', step='month', stepmode='backward'),
                dict(count=6, label='6m', step='month', stepmode='backward'),
                dict(step='all'),
            ]
        ),
        rangeslider=dict(visible=True),
        type='date',
    ),
)

# No annotations are drawn; the key is still set explicitly.
annotations = []
layout['annotations'] = annotations

# Assemble the figure and render it inline through the Plotly service.
fig = dict(data=data, layout=layout)
py.iplot(fig, filename="Sent Email Count")

In [20]:

# Write static copies of the figure into the data folder (one file per image
# format), then send it to Plotly; py.plot's return value is the cell output.
ffname = os.path.join(path, "relative-contact-plot")
for extension in (".webp", ".png"):
    pio.write_image(fig, ffname + extension)
py.plot(fig, filename="Sent Email Count")

'https://plot.ly/~aidinraad/6'