In [1]:
import re
import sys
# Used fork of bigbang that doesn't crash when a malformed date appears
# Cloned from https://github.com/bjgfromthe703/bigbang and added to path
sys.path.append('/Users/brendan/data-delving/bigbang-fork')
from bigbang.utils import *
from bigbang.archive import Archive
from bigbang.thread import Thread
import warnings
import pandas as pd
import pickle
from collections import Counter
warnings.filterwarnings('ignore')

Step 1. Create List of Public Archives to Pull Emails From
====================
Here's a list of all the online communities whose emails we will be using in our experiment:

In [2]:
urls = ['http://mail.scipy.org/pipermail/ipython-dev/',
        'http://mail.scipy.org/pipermail/ipython-user/',
        'http://mail.scipy.org/pipermail/scipy-dev/',
        'http://mail.scipy.org/pipermail/scipy-user/',
        'https://lists.centos.org/pipermail/centos/',
        'https://mail.scipy.org/pipermail/numpy-discussion/',
        'http://lists.openstack.org/pipermail/openstack/',
        'https://mail.python.org/pipermail/python-list/',
        'http://lists.ucla.edu/pipermail/religionlaw/',
        'https://pidgin.im/pipermail/support/',
        'https://www.winehq.org/pipermail/wine-users/',
        'https://lists.freebsd.org/pipermail/freebsd-questions/',
        'https://mta.openssl.org/pipermail/openssl-users/',
        'https://mail.haskell.org/pipermail/beginners/',
        'https://mail.haskell.org/pipermail/haskell-cafe/',
        'https://lists.wikimedia.org/pipermail/wikitech-l/',
        'https://lists.blender.org/pipermail/bf-committers/',
        'https://lists.fedoraproject.org/pipermail/devel/',
        'https://lists.dns-oarc.net/pipermail/dns-operations/',
        'http://mailman.nginx.org/pipermail/nginx-devel/',
        'http://mailman.nginx.org/pipermail/nginx/'
        ]

Step 2. Grab Emails & Get Baseline Response Rate
====================
Let's see what the average response rate was for all threads across all archives. BigBang will pull down copies of emails from all the archives listed above, and allow us to see how many participants each thread had. We can take the number of threads with 2 or more participants, divide by the total number of threads, and get the average response rate across all email threads in our sample.

In [3]:
count = 0
replyCount = 0
threads = []  # We will use these threads throughout
try:
    for x in range(len(urls)):
        archive = Archive(urls[x], archive_dir='archives')
        threads += archive.get_threads()
    for thread in threads:
        count += 1
        if thread.get_num_people() > 1:
            replyCount += 1
except Exception as e:
    print 'Error!', e
finally:
    with open('threads.p', 'wb') as outfile:  # Save for later
        pickle.dump(threads, outfile)
    print 'Total threads: ' + str(count)
    print 'Total threads w/ replies: ' + str(replyCount)
    print 'Baseline response rate: ' + '{:.1%}'.format(replyCount * 1.0 / count)

Total threads: 366442
Total threads w/ replies: 173995
Baseline response rate: 47.5%


Around 47.5% of threads got a reply across all the email archives we're using in our experiment. Next up: let's extract different email closings!

Step 3. Use RegEx to Identify Closings
====================
Let's look at the first email of every thread in each archive, and attempt to identify its closing with RegEx.

Our RegEx looks for lines of text that...

1. are no more than three words long.
2. end with a comma, period, or one or more exclamation marks.
3. have a line break preceding and following them.

We will take the last RegEx match (if any) and assume this is the closing. Since we are only looking at the first message of a thread, this should reduce the chance of extracting a closing from quoted text.

(This will yield false positives and miss some closings, but should be a good starting point.)

In [4]:
closingRegex = re.compile(r'\n((?:\w+\s+){0,2}\w+)(!+|,|\.)\n', re.IGNORECASE)

In [5]:
def getClosing(message, regex):
    matches = re.findall(regex, message)
    if matches:
        return matches[-1][0].lower()

Let's go through each online community, see how many threads and participants each have, and see what possible closings, across all communities, appear most often.

In [6]:
closingDict = {}
try:
    for x in range(len(urls)):
        archive = Archive(urls[x], archive_dir='archives')
        myThreads = archive.get_threads()
        print 'Archive ' + urls[x] + ' has ' + str(len(myThreads)) + \
            ' threads; ' + str(len(archive.get_activity())) + ' participants'
    for thread in threads:
        initialMsg = thread.get_content()[0]
        initialMsg = clean_message(initialMsg)
        closing = getClosing(initialMsg, closingRegex)
        if closing is not None:
            if closing in closingDict:
                closingDict[closing] += 1
            else:
                closingDict[closing] = 1
except Exception as e:
    print e
finally:  # if something crashes, we can see the partial results!
    print '-' * 20 + '\nThese were the most frequent (possible) closings:'
    c = Counter(closingDict)
    for k, v in c.most_common(50):
        print '%s: %i' % (k, v)


Archive http://mail.scipy.org/pipermail/ipython-dev/ has 3856 threads; 4942 participants
Archive http://mail.scipy.org/pipermail/ipython-user/ has 4095 threads; 4953 participants
Archive http://mail.scipy.org/pipermail/scipy-dev/ has 6787 threads; 5622 participants
Archive http://mail.scipy.org/pipermail/scipy-user/ has 12552 threads; 5623 participants
Archive https://lists.centos.org/pipermail/centos/ has 51620 threads; 3004 participants
Archive https://mail.scipy.org/pipermail/numpy-discussion/ has 17079 threads; 6131 participants
Archive http://lists.openstack.org/pipermail/openstack/ has 6344 threads; 1196 participants
Archive https://mail.python.org/pipermail/python-list/ has 65824 threads; 6466 participants
Archive http://lists.ucla.edu/pipermail/religionlaw/ has 21760 threads; 4739 participants
Archive https://pidgin.im/pipermail/support/ has 11379 threads; 3332 participants
Archive https://www.winehq.org/pipermail/wine-users/ has 43117 threads; 4073 participants
Archive https:/

OK, most of these are closings! Let's build a new RegEx that searches specifically for these closings, and see how response rates differ based on closing.

In [7]:
newRegex = re.compile(r'\n(thanks|regards|cheers|best regards|'
    r'thanks in advance|thank you|best|kind regards|tia|enjoy|many thanks|'
    r'sincerely|thanks a lot|hth|bye|best wishes|thanks again|hope this helps|'
    r'thx|good luck|appreciated|all the best|thanks and regards|later|take care|'
    r'have fun|please help|yours|ciao|hope that helps|warm regards|with regards)'
    r'(!+|,|\.)\n', re.IGNORECASE)

Step 4. Getting the Average Response Rate for Various Email Closings
====================

In [8]:
newClosingDict = {}
try:
    for thread in threads:
        msgGotReplies = thread.get_num_people() > 1
        initialMsg = thread.get_content()[0]
        initialMsg = clean_message(initialMsg)
        closing = getClosing(initialMsg, newRegex)
        if closing is not None:
            if closing in newClosingDict:
                newClosingDict[closing]['count'] += 1
            else:
                newClosingDict[closing] = {'count': 1, 'replyCount': 0}
            if msgGotReplies:
                newClosingDict[closing]['replyCount'] += 1
except Exception as e:
    print e
finally:
    counts = []
    replyCounts = []
    replyRates = []
    closings = []
    for k, v in newClosingDict.iteritems():
        counts.append(v['count'])
        replyCounts.append(v['replyCount'])
        closings.append(k)
        replyRates.append(1.0 * v['replyCount'] / v['count'])
    d = {'numTimes': pd.Series(counts, index=closings),
         'replies': pd.Series(replyCounts, index=closings),
         'replyRate': pd.Series(replyRates, index=closings)}
    df = pd.DataFrame(d)

In [9]:
# Let's not wait half an hour every time we run this notebook
df = pd.DataFrame(d)
df.index.names = ['closing']
df.to_csv('closing_reply_rates.csv')

(Saving data is always a good bet.)

In [10]:
pd.options.display.float_format = '{:.3f}'.format
df = pd.DataFrame.from_csv('closing_reply_rates.csv')
df = df.sort_values(by='replyRate', ascending=False)
df

Unnamed: 0_level_0,numTimes,replies,replyRate
closing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tia,842,582,0.691
thanks in advance,3093,2031,0.657
thanks a lot,623,409,0.657
appreciated,226,143,0.633
thx,266,168,0.632
thanks,25179,15871,0.63
please help,162,101,0.623
thanks and regards,227,140,0.617
take care,183,111,0.607
many thanks,642,385,0.6


This is interesting but we don't want to try to draw conclusions from closings with a small sample size. Let's only look at closings that appeared 1,000+ times.

In [11]:
# Filter out closings without much of a sample size
df2 = df[df.numTimes >= 1000].sort('replyRate', ascending=False)
df2

Unnamed: 0_level_0,numTimes,replies,replyRate
closing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
thanks in advance,3093,2031,0.657
thanks,25179,15871,0.63
thank you,3117,1806,0.579
cheers,9508,5177,0.544
kind regards,1117,602,0.539
regards,10305,5517,0.535
best regards,3838,2030,0.529
best,1738,889,0.512


Interesting! Let's quickly graph how response rates compare for closings that were seen at least 1,000 times...

In [12]:
# Graph it!
from bokeh.io import output_notebook, push_notebook, show
from bokeh.charts import Bar
from bokeh.charts.attributes import CatAttr
from bokeh.models import NumeralTickFormatter
output_notebook()
label = CatAttr(df=df2, sort=False)
bar = Bar(df2, values='replyRate', label=label, legend=None,
          ylabel='Reply Rate (%)', xlabel='Email Closing')
bar.yaxis.formatter = NumeralTickFormatter(format='0%')
handle = show(bar, notebook_handle=True)

It looks like emails that closed with "thanks in advance", "thanks", or "thank you" got more responses! Next, we will look at this further.
Step 5. Correlating Thankful Email Closings with Response Rate
====================
Let's do a comparison of emails that closed with thanks in advance/thanks/thank you with all other closings that appeared 1,000 times by grouping the data graphed above...

In [13]:
def isThankful(closing):
    if 'thank' in closing:
        return 'Variation of thank you'
    return 'Not a variation of thank you'


grouped = df.drop('replyRate', 1).groupby(isThankful)
grouped = grouped.sum()
grouped['replyRate'] = grouped['replies'] / grouped['numTimes']
grouped

Unnamed: 0,numTimes,replies,replyRate
Not a variation of thank you,32587,17086,0.524
Variation of thank you,33237,20793,0.626


In [14]:
label = CatAttr(df=grouped, sort=False)
bar = Bar(grouped, values='replyRate', label=label, legend=None,
          ylabel='Reply Rate (%)', xlabel='Email Closing')
bar.yaxis.formatter = NumeralTickFormatter(format='0%')
handle = show(bar, notebook_handle=True)

There's a clear difference for those emails that closed with gratitude!

One last comparison: how do emails closing with any variation of thanks/thank you (e.g., includes "thank") compare to all other emails, with regards to response rate? (This includes additional variations of "thanks" in the thankful category, and includes emails with any or no closing in the "other" category.)

In [15]:
thankCounter = {'replyCount': 0, 'count': 0}
otherCounter = {'replyCount': 0, 'count': 0}
try:
    for thread in threads:
        initialMsg = thread.get_content()[0]
        initialMsg = clean_message(initialMsg)
        hasReply = thread.get_num_people() > 1
        closing = getClosing(initialMsg, closingRegex)
        if closing is not None and 'thank' in closing:
            thankCounter['count'] += 1
            if hasReply:
                thankCounter['replyCount'] += 1
        else:
            otherCounter['count'] += 1
            if hasReply:
                otherCounter['replyCount'] += 1
except Exception as e:
    print e
finally:  # if something crashes, we can see the partial results!
    print 'Emails with thankful closings\n' + '-' * 20
    print 'Email count: ' + str(thankCounter['count'])
    print 'Reply count: ' + str(thankCounter['replyCount'])
    print 'Response rate: ' + str(thankCounter['replyCount'] * 1.0 /
                                  thankCounter['count'])
    print '\n'
    print 'All other emails\n' + '-' * 20
    print 'Email count: ' + str(otherCounter['count'])
    print 'Reply count: ' + str(otherCounter['replyCount'])
    print 'Response rate: ' + str(otherCounter['replyCount'] * 1.0 /
                                  otherCounter['count'])

Emails with thankful closings
--------------------
Email count: 31923
Reply count: 19961
Response rate: 0.625285844062


All other emails
--------------------
Email count: 334519
Reply count: 154034
Response rate: 0.460464129093


There's quite a difference in emails that closed with a variation of thanks or thank you, compared to all other emails!