# Downloading and parsing newsletter emails

### Goal

* Download all newsletter emails (filtering can be done later)
* Parse each email and extract all article links
* Store information to feed the newspaper scraper

In [28]:
import re
import email
import pprint
import imaplib
import getpass

# Pretty-printer used further down to display the parsed email dictionary
pp = pprint.PrettyPrinter(indent=4)

# IMAP-over-SSL client for Gmail's IMAP host; reused by every cell below
M = imaplib.IMAP4_SSL('imap.gmail.com')

In [29]:
# Retrieve credentials: the user is hard-coded here and the password is
# prompted for without echoing it to the screen
user = 'amoya@bluecap.com'  # alternative: prompt with input('User: ')
passwd = getpass.getpass()

# Log in, then select a mailbox (no argument is passed, so the library
# default applies); select() returns a (status, data) tuple
M.login(user, passwd)
M.select()

········


('OK', ['4551'])

In [31]:
# Search the selected mailbox for all emails sent from the newsletter author
typ, data = M.search(None, '(FROM "lizquierdo@bluecap.com")')

# data[0] is split on whitespace and only the last entry is kept, i.e. the
# last email of the search result.
# To process every email instead, iterate: for num in data[0].split(): ...
n_last = data[0].split()[-1]

# Fetch that message with the RFC822 item
# (RFC822: Standard for ARPA Internet Text Messages)
typ, data = M.fetch(n_last, '(RFC822)')

In [32]:
# Parsed view of the fetched message: the from/to/subject/date headers plus,
# when at least one link is found, the list of article URLs under 'urls'.
email_obj = {}

for response_part in data:

    # Only tuple items of the fetch response are parsed; their second
    # element is handed to the email parser as the raw message.
    if not isinstance(response_part, tuple):
        continue

    # NOTE(review): a strict UTF-8 decode raises on messages that contain
    # non-UTF-8 bytes -- confirm the newsletter is always UTF-8/ASCII.
    msg = email.message_from_string(response_part[1].decode('utf-8'))

    email_obj['from'] = msg['from']
    email_obj['to'] = msg['to']
    email_obj['subject'] = msg['subject']
    email_obj['date'] = msg['date']

    # For multipart messages only the first part is used as the body.
    if msg.is_multipart():
        raw_body = msg.get_payload()[0].get_payload()
    else:
        raw_body = msg.get_payload()

    # When that first part is itself multipart, its payload is a list of
    # sub-messages rather than text: descend one more level. This explicit
    # check replaces a bare ``except:`` that silently hid any other error.
    if isinstance(raw_body, list):
        raw_body = raw_body[0].get_payload()

    # Remove "=\r\n" sequences (presumably quoted-printable soft line breaks
    # that would otherwise split long URLs) and any remaining carriage
    # returns before looking for links.
    body = raw_body.replace("=\r\n", "").replace("\r", "")

    # Every http(s) URL up to the next whitespace character. The last match
    # is dropped.
    # NOTE(review): presumably the trailing match is a non-article link
    # (e.g. a signature or footer) -- confirm against a real newsletter.
    urls_raw = re.findall(r"(?P<url>https?://[^\s]+)", body)[:-1]

    # Sanity: cut each match at the first ">" so trailing markup such as
    # "<http://...>" delimiters does not end up in the URL.
    urls = [url.split(">")[0] for url in urls_raw]

    if urls:
        email_obj['urls'] = urls


pp.pprint(email_obj)

{   'date': 'Tue, 10 Apr 2018 10:15:51 +0200',
    'from': 'Laia Izquierdo <lizquierdo@bluecap.com>',
    'subject': 'Bluecap Banking Breakfast',
    'to': 'Bluecap Global <bluecapglobal@bluecap.com>',
    'urls': [   'https://www.elconfidencial.com/espana/comunidad-valenciana/2018-04-09/goirigolzarri-bankia-fusion-bbva-roman-escolano_1547258/',
                'http://www.expansion.com/valencia/2018/04/09/5acb8ca046163fe26b8b45cf.html',
                'https://cincodias.elpais.com/cincodias/2018/04/09/companias/1523289300_337618.html',
                'https://www.elconfidencial.com/mercados/2018-04-10/bankia-sube-un-7-el-dividendo-para-repartir-340-m-y-aumenta-el-pay-out-en-efectivo_1547458/',
                'https://www.elconfidencial.com/tags/empresas/allianz-6267/',
                'https://www.elconfidencial.com/empresas/2018-04-10/santander-popular-aegon-allianz-guerra-millones_1547385/',
                'https://www.elconfidencial.com/espana/2018-04-09/banco-santander-17-mill

In [33]:
# Print one extracted article URL per line. The 'urls' key is only present
# when at least one link was found, so fall back to an empty list instead of
# raising KeyError. print(...) with a single argument behaves the same on
# Python 2 and Python 3, unlike the Python 2-only print statement.
for url in email_obj.get('urls', []):
    print(url)

https://www.elconfidencial.com/espana/comunidad-valenciana/2018-04-09/goirigolzarri-bankia-fusion-bbva-roman-escolano_1547258/
http://www.expansion.com/valencia/2018/04/09/5acb8ca046163fe26b8b45cf.html
https://cincodias.elpais.com/cincodias/2018/04/09/companias/1523289300_337618.html
https://www.elconfidencial.com/mercados/2018-04-10/bankia-sube-un-7-el-dividendo-para-repartir-340-m-y-aumenta-el-pay-out-en-efectivo_1547458/
https://www.elconfidencial.com/tags/empresas/allianz-6267/
https://www.elconfidencial.com/empresas/2018-04-10/santander-popular-aegon-allianz-guerra-millones_1547385/
https://www.elconfidencial.com/espana/2018-04-09/banco-santander-17-millones-euros-valores-santander_1547207/
http://www.expansion.com/empresas/banca/2018/04/09/5acb723322601d437d8b459d.html
https://cincodias.elpais.com/cincodias/2018/04/09/midinero/1523282687_050543.html
https://cincodias.elpais.com/cincodias/2018/04/09/mercados/1523291997_592348.html
http://www.expansion.com/empresas/banca/2018/04/10

In [34]:
# Close the currently selected mailbox
M.close()

# End the IMAP session; logout() returns the server's response tuple
M.logout()

('BYE', ['LOGOUT Requested'])