# Downloading and parsing newsletter emails

### Goal

* Download all newsletter emails (filtering can be done later)
* Parse each email and extract all article links
* Store information to feed the newspaper scraper

---

In [1]:
import re
import email
import pprint
import imaplib
import getpass

# Shared pretty-printer used further down to display the parsed email dict.
pp = pprint.PrettyPrinter(indent=4)

# IMAP-over-SSL client for Gmail; every later cell talks to the server
# through this single connection object.
M = imaplib.IMAP4_SSL(host='imap.gmail.com')

In [2]:
# Retrieve credentials: the account name is fixed here, while the password is
# prompted for interactively so it is never stored in the notebook.
user = 'amoya@bluecap.com' #input('User: ')
passwd = getpass.getpass()

# Login
M.login(user, passwd)

# Open the default mailbox (INBOX) read-only: this notebook only reads mail,
# and a read-write selection would let the later RFC822 fetch flag the
# newsletter as \Seen in the user's inbox.
M.select(readonly=True)

········


('OK', ['4552'])

In [3]:
# Search the selected mailbox for every message sent by the newsletter author.
typ, search_data = M.search(None, '(FROM "lizquierdo@bluecap.com")')
if typ != 'OK':
    raise RuntimeError('IMAP search failed: %r' % (search_data,))

# The first element of the search response is a whitespace-separated list of
# message numbers; it is empty (or None) when nothing matched.
msg_nums = search_data[0].split() if search_data and search_data[0] else []
if not msg_nums:
    raise ValueError('No emails found for the given sender')

# Only the last match is processed here; iterate over `msg_nums` instead to
# handle every newsletter email.
n_last = msg_nums[-1]

# RFC822 asks the server for the complete raw message (headers + body).
typ, data = M.fetch(n_last, '(RFC822)')
if typ != 'OK':
    raise RuntimeError('IMAP fetch failed: %r' % (data,))

In [4]:
# Parse the fetched message into a flat dict holding the main headers and the
# list of article links found in its body.
email_obj = {}

for response_part in data:

    # The fetch response mixes tuples (which carry the raw message in their
    # second element) with other items; only the tuples are of interest.
    if not isinstance(response_part, tuple):
        continue

    msg = email.message_from_string(response_part[1].decode('utf-8'))

    email_obj['from'] = msg['from']
    email_obj['to'] = msg['to']
    email_obj['subject'] = msg['subject']
    email_obj['date'] = msg['date']

    # For multipart messages only the first sub-part is inspected.
    if msg.is_multipart():
        raw_body = msg.get_payload()[0].get_payload()
    else:
        raw_body = msg.get_payload()

    # When that first sub-part is itself multipart its payload is a list of
    # messages rather than a string, so descend one more level. This explicit
    # check replaces a bare `except:` that would also have hidden real errors.
    if isinstance(raw_body, list):
        raw_body = raw_body[0].get_payload()

    # Remove "=\r\n" sequences (quoted-printable soft line breaks) so that
    # wrapped URLs are joined back together, then drop stray carriage returns.
    body = raw_body.replace("=\r\n", "").replace("\r", "")

    # Extract links to articles.
    # NOTE(review): the last match is discarded by the [:-1] slice --
    # presumably a footer/signature link; confirm this is still intended.
    urls_raw = re.findall(r"(?P<url>https?://[^\s]+)", body)[:-1]

    # Sanity: cut each match at the first ">" to drop trailing markup.
    urls = [url.split(">")[0] for url in urls_raw]

    # The 'urls' key is only present when at least one link was found.
    if urls:
        email_obj['urls'] = urls


pp.pprint(email_obj)

{   'date': 'Wed, 11 Apr 2018 10:45:25 +0200',
    'from': 'Laia Izquierdo <lizquierdo@bluecap.com>',
    'subject': 'Bluecap Banking Breakfast',
    'to': 'Bluecap Global <bluecapglobal@bluecap.com>',
    'urls': [   'https://cincodias.elpais.com/cincodias/2018/04/10/midinero/',
                'http://www.expansion.com/empresas/banca/2018/04/11/',
                'http://www.expansion.com/empresas/banca/2018/04/11/',
                'https://cincodias.elpais.com/cincodias/2018/04/10/',
                'https://cincodias.elpais.com/cincodias/2018/04/10/companias/1523348837_627302.html',
                'http://www.expansion.com/valencia/2018/04/10/5acc92a746163fd4668b464d.html',
                'http://www.expansion.com/empresas/banca/2018/04/11/',
                'http://www.expansion.com/empresas/banca/2018/04/11/',
                'https://www.elconfidencial.com/empresas/2018-04-11/banco-',
                'http://www.expansion.com/empresas/banca/2018/04/10/',
                'http

In [5]:
# Print one extracted link per line. `print(...)` with a single argument works
# on both Python 2 and Python 3, and `.get` avoids a KeyError when the parsed
# email contained no links (the 'urls' key is only set when links were found).
for url in email_obj.get('urls', []):
    print(url)

https://cincodias.elpais.com/cincodias/2018/04/10/midinero/
http://www.expansion.com/empresas/banca/2018/04/11/
http://www.expansion.com/empresas/banca/2018/04/11/
https://cincodias.elpais.com/cincodias/2018/04/10/
https://cincodias.elpais.com/cincodias/2018/04/10/companias/1523348837_627302.html
http://www.expansion.com/valencia/2018/04/10/5acc92a746163fd4668b464d.html
http://www.expansion.com/empresas/banca/2018/04/11/
http://www.expansion.com/empresas/banca/2018/04/11/
https://www.elconfidencial.com/empresas/2018-04-11/banco-
http://www.expansion.com/empresas/banca/2018/04/10/
https://cincodias.elpais.com/cincodias/2018/04/10/companias/1523355275_241527.html
https://cincodias.elpais.com/cincodias/2018/04/10/mercados/
http://www.expansion.com/economia/2018/04/11/5acdb52b22601d1d088b458f.html
https://www.elconfidencial.com/empresas/2018-04-11/
http://www.expansion.com/empresas/banca/2018/04/10/
https://www.elconfidencial.com/mercados/2018-04-11/


In [6]:
# Close the currently selected mailbox before ending the session.
M.close()

# Log out from the IMAP server; as the last expression of the cell, the
# server's response is what gets displayed as the cell output.
M.logout()

('BYE', ['LOGOUT Requested'])