In [36]:
import os
from dotenv import load_dotenv
from newsletter.utils import gmail_service
from base64 import urlsafe_b64decode
from bs4 import BeautifulSoup
import requests

In [2]:
# Load variables from a .env file (python-dotenv) and read the two file paths
# this notebook needs. os.environ[...] raises KeyError when a variable is
# missing, so a misconfigured environment fails here rather than further down.
load_dotenv()
creds_file = os.environ["NEWSLETTER_CREDS_FILE"]
token_file = os.environ["NEWSLETTER_TOKEN_FILE"]
# Read-only Gmail scope; the cells below only fetch messages.
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

In [3]:
# Build the Gmail client that later cells call as gmail.users().messages().
# NOTE(review): gmail_service lives in newsletter.utils, outside this notebook;
# presumably it runs the OAuth flow with creds_file/token_file — confirm there.
gmail = gmail_service(SCOPES, creds_file, token_file)

In [22]:
# Gmail message ids of the three sample newsletters explored in this notebook.
medium_id, crunchbase_id, deeplearning_id = (
    "17b63e5c9c10d741",
    "17b63b1c953802a4",
    "17b5ae423b5dff18",
)

In [24]:
# Fetch the DeepLearning.AI newsletter message from the authorised mailbox (userId="me").
dlmsg = gmail.users().messages().get(userId="me", id=deeplearning_id).execute()

In [5]:
# Fetch the Medium and Crunchbase sample messages through one shared
# messages() resource.
messages_api = gmail.users().messages()
medium_msg = messages_api.get(userId="me", id=medium_id).execute()
crunchbase_msg = messages_api.get(userId="me", id=crunchbase_id).execute()


In [49]:
# Top-level fields of the fetched message.
dlmsg.keys()

dict_keys(['id', 'threadId', 'labelIds', 'snippet', 'payload', 'sizeEstimate', 'historyId', 'internalDate'])

In [50]:
# Short text preview of the message; the recorded output still contains HTML
# entities such as &#39;.
dlmsg["snippet"]

'Dear friends, Say you&#39;ve trained a learning algorithm and found that it works well on many examples but performs poorly on a particular subset, or slice, of the data. What can you do? View in'

In [51]:
# Fields available on the message's root payload.
dlmsg["payload"].keys()

dict_keys(['partId', 'mimeType', 'filename', 'headers', 'body', 'parts'])

In [55]:
# Number of headers on the root payload.
len(dlmsg["payload"]["headers"])

28

In [62]:
# Print each header's position beside its name so a header can be located by index.
header_names = [entry["name"] for entry in dlmsg["payload"]["headers"]]
for position, name in enumerate(header_names):
    print(position, name)

0 Delivered-To
1 Received
2 X-Google-Smtp-Source
3 X-Received
4 ARC-Seal
5 ARC-Message-Signature
6 ARC-Authentication-Results
7 Return-Path
8 Received
9 Received-SPF
10 Authentication-Results
11 Received
12 DKIM-Signature
13 DKIM-Signature
14 Return-Path
15 X-HS-Cid
16 List-Unsubscribe
17 Date
18 From
19 Reply-To
20 To
21 Message-ID
22 Subject
23 MIME-Version
24 Content-Type
25 Precedence
26 X-Report-Abuse-To
27 Feedback-ID


In [63]:
# Look the Subject header up by name instead of by position: index 22 is only
# correct for this one message, because the number and order of headers vary
# between emails. Header names are compared case-insensitively.
next(h for h in dlmsg["payload"]["headers"] if h["name"].lower() == "subject")

{'name': 'Subject',
 'value': "The Batch: Apple Weakens Privacy, AI's Invention Wins A Patent, Deere All-In For Robot Tractors, Atari-Playing Algo Learns New Trick"}

In [64]:
# Look the From header up by name instead of by position: index 18 is only
# correct for this one message, because the number and order of headers vary
# between emails. Header names are compared case-insensitively.
next(h for h in dlmsg["payload"]["headers"] if h["name"].lower() == "from")

{'name': 'From', 'value': '"DeepLearning.AI" <thebatch@deeplearning.ai>'}

In [26]:
# Decode the second MIME part's body from URL-safe base64 into text.
# NOTE(review): parts[1] is assumed to be the text/html alternative (as it is
# for the Crunchbase message) and to be UTF-8 encoded — confirm for this message.
dl_html = urlsafe_b64decode(dlmsg["payload"]["parts"][1]["body"]["data"]).decode("utf-8")

In [27]:
# Save the decoded newsletter HTML for inspection in a browser or editor.
# The text was decoded as UTF-8, so write it back as UTF-8 explicitly; without
# encoding= the platform default is used, which can raise UnicodeEncodeError
# or produce a differently-encoded file on some systems.
with open("deeplearning.html", "wt", encoding="utf-8") as f:
    print(dl_html, file=f)

In [29]:
# Parse the decoded newsletter HTML using the "html.parser" builder.
soup = BeautifulSoup(dl_html, "html.parser")

In [33]:
# Collect the distinct link targets found in the newsletter.
# href=True limits the search to anchors that actually carry an href
# attribute; indexing atag["href"] on an anchor without one raises KeyError.
links = {atag["href"] for atag in soup.find_all("a", href=True)}

In [35]:
# Write one link per line. The encoding is set explicitly so the file does not
# depend on the platform default encoding (URLs may contain non-ASCII characters).
with open("dl_links.txt", "wt", encoding="utf-8") as f:
    for link in links:
        print(link, file=f)

In [38]:
# Turn the set into an indexable list. sorted() is used instead of list()
# because set iteration order for strings changes between interpreter runs
# (hash randomisation), which would make links[0] a different URL after every
# kernel restart.
links = sorted(links)

In [39]:
# Fetch the first link. requests has no default timeout, so an unresponsive
# host would block the kernel indefinitely; 30 s bounds the wait.
resp = requests.get(links[0], timeout=30)

In [43]:
# Parse the fetched page so its <title> and <head> can be inspected below.
s = BeautifulSoup(resp.text, "html.parser")

In [44]:
# Page <title> element. The recorded run shows no output for this cell, which
# suggests the fetched page has no <title> — confirm on re-run.
s.title

In [47]:
# Inspect <head>. The recorded output is a script probing navigator.webdriver,
# plugins and permissions rather than article metadata.
# NOTE(review): looks like a bot-detection interstitial, not the linked page — confirm.
s.head

<head>
<meta content="never" name="referrer"/>
<script>
function toS() {
  var num = 0;
  var reg = /./;
  reg.toString = function() {
    num++;
    return 'toS';
  }
  console.debug(reg);
  if (num > 1) {
    return 'b';
  }
  return 'p';
}

function driver() {
  if (navigator.webdriver) {
    return 'b';
  }
  return 'p';
}

function lang() {
  return navigator.languages;
}

function plugins() {
  return navigator.plugins.length;
}

function perm2() {
  var permissions = window.navigator.permissions;
  if (permissions.query.toString() !== 'function query() { [native code] }') {
    return 'b'
  }
  if (permissions.query.toString.toString() !== 'function toString() { [native code] }') {
    return 'b';
  }
  if (permissions.query.toString.hasOwnProperty('[[Handler]]')
      && permissions.query.toString.hasOwnProperty('[[Target]]')
      && permissions.query.toString.hasOwnProperty('[[IsRevoked]]')) {
    return 'b';
  }
  if (permissions.hasOwnProperty('query')) {
    return 'b';
  

In [7]:
# Shorthand for the Crunchbase message's root payload.
cb_payload = crunchbase_msg["payload"]

In [9]:
# MIME type of the root part (recorded: multipart/alternative).
cb_payload["mimeType"]

'multipart/alternative'

In [10]:
# Body size of the root part; recorded as 0, i.e. the root carries no content itself.
cb_payload["body"]["size"]

0

In [11]:
# Number of child parts under the root payload.
len(cb_payload["parts"])

2

In [12]:
# First child part of the Crunchbase message.
cbp0 = cb_payload["parts"][0]

In [13]:
# MIME type of the first part (recorded: text/plain).
cbp0["mimeType"]

'text/plain'

In [14]:
# Reported body size of the first part.
cbp0["body"]["size"]

18539

In [15]:
# Decode the first part's data from URL-safe base64. There is no .decode()
# here, so the result is displayed as raw bytes.
urlsafe_b64decode(cbp0["body"]["data"])

b'Crunchbase Daily \n\n\n <https://www.crunchbase.com/> \n\n\n\nAugust 20, 2021\n\n\nCleantech funds are scaling up in 2021 \n<https://news.crunchbase.com/news/cleantech-funds-are-scaling-up-in-2021/?utm_source=cb_daily&utm_medium=email&utm_campaign=20210820&utm_content=intro&utm_term=content>\n\nWhether the focus is clean manufacturing, energy efficiency, renewables, \nsustainable packaging, or a host of other focus areas, the broad bet is that we \ncan take incremental steps to curtail our carbon-spewing ways. We take a look \nat the startups and funding in the space.\n\nWant more women in startups? Here\xe2\x80\x99s how. \n<https://news.crunchbase.com/news/want-to-see-more-women-in-the-startup-world-a-female-founder-shares-her-top-4-asks/?utm_source=cb_daily&utm_medium=email&utm_campaign=20210820&utm_content=intro&utm_term=content>\n\nSimpliFed <https://www.crunchbase.com/organization/simplifed> founder Andrea \nIppolito <https://www.crunchbase.com/person/andrea-ippolito-b439> sugge

In [16]:
# Second child part; show its MIME type and reported body size on separate lines.
cbp1 = cb_payload["parts"][1]
print(cbp1["mimeType"], cbp1["body"]["size"], sep="\n")

text/html
87697


In [20]:
# Decode the text/html part into a string.
# Assumes the part is UTF-8 encoded — TODO confirm against its Content-Type header.
cb_html = urlsafe_b64decode(cbp1["body"]["data"]).decode("utf-8")

In [21]:
# Save the decoded Crunchbase HTML for inspection.
# The text was decoded as UTF-8, so write it back as UTF-8 explicitly; without
# encoding= the platform default is used, which can raise UnicodeEncodeError
# or produce a differently-encoded file on some systems.
with open("crunchbase.html", "wt", encoding="utf-8") as f:
    print(cb_html, file=f)

In [None]:
# Show the Python type of the message object.
# NOTE(review): `message` is not assigned in any cell visible in this notebook,
# so this cell and the ones after it raise NameError on a fresh kernel —
# confirm where it should come from.
print(type(message))

In [None]:
# Top-level fields of the message.
message.keys()

In [None]:
# Fields of the message's root payload.
message["payload"].keys()

In [None]:
# Python type of the root payload's body.
type(message["payload"]["body"])

In [None]:
# Fields of the root payload's body.
message["payload"]["body"].keys()

In [None]:
# Full body object of the root payload.
message["payload"]["body"]

In [None]:
# Python type of the container holding the child parts.
type(message["payload"]["parts"])

In [None]:
# Number of child parts under the root payload.
len(message["payload"]["parts"])

In [None]:
# Python type of the first child part.
type(message["payload"]["parts"][0])

In [None]:
# Fields of the first child part.
message["payload"]["parts"][0].keys()

In [None]:
# Python type of the first child part's body.
type(message["payload"]["parts"][0]["body"])

In [None]:
# Fields of the first child part's body.
message["payload"]["parts"][0]["body"].keys()

In [None]:
# Raw (still encoded) data string of the first child part.
message["payload"]["parts"][0]["body"]["data"]

In [None]:
# Decode the first part's data. The notebook imports only urlsafe_b64decode
# (plain b64decode is not in scope and raised NameError), and every other cell
# decodes this same field with the URL-safe variant.
urlsafe_b64decode(message["payload"]["parts"][0]["body"]["data"])

```
message
    payload
        mimeType
        body
            size = 0
        [parts]
            mimeType
            body
                size
                data
            
```

In [None]:
# Shorthand for the message's root payload.
payload = message["payload"]

In [None]:
# MIME type of the root part.
payload["mimeType"]

In [None]:
# Body size reported for the root part.
payload["body"]["size"]

In [None]:
# MIME type of the first child part.
payload["parts"][0]["mimeType"]

In [None]:
# MIME type of the second child part.
payload["parts"][1]["mimeType"]

In [None]:
# Second child part; it is decoded and parsed as HTML further down.
part1 = payload["parts"][1]

In [None]:
# Body size reported for the second part.
part1["body"]["size"]

In [None]:
# Decode the second part's data from URL-safe base64. The result stays as
# bytes (no .decode()), unlike the dl_html and cb_html cells.
html = urlsafe_b64decode(part1["body"]["data"])

In [None]:
# First child part; it is decoded and saved as text in the next cell.
part0 = payload["parts"][0]

In [None]:
# Decode the first part into text and save it for inspection.
# The text is decoded as UTF-8, so it is written back as UTF-8 explicitly;
# without encoding= the platform default is used, which can raise
# UnicodeEncodeError or produce a differently-encoded file on some systems.
txt = urlsafe_b64decode(part0["body"]["data"]).decode("utf-8")
with open("msg.txt", "wt", encoding="utf-8") as f:
    print(txt, file=f)

In [None]:
# Parse the decoded part as HTML.
# NOTE: this rebinds `soup`, which an earlier cell already uses for the
# DeepLearning.AI newsletter.
soup = BeautifulSoup(html, "html.parser")

In [None]:
# Save a pretty-printed copy of the parsed HTML for inspection.
# prettify() returns a str that may contain non-ASCII characters, so the file
# encoding is set explicitly instead of relying on the platform default.
with open("msg.html", "wt", encoding="utf-8") as f:
    print(soup.prettify(), file=f)

In [None]:
# Gather the target of every anchor in document order (duplicates kept).
# href=True limits the search to anchors that actually carry an href
# attribute; indexing atag["href"] on an anchor without one raises KeyError.
atags = soup.find_all("a", href=True)
links = [atag["href"] for atag in atags]

In [None]:
# Number of links collected, duplicates included.
len(links)

In [None]:
# First collected link.
links[0]

In [None]:
# Write one link per line. The encoding is set explicitly so the file does not
# depend on the platform default encoding (URLs may contain non-ASCII characters).
with open("links.txt", "wt", encoding="utf-8") as f:
    for link in links:
        print(link, file=f)

In [None]:
# De-duplicate the collected links.
# NOTE: this rebinds `links` from a list to a set, so cells that index it
# (links[0]) fail if re-run after this one.
links = set(links)

In [None]:
# Number of distinct links.
len(links)