In [2]:
# app
import flask
from flask_pymongo import PyMongo

# data wrangling
import pandas as pd

# html gather/parse
from bs4 import BeautifulSoup
import requests
import urllib.parse

# browser
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from resources import username, password, cluster

In [404]:
username

'ajmorrison'

In [4]:
url_news = 'https://mars.nasa.gov/news/'

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
    'Content-Type': 'text/html',
}

newsrequest = requests.get(url_news, headers = headers)
newsrequest

<Response [200]>

In [3]:
# just running an simple request does *not* gather all articles--they are populated after page is loaded by JS
# headless browser necessary

soup = BeautifulSoup(newsrequest.text, 'html.parser')
print(soup.prettify())

NameError: name 'newsrequest' is not defined

In [5]:
#browser instance
browser = Browser('firefox', headless=False)

In [20]:
#latest news: title and paragraph text
    
browser.visit(url_news)
latest_news_title = browser.find_by_css('div[class="content_title"]').first.text
latest_news_article = browser.find_by_css('div[class="article_teaser_body"]').first.text

In [23]:
#title
print(latest_news_title)
#article
print(latest_news_article)

#news dict
news_dict = {
    'title' : latest_news_title,
    'article' : latest_news_article
}

#verify
news_dict

NASA Wins Two Emmy Awards for Interactive Mission Coverage
NASA-JPL's coverage of the Mars InSight landing earns one of the two wins, making this the NASA center's second Emmy.


{'title': 'NASA Wins Two Emmy Awards for Interactive Mission Coverage',
 'article': "NASA-JPL's coverage of the Mars InSight landing earns one of the two wins, making this the NASA center's second Emmy."}

In [82]:
#jpl image

url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

browser.visit(url_jpl)

html = browser.html

In [88]:
soup = BeautifulSoup(html)
print(soup.prettify())

<html class="js flexbox canvas canvastext webgl no-touch geolocation postmessage no-websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients no-cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers applicationcache svg inlinesvg smil svgclippaths -moz-" style="">
 <!-- START HEADER: "DEFAULT" -->
 <head>
  <script async="" src="https://script.crazyegg.com/pages/scripts/0025/5267.js?435771" type="text/javascript">
  </script>
  <script src="https://m.addthis.com/live/red_lojson/300lo.json?si=5d819e953ea92165&amp;bkl=0&amp;bl=1&amp;pdt=334&amp;sid=5d819e953ea92165&amp;pub=&amp;rev=v8.27.9-wp&amp;ln=en&amp;pc=men&amp;cb=0&amp;ab=-&amp;dp=www.jpl.nasa.gov&amp;fp=spaceimages%2F%3Fsearch%3D%26category%3DMars&amp;fr=&amp;of=2&amp;pd=0&amp;irt=0&amp;vcl=0&amp;md=0&amp;ct=1&am

In [131]:
#tag
img_html = soup.find(id="full_image")

#strip partial path
img_str = str(img_html).split('"')[7]

#combine
img_path = urllib.parse.urljoin(url_jpl, img_str)
img_path

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA12831_ip.jpg'

In [349]:
#feat img
print(img_path)


#img dict
feat_img_dict = {
    'img' : img_path
}

#verify
feat_img_dict

https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA12831_ip.jpg


{'img': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA12831_ip.jpg'}

In [139]:
#mars weather
url_weather = 'https://twitter.com/marswxreport?lang=en'

#visit
browser.visit(url_weather)

#find str
weather_data = browser.find_by_css('div[class="js-tweet-text-container"]').first.text

#build dict
weather_dict = {
    'weather_data' : weather_data
}

#display
weather_dict

{'weather_data': 'InSight sol 286 (2019-09-16) low -101.2ºC (-150.1ºF) high -26.5ºC (-15.8ºF)\nwinds from the SSE at 5.1 m/s (11.4 mph) gusting to 16.9 m/s (37.8 mph)\npressure at 7.50 hPa'}

In [6]:
#mars facts/table

url_facts = 'https://space-facts.com/mars/'

#visit
browser.visit(url_facts)

#html
table_html_full = browser.html

#soup
soup_table = BeautifulSoup(table_html_full)

soup_table

<html lang="en-US"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="https://gmpg.org/xfn/11" rel="profile"/><link href="https://space-facts.com/wp/xmlrpc.php" rel="pingback"/> <script src="https://www.googletagservices.com/activeview/js/current/osd.js?cb=%2Fr20100101"></script><script id="google_shimpl" src="https://pagead2.googlesyndication.com/pagead/js/r20190917/r20190131/show_ads_impl.js"></script><script>function fvmuag(){if(navigator.userAgent.match(/x11.*fox\/54|oid\s4.*xus.*ome\/62|oobot|ighth|tmetr|eadles|ingdo/i))return!1;if(navigator.userAgent.match(/x11.*ome\/75\.0\.3770\.100/i)){var e=screen.width,t=screen.height;if("number"==typeof e&&"number"==typeof t&&862==t&&1367==e)return!1}return!0}</script><title>Mars Facts - Interesting Facts about Planet Mars</title> <!-- Jetpack Site Verification Tags --><meta content="OI26qBRJ2SI673vDUn_KgAd_H89WIX3IEgf6TZbhgDs" name="google-site-verification"/><meta content="98809767

In [27]:
# df method

table_html = soup_table.find(id="tablepress-p-mars")
table_html

table_df = pd.read_html(table_html.prettify(), flavor='bs4')
df = table_df[0]

df = df.rename(columns={0:'attribute', 1:'observation'})

df_html = df.to_html(index=False)

In [28]:
df_html

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>attribute</th>\n      <th>observation</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg  (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (  Phobos  &amp;  Deimos  )</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km  (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

In [29]:
df_html = df.to_html().replace('\n', '')
df_html

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>attribute</th>      <th>observation</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <th>1</th>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <th>2</th>      <td>Mass:</td>      <td>6.39 × 10^23 kg  (0.11 Earths)</td>    </tr>    <tr>      <th>3</th>      <td>Moons:</td>      <td>2 (  Phobos  &amp;  Deimos  )</td>    </tr>    <tr>      <th>4</th>      <td>Orbit Distance:</td>      <td>227,943,824 km  (1.38 AU)</td>    </tr>    <tr>      <th>5</th>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>6</th>      <td>Surface Temperature:</td>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>7</th>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>8</th>      <td>Recorded By:</td>      <td>Egypt

In [272]:
table_html

<table class="tablepress tablepress-id-p-mars" id="tablepress-p-mars"><tbody><tr class="row-1 odd"><td class="column-1"><strong>Equatorial Diameter:</strong></td><td class="column-2">6,792 km<br/></td></tr><tr class="row-2 even"><td class="column-1"><strong>Polar Diameter:</strong></td><td class="column-2">6,752 km<br/></td></tr><tr class="row-3 odd"><td class="column-1"><strong>Mass:</strong></td><td class="column-2">6.39 × 10^23 kg<br/> (0.11 Earths)</td></tr><tr class="row-4 even"><td class="column-1"><strong>Moons:</strong></td><td class="column-2">2 (<a href="https://space-facts.com/moons/phobos/">Phobos</a> &amp; <a href="https://space-facts.com/moons/deimos/">Deimos</a>)</td></tr><tr class="row-5 odd"><td class="column-1"><strong>Orbit Distance:</strong></td><td class="column-2">227,943,824 km<br/> (1.38 AU)</td></tr><tr class="row-6 even"><td class="column-1"><strong>Orbit Period:</strong></td><td class="column-2">687 days (1.9 years)<br/></td></tr><tr class="row-7 odd"><td cla

In [358]:
table_dict = {
    'table' : str(table_html)
}
table_dict

{'table': '<table class="tablepress tablepress-id-p-mars" id="tablepress-p-mars"><tbody><tr class="row-1 odd"><td class="column-1"><strong>Equatorial Diameter:</strong></td><td class="column-2">6,792 km<br/></td></tr><tr class="row-2 even"><td class="column-1"><strong>Polar Diameter:</strong></td><td class="column-2">6,752 km<br/></td></tr><tr class="row-3 odd"><td class="column-1"><strong>Mass:</strong></td><td class="column-2">6.39 × 10^23 kg<br/> (0.11 Earths)</td></tr><tr class="row-4 even"><td class="column-1"><strong>Moons:</strong></td><td class="column-2">2 (<a href="https://space-facts.com/moons/phobos/">Phobos</a> &amp; <a href="https://space-facts.com/moons/deimos/">Deimos</a>)</td></tr><tr class="row-5 odd"><td class="column-1"><strong>Orbit Distance:</strong></td><td class="column-2">227,943,824 km<br/> (1.38 AU)</td></tr><tr class="row-6 even"><td class="column-1"><strong>Orbit Period:</strong></td><td class="column-2">687 days (1.9 years)<br/></td></tr><tr class="row-7 o

In [337]:
#mars hemispheres --error method I thought would work

url_hemi = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_hemi)

hemi_element_list = browser.find_by_css('h3')
hemisphere_urls = {}

# this loop gives an 'element reference' is stale error--must reinitiate the browser
for hemisphere in hemi_element_list:
    hemisphere.click()
    link = browser.find_by_text('Original').first['href']
    title = browser.find_by_css('h2[class="title"]').first.text
    hemisphere_urls[title] = link
    browser.back()

StaleElementReferenceException: Message: The element reference of <h3> is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed


In [343]:
# slower method, no loop, hardcoded element indexes

hemisphere_urls = {}

url_hemi = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

browser.visit(url_hemi)
hemi_element_list = browser.find_by_css('h3')
hemi_element_list[0].click()

hemi_url_1 = browser.find_by_text('Sample').first['href']
hemi_title_1 = browser.find_by_css('h2[class="title"]').first.text

browser.visit(url_hemi)
hemi_element_list = browser.find_by_css('h3')
hemi_element_list[1].click()

hemi_url_2 = browser.find_by_text('Sample').first['href']
hemi_title_2 = browser.find_by_css('h2[class="title"]').first.text

browser.visit(url_hemi)
hemi_element_list = browser.find_by_css('h3')
hemi_element_list[2].click()

hemi_url_3 = browser.find_by_text('Sample').first['href']
hemi_title_3 = browser.find_by_css('h2[class="title"]').first.text

browser.visit(url_hemi)
hemi_element_list = browser.find_by_css('h3')
hemi_element_list[3].click()

hemi_url_4 = browser.find_by_text('Sample').first['href']
hemi_title_4 = browser.find_by_css('h2[class="title"]').first.text

In [345]:
hemisphere_image_urls = [
    {"title": hemi_title_1, "img_url": hemi_url_1},
    {"title": hemi_title_2, "img_url": hemi_url_2},
    {"title": hemi_title_3, "img_url": hemi_url_3},
    {"title": hemi_title_4, "img_url": hemi_url_4},
]

hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]

In [350]:
mars_data = {
        "news": news_dict,
        "feat_img_dict": feat_img_dict,
        "weather_dict": weather_dict,
        "table_dict": table_dict,
        "table_df" : df_html,
        "hemisphere_image_urls": hemisphere_image_urls,
    }

In [357]:
print(mars_data['news']['title'])

NASA Wins Two Emmy Awards for Interactive Mission Coverage


In [398]:
mars_data['hemisphere_image_urls'][0]['img_url']

'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'