In [1]:
# Import the necessary Python modules
import os
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import requests
import numpy as np
import pandas as pd
import time
import re

In [2]:
def get_chrome():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "D:/Drivers/chromedriver"}
    browser = Browser('chrome', **executable_path, headless=False)
    return browser

### NASA Mars News

In [3]:
# Render NASA website in Chrome adding some delay to settle display
browser  = get_chrome()
url = "https://mars.nasa.gov/news/"
browser.visit(url)
time.sleep(2)

In [4]:
# Get HTML object and process it with Beautiful Soup
html = browser.html 
soup = bs(html, 'html.parser')
#print(soup.prettify())

In [5]:
# Start testing the returned object  
title = soup.title.text
print(title)

News  – NASA’s Mars Exploration Program 


In [6]:
# View part of HTML code to find tags and classes needed to gather the rerquired text
results = soup.find_all('div', class_='slide', limit=20)
for i in range(1): 
    print(results[0])

<div class="slide slick-slide slick-cloned" index="-2" style="width: 352px;">
<div class="image_and_description_container">
<a href="/news/8622/virginia-middle-school-student-earns-honor-of-naming-nasas-next-mars-rover/">
<div class="rollover_description">
<div class="rollover_description_inner">
NASA chose a seventh-grader from Virginia as winner of the agency's "Name the Rover" essay contest. Alexander Mather's entry for "Perseverance" was voted tops among 28,000 entries. 
</div>
<div class="overlay_arrow">
<img alt="More" src="/assets/overlay-arrow.png"/>
</div>
</div>
<img alt="Virginia Middle School Student Earns Honor of Naming NASA's Next Mars Rover" class="img-lazy" data-lazy="/system/news_items/list_view_images/8622_1-PIA23764-RoverNamePlateonMars-320x240.jpg" src="/assets/loading_320x240.png"/>
</a>
</div>
<div class="content_title">
<a href="/news/8622/virginia-middle-school-student-earns-honor-of-naming-nasas-next-mars-rover/">
Virginia Middle School Student Earns Honor of 

In [7]:
# Pull news title based on the unique class 'content_title'
title_list = soup.find_all('div', class_='content_title')
news_title=title_list[0].text.strip()
news_title

'Mars Now'

In [8]:
# Pull news paragraph based on the unique class 'rollover_description_inner'
# results are returned as an iterable list
paragraph_list = soup.find_all('div', class_='rollover_description_inner')
news_p=paragraph_list[0].text.strip()
news_p

'A mission to investigate key questions about potential life on Mars.'

In [9]:
# Exit the current Chrome browser session
browser.quit()

### JPL Mars Space Images - Featured Image

In [10]:
# Render JPL website in Chrome adding some delay to settle display
browser  = get_chrome()
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)
time.sleep(2)

In [11]:
# Select the 'FULL IMAGE' button to get large high resolution image
browser.click_link_by_partial_text('FULL IMAGE')



In [12]:
# Find 'more_info' selector to get to the right level of hierarchy
browser.is_element_present_by_text("more info", wait_time=1)
mi_element = browser.find_link_by_partial_text("more info")
mi_element.click()

In [13]:
# Get HTML object and process it with Beautiful Soup
html = browser.html 
soup = bs(html, 'html.parser')
#print(soup.prettify())

In [14]:
# Continue scraping the image based on element 'figure' class_='lede'
image_url = soup.find('figure', class_='lede').a["href"]

In [15]:
# Update scraped image URL to create a featureed image URL
featured_image_url = f'https://www.jpl.nasa.gov{image_url}'
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA19323_hires.jpg'

In [16]:
# Exit the current Chrome browser session
browser.quit()

### Mars Weather

In [17]:
# Render JPL website in Chrome adding some delay to settle display
browser  = get_chrome()
url = "https://twitter.com/marswxreport?lang=en"
browser.visit(url)
time.sleep(2)

In [18]:
# Get HTML object and process it with Beautiful Soup
html = browser.html 
soup = bs(html, 'html.parser')
print(soup.prettify())

<html dir="ltr" lang="en" style="font-size: 15px;">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=0,viewport-fit=cover" name="viewport"/>
  <link href="//abs.twimg.com" rel="preconnect"/>
  <link href="//api.twitter.com" rel="preconnect"/>
  <link href="//pbs.twimg.com" rel="preconnect"/>
  <link href="//t.co" rel="preconnect"/>
  <link href="//video.twimg.com" rel="preconnect"/>
  <link href="//abs.twimg.com" rel="dns-prefetch"/>
  <link href="//api.twitter.com" rel="dns-prefetch"/>
  <link href="//pbs.twimg.com" rel="dns-prefetch"/>
  <link href="//t.co" rel="dns-prefetch"/>
  <link href="//video.twimg.com" rel="dns-prefetch"/>
  <link as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/web/polyfills.aa560c44.js" nonce="" rel="preload"/>
  <link as="script" crossorigin="anonymous" href="https://abs.twimg.com/responsive-web/web/vendors~main.55bd4704.js" nonce="" rel="preload"/>
  <link as=

In [19]:
tweet = soup.find('div', class_='css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0').find('span', class_='css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0').get_text()
tweet

'InSight sol 548 (2020-06-11) low -91.2ºC (-132.1ºF) high -2.8ºC (26.9ºF)\nwinds from the SW at 4.8 m/s (10.8 mph) gusting to 20.1 m/s (45.0 mph)\npressure at 7.40 hPa'

In [20]:
mars_weather = tweet.replace('\n', ' ')
mars_weather

'InSight sol 548 (2020-06-11) low -91.2ºC (-132.1ºF) high -2.8ºC (26.9ºF) winds from the SW at 4.8 m/s (10.8 mph) gusting to 20.1 m/s (45.0 mph) pressure at 7.40 hPa'

In [21]:
# Exit the current Chrome browser session
browser.quit()

### Mars Facts

In [22]:
# Render JPL website in Chrome adding some delay to settle display
browser  = get_chrome()
url = "https://space-facts.com/mars/"
browser.visit(url)
time.sleep(2)

In [23]:
# Get HTML object and process it with Beautiful Soup
html = browser.html 
soup = bs(html, 'html.parser')
print(soup.prettify())

<html lang="en-US">
 <head>
  <script src="https://www.googletagservices.com/activeview/js/current/osd.js?cb=%2Fr20100101">
  </script>
  <script src="https://partner.googleadservices.com/gampad/cookie.js?domain=space-facts.com&amp;callback=_gfp_s_&amp;client=ca-pub-4251889121233823">
  </script>
  <script id="google_shimpl" src="https://pagead2.googlesyndication.com/pagead/js/r20200608/r20190131/show_ads_impl_fy2019.js">
  </script>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="https://gmpg.org/xfn/11" rel="profile"/>
  <link href="https://space-facts.com/wp/xmlrpc.php" rel="pingback"/>
  <script>
   function fvmuag(){if(navigator.userAgent.match(/x11.*fox\/54|oid\s4.*xus.*ome\/62|oobot|ighth|tmetr|eadles|ingdo/i))return!1;if(navigator.userAgent.match(/x11.*ome\/75\.0\.3770\.100/i)){var e=screen.width,t=screen.height;if("number"==typeof e&&"number"==typeof t&&862==t&&1367==e)return!1}return!0}
  </script>
  <!-- Jetpack

In [24]:
# Access Mars facts table from HTML above, read into Pandas and process it
facts_table = pd.read_html(url)
facts_df = facts_table[0]
facts_df.columns = ["Category", "Measurement"]
facts_df = facts_df.set_index("Category")
facts_df

Unnamed: 0_level_0,Measurement
Category,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [25]:
# Export table above to HTML for further processing
mars_facts_table = facts_df.to_html()
mars_facts_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Measurement</th>\n    </tr>\n    <tr>\n      <th>Category</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </t

In [26]:
# Exit the current Chrome browser session
browser.quit()

### Mars Hemispheres

In [27]:
# Render Astropedia website in Chrome adding some delay to settle display
browser  = get_chrome()
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)
time.sleep(10)

In [28]:
# Get HTML object and process it with Beautiful Soup
html = browser.html 
soup = bs(html, 'html.parser')
print(soup.prettify())

<html lang="en">
 <head>
  <link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.3/themes/smoothness/jquery-ui.css" rel="stylesheet" type="text/css"/>
  <title>
   Astropedia Search Results | USGS Astrogeology Science Center
  </title>
  <meta content="USGS Astrogeology Science Center Astropedia search results." name="description"/>
  <meta content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM" name="google-site-verification"/>
  <!--<link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Open+Sans:400italic,400,bold"/>-->
  <link href="/css/main.css" media="screen" rel="stylesheet"/>
  <link href="/css/print.css" media="print" rel="styles

In [29]:
# Select HTML code corresponding to the four Marse Image items only
imgs = soup.find_all("div", class_="item")
imgs

[<div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/></a><div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div> <!-- end description --></div>,
 <div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/08eac6e22c07fb1fe72223a79252de20_schiapa

In [30]:
# Access the four Large High Resolution images by clicking a link corresponding to the corresponding image title
hemisphere_image_urls = []

for img in imgs:
    # Get a page title and acces that page
    title = img.find('h3').text
    browser.click_link_by_partial_text(title)
    time.sleep(20)

    # Get HTML object and process it with Beautiful Soup
    html = browser.html 
    single_soup = bs(html, 'html.parser')
    
    # Process this page by Beautiful Soup and build the complete dictionary with title and HTML link
    img_url = single_soup.find('img', class_='wide-image')['src']
    hemisphere_image_urls.append({"title" : title, "img_url" : f'https://astrogeology.usgs.gov{img_url}'})

    # Return to the original HTML link
    browser.visit(url)
    time.sleep(20)    
    
hemisphere_image_urls



[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [31]:
# Exit the current Chrome browser session
browser.quit()