Initial web scraping using Jupyter Notebook, Beautiful Soup, Pandas, and Requests/Splinter. Data will be scraped from:

1) Mars News Site: https://redplanetscience.com/

2) JPL Mars Space Images: https://spaceimages-mars.com/

3) Galaxy Facts: https://spaceimages-mars.com/

4) Mars Hemishperes: https://marshemispheres.com/

In [1]:
# #install splinter module
# !pip install splinter

In [2]:
# #install webdriver_manager module
# !pip install webdriver_manager

In [3]:
#Import Required Modules
# Automates browser actions
from splinter import Browser #allows computer to communicate directly with webpage/navigate. 
#You can also grab data

# Parses the HTML
from bs4 import BeautifulSoup as bs #improved functionality to grab specified data
import pandas as pd

# For scraping with Chrome
from webdriver_manager.chrome import ChromeDriverManager


WEB SCRAPE ONE: Collect lastest news titles and paragraph texts

In [4]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [5]:
# Url to scrape Mars News Site
url = "https://redplanetscience.com/"

# Call visit on browser and pass in the URL     
browser.visit(url)

#Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

# Set an empty dict for news items that will be saved to Mongo
news_items = {}
    
# Build dictionary for the titles and paragraphs text from scraped data   
news_items["Title"] = soup.find_all("div", class_="content_title")
news_items["Blurb"] = soup.find_all("div", class_="article_teaser_body")
    
# Close the browser after scraping
browser.quit()

In [6]:
news_items
#result is dictionary of lists (beautiful soup objects). 

{'Title': [<div class="content_title">NASA Invites Students to Name Mars 2020 Rover</div>,
  <div class="content_title">How NASA's Mars Helicopter Will Reach the Red Planet's Surface</div>,
  <div class="content_title">NASA's Mars Reconnaissance Orbiter Undergoes Memory Update</div>,
  <div class="content_title">All About the Laser (and Microphone) Atop Mars 2020, NASA's Next Rover</div>,
  <div class="content_title">NASA's New Mars Rover Will Use X-Rays to Hunt Fossils</div>,
  <div class="content_title">NASA's Treasure Map for Water Ice on Mars</div>,
  <div class="content_title">10.9 Million Names Now Aboard NASA's Perseverance Mars Rover</div>,
  <div class="content_title">A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes</div>,
  <div class="content_title">NASA Establishes Board to Initially Review Mars Sample Return Plans</div>,
  <div class="content_title">InSight's 'Mole' Team Peers into the Pit</div>,
  <div class="content_title">From JPL's Mailroom to Mars and Beyon

    WEB SCRAPE TWO: Grab current featured image of Mars from JPL Mars Space Images

In [7]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [8]:
# URL to grab current Featured Mars Image
space_images_url = "https://spaceimages-mars.com/"

# Call visit on browser and pass in the URL     
browser.visit(space_images_url)

In [9]:
#navigate to full image of current featured image
full_image = browser.find_by_tag("button")[1]
full_image.click()


#Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

#generate final image url
relative_image_path = soup.find("img", class_="headerimage fade-in")['src']
featured_image_url = space_images_url + relative_image_path

# Close the browser after scraping
browser.quit()

In [10]:
#view final image url
featured_image_url


'https://spaceimages-mars.com/image/featured/mars1.jpg'

    WEB SCRAPE THREE:  Scrape table of Mars facts including Diameter, Mass, etc. from Mars Facts webpage

In [11]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [12]:
# Url to scrape Mars Facts table
url = "https://galaxyfacts-mars.com/"

# Call visit on browser and pass in the URL     
browser.visit(url)

#Use Pandas to parse table from URL. Acquires all tables on a page (2 tables included in generated list)
facts_tables = pd.read_html(url)
facts_tables


[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [13]:
#Build df with isolated data from Mars table (not data from Mars-Earth comparison)
mars_facts_table = facts_tables[1]
mars_facts_table = mars_facts_table.drop([mars_facts_table.index[0]])
mars_facts_table.columns = ['Fact Category', 'Response']
mars_facts_table

Unnamed: 0,Fact Category,Response
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 ( Phobos & Deimos )
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [14]:
#Use Pandas to convert data to HTML table string
html_table = mars_facts_table.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Fact Category</th>\n      <th>Response</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 ( Phobos &amp; Deimos )</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>Recorded By:</td>\n      <td>Egyptia

In [15]:
# Close the browser after scraping
browser.quit()

    WEB SCRAPE FOUR: Grab high resolution images for each of Mars' hemispheres


In [16]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [17]:
# Url to scrape high resolution Mars imagery

url = "https://marshemispheres.com/"

# Call visit on browser and pass in the URL     
browser.visit(url)

#Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

# Set an empty dict for news items that will be saved to Mongo
mars_hem_image_urls = []
    
#locate class that includes images for each hemisphere
mars_hemispheres = soup.find_all("div", class_="item")

#loop through above class to pull out data for each hemisphere
for hemisphere in mars_hemispheres:
    title = soup.find('h3')
    img_url = soup.find("img", class_="thumb")['src']
    mars_hem_image_urls.append({'title': title,'image_url': img_url})
    

In [18]:
mars_hem_image_urls 

[{'title': <h3>Cerberus Hemisphere Enhanced</h3>,
  'image_url': 'images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png'},
 {'title': <h3>Cerberus Hemisphere Enhanced</h3>,
  'image_url': 'images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png'},
 {'title': <h3>Cerberus Hemisphere Enhanced</h3>,
  'image_url': 'images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png'},
 {'title': <h3>Cerberus Hemisphere Enhanced</h3>,
  'image_url': 'images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png'}]

In [19]:
# Close the browser after scraping
browser.quit()