Initial web scraping using Jupyter Notebook, Beautiful Soup, Pandas, and Requests/Splinter. Data will be scraped from:

1) Mars News Site: https://redplanetscience.com/

2) JPL Mars Space Images: https://spaceimages-mars.com/

3) Galaxy Facts: https://spaceimages-mars.com/

4) Mars Hemishperes: https://marshemispheres.com/

In [1]:
# #install splinter module
# !pip install splinter

In [2]:
# #install webdriver_manager module
# !pip install webdriver_manager

In [3]:
#Import Required Modules
# Automates browser actions
from splinter import Browser #allows computer to communicate directly with webpage/navigate. 
#You can also grab data

# Parses the HTML
from bs4 import BeautifulSoup as bs #improved functionality to grab specified data
import pandas as pd

# For scraping with Chrome
from webdriver_manager.chrome import ChromeDriverManager


WEB SCRAPE ONE: Collect lastest news titles and paragraph texts

In [4]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [5]:
# Url to scrape Mars News Site
url = "https://redplanetscience.com/"

# Call visit on browser and pass in the URL     
browser.visit(url)

#Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

# Set an empty dict for news items that will be saved to Mongo
news_items = {}
    
# Build dictionary for the titles and paragraphs text from scraped data   
news_items["Title"] = soup.find_all("div", class_="content_title")
news_items["Blurb"] = soup.find_all("div", class_="article_teaser_body")
    
# Close the browser after scraping
browser.quit()

In [6]:
news_items
#result is dictionary of lists (beautiful soup objects). 

{'Title': [<div class="content_title">Independent Review Indicates NASA Prepared for Mars Sample Return Campaign</div>,
  <div class="content_title">NASA Establishes Board to Initially Review Mars Sample Return Plans</div>,
  <div class="content_title">7 Things to Know About the Mars 2020 Perseverance Rover Mission</div>,
  <div class="content_title">NASA's Mars Helicopter Attached to Mars 2020 Rover </div>,
  <div class="content_title">What's Mars Solar Conjunction, and Why Does It Matter?</div>,
  <div class="content_title">Join NASA for the Launch of the Mars 2020 Perseverance Rover</div>,
  <div class="content_title">Alabama High School Student Names NASA's Mars Helicopter</div>,
  <div class="content_title">NASA to Hold Mars 2020 Perseverance Rover Launch Briefing</div>,
  <div class="content_title">NASA's Perseverance Rover Attached to Atlas V Rocket</div>,
  <div class="content_title">NASA Updates Mars 2020 Mission Environmental Review</div>,
  <div class="content_title">Mars 20

    WEB SCRAPE TWO: Grab current featured image of Mars from JPL Mars Space Images

In [17]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [18]:
# URL to grab current Featured Mars Image
space_images_url = "https://spaceimages-mars.com/"

# Call visit on browser and pass in the URL     
browser.visit(space_images_url)

In [19]:
#navigate to full image of current featured image
full_image = browser.find_by_tag("button")[1]
full_image.click()


#Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

#generate final image url
relative_image_path = soup.find("img", class_="headerimage fade-in")['src']
featured_image_url = space_images_url + relative_image_path

# Close the browser after scraping
browser.quit()

<html class="fancybox-margin fancybox-lock"><head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet"/>
<!-- <link rel="stylesheet" type="text/css" href="css/font.css"> -->
<link href="css/app.css" rel="stylesheet" type="text/css"/>
<link href="https://stackpath.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
<title>Space Image</title>
<style type="text/css">.fancybox-margin{margin-right:15px;}</style></head>
<body>
<div class="header">
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="#"><img id="logo" src="image/nasa.png"/><span class="logo">Jet Propulsion Laboratory</span>
<span class="logo1">California Institute of Technology</span></a>
<button aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation" class="navbar-toggler" data-target="#navbarNav" data-t

In [22]:
#view final image url
featured_image_url


'https://spaceimages-mars.com/image/featured/mars3.jpg'

    WEB SCRAPE THREE:  Scrape table of Mars facts including Diameter, Mass, etc. from Mars Facts webpage

In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Url to scrape Mars Facts table
url = "https://galaxyfacts-mars.com/"

# Call visit on browser and pass in the URL     
browser.visit(url)

#Use Pandas to parse table from URL. Acquires all tables on a page (2 tables included in generated list)
facts_tables = pd.read_html(url)
facts_tables


In [None]:
#Build df with isolated data from Mars table (not data from Mars-Earth comparison)
mars_facts_table = facts_tables[1]
mars_facts_table = mars_facts_table.drop([mars_facts_table.index[0]])
mars_facts_table.columns = ['Fact Category', 'Response']
mars_facts_table

In [None]:
#Use Pandas to convert data to HTML table string
html_table = mars_facts_table.to_html()
html_table

In [None]:
# Close the browser after scraping
browser.quit()

    WEB SCRAPE FOUR: Grab high resolution images for each of Mars' hemispheres


In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()} #chrome driver manager 
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Url to scrape high resolution Mars imagery

url = "https://marshemispheres.com/"

# Call visit on browser and pass in the URL     
browser.visit(url)

#Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

# Set an empty dict for news items that will be saved to Mongo
mars_hem_image_urls = []
    
#locate class that includes images for each hemisphere
mars_hemispheres = soup.find_all("div", class_="item")

#loop through above class to pull out data for each hemisphere
for hemisphere in mars_hemispheres:
    title = soup.find('h3')
    img_url = soup.find("img", class_="thumb")['src']
    mars_hem_image_urls.append({'title': title,'image_url': img_url})
    

In [None]:
mars_hem_image_urls 

In [None]:
# Close the browser after scraping
browser.quit()