In [1]:
# Dependencies
import warnings
warnings.filterwarnings('ignore')
from splinter import Browser
import pymongo
import datetime
from bs4 import BeautifulSoup as bs
import requests
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

## Nasa Mars News Scraping

In [3]:
# Define database and collection
nasa_db = client.news_db
collection = nasa_db.articles

In [4]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [5]:
# URL of Nasa Mars News page to be scraped
nasa_url = 'https://redplanetscience.com/'
browser.visit(nasa_url)

nasa_html = browser.html

r = requests.get('https://redplanetscience.com/', timeout=(3))

# Create BeautifulSoup object; parse with 'html.parser'
nasa_soup = bs(nasa_html, 'html.parser')
print(nasa_soup.prettify())

<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta1/dist/css/bootstrap.min.css" integrity="sha384-giJF6kkoqNQ00vy+HMDP7azOuL0xtbfIcaT9wjKHr8RbDVddVHyTfAAsrekwKmP1" rel="stylesheet"/>
  <link href="css/font.css" rel="stylesheet" type="text/css"/>
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link crossorigin="anonymous" href="https://pro.fontawesome.com/releases/v5.10.0/css/all.css" integrity="sha384-AYmEC3Yw5cVb3ZcuHtOA93w35dYTsvhLPVnYs9eStHfGJvOvKxVfELGroGkvsg+p" rel="stylesheet"/>
  <title>
   News - Mars Exploration Program
  </title>
 </head>
 <body>
  <div class="col-md-12">
   <div class="row">
    <nav class="navbar navbar-expand-lg navbar-light fixed-top">
     <div class="container-fluid">
      <a class="navbar-brand" href="#">
       <img src="image/nasa.png" width="80"/>
       <span class="logo">
        MA

In [6]:
# Examine the results and look for a div with the class 'content_title'
result = nasa_soup.find('div', class_='content_title')
result

<div class="content_title">NASA's Perseverance Rover Bringing 3D-Printed Metal Parts to Mars</div>

In [7]:
# Convert the first tag to title text
news_title = result.get_text()
news_title

"NASA's Perseverance Rover Bringing 3D-Printed Metal Parts to Mars"

In [8]:
# Examine the results and look for a div with the class 'article_teaser_body'
p = nasa_soup.find('div', class_='article_teaser_body')
p

<div class="article_teaser_body">For hobbyists and makers, 3D printing expands creative possibilities; for specialized engineers, it's also key to next-generation spacecraft design.</div>

In [9]:
# Convert the first tag to paragraph text
news_p = p.get_text()
news_p

"For hobbyists and makers, 3D printing expands creative possibilities; for specialized engineers, it's also key to next-generation spacecraft design."

## JPL Mars Space Images—Featured Image

In [10]:
# URL of Mars Space Image page to be scraped
space_url = 'https://spaceimages-mars.com/'
browser.visit(space_url)

In [11]:
#find and click the full image button
mars_image_link = browser.find_by_tag('button')[1]
mars_image_link.click()

In [12]:
# Parse HTML with Beautiful Soup
html = browser.html
mars_image_soup = bs(html, 'html.parser')
print(mars_image_soup.prettify())

<html class="fancybox-margin fancybox-lock">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet"/>
  <!-- <link rel="stylesheet" type="text/css" href="css/font.css"> -->
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link href="https://stackpath.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
  <title>
   Space Image
  </title>
  <style type="text/css">
   .fancybox-margin{margin-right:0px;}
  </style>
 </head>
 <body>
  <div class="header">
   <nav class="navbar navbar-expand-lg">
    <a class="navbar-brand" href="#">
     <img id="logo" src="image/nasa.png"/>
     <span class="logo">
      Jet Propulsion Laboratory
     </span>
     <span class="logo1">
      California Institute of Technology
     </span>
    </a>
    <button aria-controls="navbarNav" aria-expande

In [13]:
mars_image_rel = mars_image_soup.find('img', class_ = 'fancybox-image').get('src')

In [14]:
#find the relative image url
mars_image_rel

'image/featured/mars2.jpg'

In [15]:
#Use the base url to create am absolute url
mars_image_url = f'https://spaceimages-mars.com/{mars_image_rel}'
mars_image_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

# Mars Facts

In [16]:
url = 'https://galaxyfacts-mars.com/'

In [17]:
#use the read_html function in Pandas to automatically scrape any tabular data from a page.
tables = pd.read_html(url)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [18]:
df = tables[0]
df.head(20)

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [19]:
#add names to columns
df.columns = ['Description', 'Mars', 'Earth']
df

Unnamed: 0,Description,Mars,Earth
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [20]:
#Drop index
df.set_index('Description', inplace=True)
df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [21]:
df.to_html('table.html')

In [22]:
# OSX Users can run this to open the file in a browser, 
# or you can manually find the file and open it in the browser
!open table.html

# Mars Hemispheres

In [46]:
hemispheres_url = 'https://marshemispheres.com/'
browser.visit(hemispheres_url)

In [47]:
#Create a list to hold images and titles
hemispheres_image_urls = []

#Get a list of all the hemispheres
hemispheres_list = browser.find_by_css('a.product-item img')

#Loop through the links, click each link, find the sample anchor, and return the href
for i in range(len(hemispheres_list)):
    #hemisphere info dictionary 
    hemisphere_info = {}
    browser.find_by_css('a.product-item img')[i].click()
    #find the sample anchor tag and extract the href
    sample_anchor = browser.links.find_by_text('Sample').first
    hemisphere_info["img_url"] = sample_anchor['href']
    #get the hemisphere titles
    hemisphere_info["title"] = browser.find_by_css('h2.title').text
    #append urls to list
    hemispheres_image_urls.append(hemisphere_info)

    #navigate back to homepage
    browser.back()

In [48]:
hemispheres_image_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [49]:
browser.quit

<bound method BaseWebDriver.quit of <splinter.driver.webdriver.chrome.WebDriver object at 0x7feb85550b80>>