## Step 1 - Scraping

In [1]:
# Dependencies
import os
from urllib.parse import urljoin

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

### NASA Mars News

- Scrape the Mars News Site (https://redplanetscience.com/)
- Collect the latest News Title and Paragraph Text
- Assign the text to variables that you can reference later.

In [2]:
# Let webdriver_manager install (or reuse a cached) ChromeDriver and hand its
# path to splinter; the browser is opened in a visible (non-headless) window
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - 

[WDM] - Current google-chrome version is 94.0.4606
[WDM] - Get LATEST driver version for 94.0.4606
[WDM] - Driver [C:\Users\Angela\.wdm\drivers\chromedriver\win32\94.0.4606.61\chromedriver.exe] found in cache


In [3]:
# Navigate to Mars News page
browser.visit('https://redplanetscience.com/')

# Give the page up to 5 seconds to render a news title before the HTML is
# snapshotted; if the element is not in the DOM yet, the lookups below
# return None and fail on `.text`. Returns as soon as the element appears.
browser.is_element_present_by_css('div.content_title', wait_time=5)

# Create HTML object
html = browser.html

# Create BS object, parse with html.parser
page = bs(html, 'html.parser')

# Find News Title (class = content_title), assign to variable.
# find() returns the first match in the document.
news_title = page.find('div', class_='content_title').text
print(news_title)

# Find Paragraph Text (class = article_teaser_body), assign to variable
news_p = page.find('div', class_='article_teaser_body').text
print(news_p)

My Culture, My Voice
In honor of Hispanic Heritage Month, Christina Hernandez, an instrument engineer on the Mars 2020 mission, talks about her childhood and journey to NASA.


### JPL Mars Space Images - Featured Image 
- Visit the Featured Space Image site (https://spaceimages-mars.com/)
- Use splinter to navigate the site and find the image url for the current Featured Mars Image, and assign the url string to a variable called featured_image_url
- Make sure to find the image url to the full size .jpg image, and save its complete url string

In [4]:
# Base URL of the Featured Space Image site; the image path is resolved against it
space_images_url = 'https://spaceimages-mars.com/'

# Navigate to Featured Space Image page
browser.visit(space_images_url)

# Give the page up to 5 seconds to render the header image before the HTML is
# snapshotted, in case it is not in the DOM yet when `browser.html` is read
browser.is_element_present_by_css('img.headerimage', wait_time=5)

# Create HTML object
html = browser.html

# Create BS object, parse with html.parser
page = bs(html, 'html.parser')

# Find current Featured Mars Image (class = headerimage), assign to variable.
# Matching on the single 'headerimage' class keeps working if the element's
# other classes are reordered or removed; a multi-class string such as
# 'headerimage fade-in' only matches that exact class attribute value.
images = page.find('img', class_='headerimage')['src']
print(images)

# urljoin resolves relative, root-relative and absolute src values correctly,
# unlike plain string concatenation
featured_image_url = urljoin(space_images_url, images)
print(featured_image_url)

image/featured/mars3.jpg
https://spaceimages-mars.com/image/featured/mars3.jpg


### Mars Facts 
- Visit the Mars Facts webpage (https://galaxyfacts-mars.com/)
- Use Pandas to scrape the table containing facts about the planet, including Diameter, Mass, etc.
- Use Pandas to convert the data to a HTML table string.

In [5]:
# Use pd.read_html to scrape the tables from the Mars Facts webpage into a
# "tables" variable; header=0 uses each table's first row as its column names
tables = pd.read_html("https://galaxyfacts-mars.com/", header=0)

# Build the facts DF from the first table, indexed by its label column.
# set_index returns a new frame, so tables[0] is left unmodified.
mars_facts_df = tables[0].set_index("Mars - Earth Comparison")
print(mars_facts_df)

# Use .to_html() to convert DF to html table string
mars_facts_html = mars_facts_df.to_html()
mars_facts_html

                                    Mars            Earth
Mars - Earth Comparison                                  
Diameter:                       6,779 km        12,742 km
Mass:                    6.39 × 10^23 kg  5.97 × 10^24 kg
Moons:                                 2                1
Distance from Sun:        227,943,824 km   149,598,262 km
Length of Year:           687 Earth days      365.24 days
Temperature:                -87 to -5 °C      -88 to 58°C


'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>\n</table>'

### Mars Hemispheres
- Visit the astrogeology site (https://marshemispheres.com/) to obtain high resolution images for each of Mars' hemispheres
- Click each of the links to the hemispheres to find the image url to the full-resolution image.
- Save the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys "img_url" and "title".
- Append the dictionary with the image url string and the hemisphere title to a list (will contain one dictionary for each hemisphere).

In [6]:
# Base URL of the Astrogeology site; hemisphere links are resolved against it
hemispheres_url = 'https://marshemispheres.com/'

# Navigate to Astrogeology page
browser.visit(hemispheres_url)

# Give the page up to 5 seconds to render the hemisphere sections before the
# HTML is snapshotted
browser.is_element_present_by_css('div.description', wait_time=5)

# Create HTML object
html = browser.html

# Create BS object, parse with html.parser
page = bs(html, 'html.parser')

# Locate each of the hemisphere sections in the html
locations = page.find_all("div", class_="description")

# Create empty list to hold the img_url/title dictionaries
hemisphere_urls_titles_list = []

# Run through the html section locations for each hemisphere...
for location in locations:

    # Find the link to the hemisphere's own page and make it absolute
    img_link = location.find('a')['href']
    hemisphere_page_url = urljoin(hemispheres_url, img_link)

    # Navigate to the page for the full res image and wait (up to 5 seconds)
    # for the image element before snapshotting the HTML
    browser.visit(hemisphere_page_url)
    browser.is_element_present_by_css('img.wide-image', wait_time=5)

    # Parse the detail page under its own name so the index page held in
    # `page` (which `locations` came from) is not overwritten mid-loop
    hemisphere_page = bs(browser.html, 'html.parser')

    # Find the image source link (img section, class 'wide-image') and resolve
    # it against the page it was found on to get the full img_url
    img_src = hemisphere_page.find("img", class_='wide-image')["src"]
    img_url = urljoin(hemisphere_page_url, img_src)

    # Find the title for the image (h2 within the div section, class 'cover')
    # --strip the h2 text, split off the last word with rsplit(), save the hemisphere title
    hemisphere_title = hemisphere_page.find("div", class_="cover").find('h2').text.strip().rsplit(' ', 1)[0]

    # Create the hemisphere dictionary (title: ... , img_url: ...), append to dictionary list
    hemisphere_dict = {'title': hemisphere_title, 'img_url': img_url}
    hemisphere_urls_titles_list.append(hemisphere_dict)

hemisphere_urls_titles_list

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [7]:
# Scraping is finished: quit the splinter browser opened in the setup cell so
# the Chrome window and its driver process are not left running
browser.quit()