# Web Scraping Homework - Mission to Mars

=================================================================================


In [1]:
# Import dependencies

import shutil

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

In [2]:
# Locate chromedriver without editing the notebook per operating system:
# use the chromedriver found on the PATH when there is one, and otherwise fall back to a
# "chromedriver.exe" in the working directory (the original Windows setup).

executable_path = {'executable_path': shutil.which('chromedriver') or 'chromedriver.exe'}

# Start a visible (non-headless) Chrome session controlled by splinter;
# every scraping cell below drives this one browser object.

browser = Browser('chrome', **executable_path, headless=False)


## NASA Mars News

Scrape the NASA Mars News Site (https://mars.nasa.gov/news) and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.


In [3]:
# Address of the NASA Mars News page; the next cell opens this URL in the browser

nasa_url = "https://mars.nasa.gov/news"

In [4]:
# Open the news page with splinter and parse the rendered HTML with Beautiful Soup

browser.visit(nasa_url)

# Wait (at most 5 seconds) until a headline container exists before taking the HTML snapshot.
# The following cells look up "div.content_title" in this snapshot and would get None
# if the page were captured before that element is present.
# NOTE(review): assumes the news list can appear after the initial page load - confirm.

browser.is_element_present_by_css("div.content_title", wait_time=5)

nasa_html = browser.html
nasa_soup = BeautifulSoup(nasa_html, 'html.parser')

In [5]:
# Extract the headline of the latest article.
# On the page each headline sits in a "div" with class "content_title";
# find() returns the first such div, and the headline text is inside its "a" tag.

latest_title_div = nasa_soup.find("div", class_="content_title")
news_title = latest_title_div.find("a").get_text()

# Show the headline to confirm the expected text was scraped

news_title

"NASA's Mars 2020 Will Hunt for Microscopic Fossils"

In [6]:
# Extract the teaser paragraph that goes with the headline.
# It is the text of the first "div" with class "article_teaser_body".

teaser_div = nasa_soup.find("div", class_="article_teaser_body")
news_p = teaser_div.get_text()

# Show the paragraph to confirm the expected text was scraped

news_p

"A new paper identifies a ring of minerals at the rover's landing site that are ideal for fossilizing microbial life."

## JPL Mars Space Images - Featured Image

- Use splinter to navigate the JPL Featured Space Image site (https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars) and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.
- Make sure to find the image url to the full size .jpg image.
- Make sure to save a complete url string for this image.


In [26]:
# JPL space-images search page, filtered to the Mars category

mars_images_url_page = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

# Open the page with splinter, then hand the rendered HTML to Beautiful Soup

browser.visit(mars_images_url_page)
image_html = browser.html
image_soup = BeautifulSoup(image_html, features="html.parser")

In [33]:
# Locate the featured image's "FULL IMAGE" button.
# It is the "a" tag whose class attribute is "button fancybox"; its data-* attributes
# hold the image details used in the next cells.

image_info = image_soup.find("a", attrs={"class": "button fancybox"})

# Show the tag so its attributes can be inspected

image_info

<a class="button fancybox" data-description="This image shows the Large Magellanic Cloud galaxy in infrared light as seen by ESA's Herschel Space Observatory and NASA's Spitzer Space Telescope. The brightest center-left region is called 30 Doradus, or the Tarantula Nebula." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA15254_ip.jpg" data-link="/spaceimages/details.php?id=PIA15254" data-title="Dusty Space Cloud" id="full_image">
					FULL IMAGE
				  </a>

In [39]:
# Read the image path from the button.
# The "data-fancybox-href" attribute holds the site-relative path of the medium-size image.

image_med_size = image_info["data-fancybox-href"]

# Show the relative path

print(image_med_size)

/spaceimages/images/mediumsize/PIA15254_ip.jpg


In [41]:
# Turn the medium-size path into the full-size one in two steps:
#   1. swap the "mediumsize" folder for "largesize"
#   2. swap the "_ip" file suffix for "_hires"

large_folder_path = image_med_size.replace("mediumsize", "largesize")
image_large_size = large_folder_path.replace("_ip", "_hires")

# Prefix the site address to get an absolute URL

jpl_base_url = "https://www.jpl.nasa.gov"
featured_image_url = jpl_base_url + image_large_size

# Show the final URL so it can be checked

print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA15254_hires.jpg


## Mars Weather

Visit the Mars Weather Twitter account (https://twitter.com/marswxreport) and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called mars_weather.

In [42]:
# Mars weather account on Twitter (English-language page)

twitter_url = "https://twitter.com/marswxreport?lang=en"

# Open the page with splinter, then hand the rendered HTML to Beautiful Soup

browser.visit(twitter_url)
twitter_html = browser.html
twitter_soup = BeautifulSoup(twitter_html, features="html.parser")

In [89]:
# Grab the most recent tweet from the Mars weather account.
# The tweet body is the first "p" tag with class
# "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text".

twitter_info = twitter_soup.find("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text

# The raw tweet text ends with the attached picture's link text ("pic.twitter.com/..."),
# stuck directly onto the last word of the report. Keep only what comes before it and
# store the clean report as mars_weather, the variable name the assignment asks for.

mars_weather = twitter_info.partition("pic.twitter.com")[0].strip()

# Print the raw tweet text to check what was scraped

print(twitter_info)

InSight sol 340 (2019-11-10) low -101.0ºC (-149.8ºF) high -24.4ºC (-12.0ºF)
winds from the SSE at 5.3 m/s (11.9 mph) gusting to 20.4 m/s (45.6 mph)
pressure at 6.90 hPapic.twitter.com/5m82oQ9M6z


## Mars Facts

- Visit the Mars Facts webpage (https://space-facts.com/mars) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
- Use Pandas to convert the data to a HTML table string.

In [64]:
# Mars facts page on space-facts.com

mars_facts_url = "https://space-facts.com/mars/"

# Let pandas fetch the page and parse its HTML tables;
# read_html returns a list with one DataFrame per table it finds

mars_data = pd.read_html(mars_facts_url)

# Preview the first tables in the list

mars_data[:3]

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:    -153 to 20 °C      -88 to 58°C,
           

In [65]:
# Keep only the first table from the list returned by read_html.
# Note: this is the same DataFrame object as mars_data[0], not a copy.

mars_data_df = mars_data[0]

# Display the data to verify correct data was collected

mars_data_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [66]:
# Replace the default numeric column labels (0 and 1) with descriptive names

mars_data_df.columns=["Description", "Value"]

# Check to make sure columns were changed

mars_data_df

Unnamed: 0,Description,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [67]:
# Use the "Description" column as the index so it labels each row of the facts table.
# set_index returns a new frame here instead of modifying in place: mars_data_df starts out as
# the very same object as mars_data[0], so an in-place change would also alter the scraped list
# and break a re-run of the cells above. The column check makes this cell safe to run twice.

if "Description" in mars_data_df.columns:
    mars_data_df = mars_data_df.set_index("Description")

# Make sure data frame is correct

mars_data_df

Unnamed: 0_level_0,Value
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [68]:
# Render the facts data frame as an HTML table string

mars_table = mars_data_df.to_html()

# Show the generated HTML string
mars_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Value</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [None]:
# Remove the newline characters so the table HTML becomes a single line
mars_table = mars_table.replace("\n", "")

## Mars Hemispheres

- Visit the USGS Astrogeology site (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high resolution images for each of Mars's hemispheres.
- You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
- Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.
- Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [71]:
# USGS Astrogeology search results for the enhanced Mars hemisphere images; open it in the browser

mars_hemi_images_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(mars_hemi_images_url)

# Hand the rendered HTML to Beautiful Soup

mars_hemi_html = browser.html
mars_hemi_soup = BeautifulSoup(mars_hemi_html, features="html.parser")

# Each search result is a "div" with class "item"; collect all of them

mars_hemis = mars_hemi_soup.find_all("div", class_="item")

# Show the collected result blocks

mars_hemis

[<div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/dfaf3849e74bf973b59eb50dab52b583_cerberus_enhanced.tif_thumb.png"/></a><div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div> <!-- end description --></div>,
 <div class="item"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="/cache/images/7677c0a006b83871b5a2f66985ab5857_schiapa

In [75]:
# Site address that the relative links found on the results page are appended to

main_url = "https://astrogeology.usgs.gov"

# Try the extraction on the first result: the hemisphere name is the text of its "h3" tag

first_hemisphere = mars_hemis[0]
hemi = first_hemisphere.find("h3").get_text()

# Show the name to confirm the expected text was scraped

hemi

'Cerberus Hemisphere Enhanced'

In [77]:
# Relative link to the first hemisphere's detail page:
# the href of the first "a" tag inside the result's "description" div

description_div = mars_hemis[0].find("div", class_="description")
partial_url = description_div.find("a")["href"]

# Show the relative link

partial_url

'/search/map/Mars/Viking/cerberus_enhanced'

In [78]:
# Create the URL by combining the base URL with the partial URL found above

hemi_url = main_url + partial_url

# Check to make sure URL is correct

hemi_url

'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'

In [79]:
# Open the hemisphere's detail page and parse its HTML.
# Note: image_html and image_soup are reused here and no longer hold the JPL page.

browser.visit(hemi_url)
image_html = browser.html
image_soup = BeautifulSoup(image_html, features="html.parser")

# The hemisphere picture is the "img" tag with class "wide-image"; its src is a relative path

wide_image_tag = image_soup.find("img", class_="wide-image")
partial_image_url = wide_image_tag["src"]

# Show the relative image path

partial_image_url

'/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'

In [80]:
# Build the absolute image URL by appending the relative image path to the site address

image_url = main_url + partial_image_url

# Check to see if URL is correct

image_url

'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'

In [83]:
# Repeat the single-hemisphere steps from the cells above for every search result.
# The outcome is a list holding one dictionary (name + image URL) per hemisphere.

hemisphere_image_urls = []

# Site address that the relative links are appended to.
# It is the same for every hemisphere, so it is set once before the loop.

main_url = "https://astrogeology.usgs.gov"

for hemisphere in mars_hemis:

    # Hemisphere name and the relative link to its detail page

    hemi = hemisphere.find("h3").text
    partial_url = hemisphere.find("a", class_="itemLink product-item")["href"]

    # Open the detail page and parse its HTML

    hemi_url = main_url + partial_url
    browser.visit(hemi_url)
    image_html = browser.html
    image_soup = BeautifulSoup(image_html, "html.parser")

    # The hemisphere picture is the "img" tag with class "wide-image".
    # Stop with a clear message if the page does not contain it,
    # rather than failing with a TypeError from indexing None.

    wide_image_tag = image_soup.find("img", class_="wide-image")
    if wide_image_tag is None:
        raise ValueError("No 'wide-image' img tag found at " + hemi_url)
    partial_image_url = wide_image_tag["src"]

    # Store the name and the absolute image URL, then add the entry to the list.
    # NOTE(review): the assignment text asks for the keys "title" and "img_url", but this
    # notebook uses "Title" - confirm what downstream code expects before renaming the key.

    mars_hemi_dict = {"Title": hemi, "img_url": main_url + partial_image_url}
    hemisphere_image_urls.append(mars_hemi_dict)

# Display the list

hemisphere_image_urls

[{'Title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'Title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'Title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'Title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]