In [1]:
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Set up Splinter (prepping our automated browser).
# ChromeDriverManager().install() supplies the chromedriver location that is
# handed to Splinter as `executable_path` (the cell output shows the driver
# being resolved from the local cache).
executable_path = {'executable_path': ChromeDriverManager().install()}
# headless=False is passed explicitly — presumably so the Chrome window stays
# visible while the scraping steps are developed; confirm before automating.
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [C:\Users\cdpet\.wdm\drivers\chromedriver\win32\98.0.4758.102\chromedriver.exe] found in cache


### Article Title and Summary

In [3]:
# Visit the Mars news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page: check for a `div.list_text` element,
# passing wait_time=2. The boolean result is displayed as the cell output.
browser.is_element_present_by_css('div.list_text', wait_time=2)

True

In [4]:
# Parse the browser's current HTML with BeautifulSoup's 'html.parser'
html = browser.html
news_soup = soup(html, 'html.parser')
# select_one keeps a single element matching the CSS selector; the following
# cells search inside it for the article title and teaser text.
slide_elem = news_soup.select_one('div.list_text')
# Display the element's type as a quick sanity check on the selection
type(slide_elem)

bs4.element.Tag

In [5]:
# Within the selected container, find the `div` with class `content_title`
# and save its text as `news_title`
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

'With Mars Methane Mystery Unsolved, Curiosity Serves Scientists a New One: Oxygen'

In [6]:
# Locate the teaser `div` inside the same container, then pull out its text
teaser_div = slide_elem.find('div', class_='article_teaser_body')
news_p = teaser_div.get_text()
news_p

'For the first time in the history of space exploration, scientists have measured the seasonal changes in the gases that fill the air directly above the surface of Gale Crater on Mars. '

### Featured Images

In [7]:
# Visit the space images site (this rebinds `url`, which previously held the
# news site address)
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [8]:
# Find and click the full image button. Since a ctrl+F search in the devtools
# reveals there are only 3 button elements, we will select off the index 1
# button since the full image button is the second button element on the page.
# NOTE(review): this positional lookup depends on the page's button order and
# will select a different element if the markup changes.
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [9]:
# Parse the resulting html with soup, i.e. the page as it stands after the
# full image button was clicked
html = browser.html
img_soup = soup(html, 'html.parser')

In [10]:
# Find the image's `src` in the expanded (fancybox) view. The cell output
# shows it to be a path relative to the site root.
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')

# Resolve it against the site's base URL. `urljoin` also copes with a
# root-relative (leading slash) or already-absolute `src`, where plain string
# concatenation would produce a malformed address.
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars3.jpg'

### Table of Mars Information

In [11]:
# Read the first table on the facts page and label its three columns.
mars_facts_raw = pd.read_html('https://galaxyfacts-mars.com')[0]
mars_facts_raw.columns = ['Description', 'Mars', 'Earth']
# Discard the row labelled 0 and key the table by description; both calls
# return a new frame, so nothing is mutated in place.
df = mars_facts_raw.drop(index=0).set_index('Description')
df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [12]:
# Serialize the facts table to an HTML string (shown as the cell output)
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>\n</table>'

## D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

### Hemispheres

In [13]:
# 1. Use browser to visit the URL
# `url` is reused further down as the base for the relative links scraped
# from this page.
url = 'https://marshemispheres.com/'

browser.visit(url)

In [14]:
# 2. Create a list to hold the images and titles.
# Each entry is a dict with the keys 'img_url' and 'title'.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.
html = browser.html
hemisphere_soup = soup(html, 'html.parser')
# Retrieve all the containers that have the anchor tags for the hemisphere
# images. They come from the already-parsed index page, so navigating the
# browser inside the loop does not invalidate them.
containers = hemisphere_soup.find_all('div', class_='item')

for container in containers:
    # Find the title in the container's h3 tag.
    title = container.find('h3').get_text()
    # The first anchor's href points at the hemisphere's detail page. Resolve
    # it against the index URL; urljoin handles relative, root-relative and
    # absolute hrefs and does not depend on `url` ending in a slash.
    next_url = urljoin(url, container.find('a').get('href'))

    browser.visit(next_url)
    # Parse the detail page without rebinding `html`, which keeps referring to
    # the index page.
    next_hemisphere_soup = soup(browser.html, 'html.parser')
    # Find the container that has the full resolution image link.
    next_container = next_hemisphere_soup.find('div', class_='wide-image-wrapper')
    # The first anchor inside it carries the image's (relative) href.
    hemi_img_rel_url = next_container.find('a').get('href')
    # Store the absolute image url together with the corresponding title.
    hemisphere_image_urls.append({'img_url': urljoin(url, hemi_img_rel_url),
                                  'title': title})

In [15]:
# 4. Display the list that holds the dictionary of each image url and title
# (as the cell's last expression, its value is shown as the cell output).
hemisphere_image_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [16]:
# 5. Quit the browser to end the automated session
browser.quit()