In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

In [2]:
# start up the browser
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [3]:
# dictionary to hold all the things we need to scrape for
mars_data ={}

## Scrape NASA for latest Mars article

In [4]:
# scrape NASA mars news site to get latest news
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
browser.visit(url)
sleep(2)

In [5]:
# parse with beautiful soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
# print(soup.prettify())

In [6]:
# get the latest article (first on the page)
article = soup.find('li', class_='slide')

# pull news title and news teaster paragraph from article
news_title = article.find('div', class_='content_title').a.text
news_p = article.find('div', class_='article_teaser_body').text

#add to dictionary
mars_data['news_title'] = news_title
mars_data['news_paragraph'] = news_title

## Scrape JPL to get latest Mars image

In [7]:
# scrape JPL site to get latest Mars image
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
url_root = 'https://www.jpl.nasa.gov'
sleep(2)

In [8]:
# parse with beautiful soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
# print(soup.prettify())

In [9]:
# have splinter cick the Full Image button
full_image_button = browser.find_by_id("full_image")
full_image_button.click()
sleep(2)

In [10]:
# have splinter find the More Info button and click it
more_info_element = browser.find_link_by_partial_text("more info")
more_info_element.click()
sleep(2)

In [11]:
# parse with beautiful soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [12]:
url_img = soup.find('figure', class_='lede').a['href']
feature_img_url = f'{url_root}{url_img}'

# add feature image url to dictionary
mars_data['feature_img_url'] = feature_img_url

## Get Mars Weather from tweet

In [13]:
# scrape twitter site to get latest Mars weather
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
sleep(2)

In [14]:
# parse with beautiful soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
# print(soup.prettify())

In [15]:
tweet = soup.find('p', class_='tweet-text')
# tweet

In [16]:
# tweet has p tage with weather info and anchor tag inside text of p tag.
# let's use the the strings met
mars_weather = list(tweet.strings)[0]

# add to dictionary
mars_data['mars_weather'] = mars_weather

## Get Mars Facts

In [17]:
# scrape space facts site to get Mars facts
url = 'https://space-facts.com/mars/'
browser.visit(url)
sleep(2)

In [18]:
# use the pandas read html method to scrape page for all tables
tables = pd.read_html(url)
# tables

In [19]:
# the second table has the info we want. let's stick it into a dataframe and format all pretty
columns = ['Description', 'Value']
tables[1].columns = columns
mars_facts_html = tables[1].to_html(index=False)
mars_facts_html

# add to dictionary
mars_data['mars_facts_html'] = mars_facts_html

## Get Mars Hemisphere Images

In [20]:
# let's go to page with hemispheres
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [21]:
# create list to hold on to stuff
hemisphere_image_urls = []

In [22]:
# get list of thumbnails to loop through
img_thumbnails = browser.find_by_css('img[class="thumb"]')

index = 0
# loop through thumbnails using index so the links don't get stale when flipping back
while index < len(img_thumbnails):
    img_thumbnail = browser.find_by_css('img[class="thumb"]')[index]
    
    # click link
    img_thumbnail.click()
    
    # let the dang page load
    sleep(1)
    
    # parse with beautiful soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    
    # get the image url & title
    img_url = soup.find('a', text='Sample')['href']
    title = soup.find('h2').text
    
    # create dictionary and add to list
    hemi_dict = {}
    hemi_dict['title'] = title.replace('Enhanced', '')
    hemi_dict['img_url'] = img_url
    hemisphere_image_urls.append(hemi_dict)
    
    # increment index
    index +=1
    
    # have browser go back a page to get back to list
    browser.back()
    
    # wait a second to load the page
    sleep(1)
    
    

In [23]:
# add to dictionary
mars_data['hemisphere_image_urls'] = hemisphere_image_urls


## Read dictionary of all scrapes!

In [24]:
#close browser
browser.quit()

# look at the scrapes!
mars_data

{'news_title': "HiRISE Views NASA's InSight and Curiosity on Mars",
 'news_paragraph': "HiRISE Views NASA's InSight and Curiosity on Mars",
 'feature_img_url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA19048_hires.jpg',
 'mars_weather': 'InSight sol 314 (2019-10-15) low -102.8ºC (-153.0ºF) high -24.8ºC (-12.6ºF)\nwinds from the SSE at 4.9 m/s (11.1 mph) gusting to 19.7 m/s (44.0 mph)\npressure at 7.20 hPa',
 'mars_facts_html': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>Description</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      