In [1]:
# import all the relevant libraries you may use
import pandas as pd
import os
import requests
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
import datetime as dt
from webdriver_manager.chrome import ChromeDriverManager

# Scraping Mars Data

## NASA Mars News

In [2]:
# We will be scraping information from the NASA website regarding Mars
# Define the the URL
url = 'https://mars.nasa.gov/news'

# Retrive the page with the requests module
response = requests.get(url)
# Convert the response to text to obtain the html
html = response.text

In [3]:
# Establish chrome driver executable path. Make sure to define actual location on your drive.
executable_path ={'executable_path': 'C:/Users/NAVID/chromedriver.exe'}
# Open a splinter browser
browser = Browser('chrome', **executable_path, headless=False)

In [4]:
# Visit the defined URL on your splinter broswers
browser.visit(url)

In [5]:
# Create a BeautifulSoup object with the splinter broswer.html object and parse the html with 'html.parser' or 'lxml'
soup = bs(browser.html, 'html.parser')

In [6]:
# Scrape the first instance of latest news title text and assign to a variable
# Find the first article
first_news_article = soup.find('li', class_="slide")

# Find the title within that article summary and convert into .text or .get_text() and then .strip() of '/n'
news_title = first_news_article.find('div', class_='content_title').text.strip()

In [7]:
news_title

'NASA to Provide Update on InSight Mars Lander'

In [8]:
# Save the article link url
article_link_string = first_news_article.find('a')['href']
article_url = url + article_link_string
article_url

'https://mars.nasa.gov/news/news/9188/nasa-to-provide-update-on-insight-mars-lander/'

In [9]:
# Scrape the first instance of latest paragraph text and assign to a variable
# Find the paragraph within that article summary and convert into .text and then .strip() of '/n'
news_p = first_news_article.find('div', class_="article_teaser_body").text.strip()

In [10]:
news_p

'NASA and InSight leaders will share the latest on the pioneering spacecraft’s science findings and discuss future milestones for the mission.'

## JPL Mars Space Images - Featured Image

In [11]:
# Open a splinter browser to scrape the desired images
# Define the URL path
url_2 = 'https://spaceimages-mars.com'

# Using the already established splinter engine, open the url in broswer
# Visit the defined URL on your splinter broswers
browser.visit(url_2)

# delay action until browser loads
time.sleep(10)

# click on the sprinter browser link 'FULL IMAGE' to see the image we want to store
browser.links.find_by_partial_text('FULL IMAGE')

<splinter.element_list.ElementList at 0x1901aa3f340>

In [12]:
# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [15]:
# Soupify the browser html
html = browser.html
img_soup = bs(html, 'html.parser')


In [16]:
# Locate the 'div' and class attribute where the image is found and 
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars2.jpg'

In [17]:
# Use the base url to create an absolute url
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

## Mars Facts

In [None]:
# We will use Pandas to scrape the table information from the space-fact.com website on Mars
# Define the url
url = 'https://space-facts.com/mars/'

In [None]:
# Using pd.read_html() will pull a list dataframes of all the tables
tables = pd.read_html(url)
tables


In [None]:
# We want to slice off the 1st table from the list
mars_facts_tbl = tables[0]
mars_facts_tbl

In [None]:
mars_facts_tbl.columns = ['Attribute','Values']
mars_facts_tbl

# Set the index to the Atrributes column
mars_facts_tbl.set_index('Attribute', inplace=True)
mars_facts_tbl

In [None]:
html_mars_tbl = mars_facts_tbl.to_html()
html_mars_tbl

## Mars Hemispheres

In [None]:
# Visit URL
url_3 = 'https://marshemispheres.com/'

In [None]:
# browse the url
# Visit the defined URL on your splinter broswers
browser.visit(url_3)

In [None]:
# Soupify 
html = browser.html
soup = bs(html, 'html.parser')

In [None]:
# Create an empty list to store dictonary values for the keys of 'image_url' and 'title'
hemisphere_image_list = []

In [None]:
# Use .find_all() to slice out the html we will loop through to visit different webpages and scrape the data
image_links = soup.find_all('div', class_='item')
image_links
# browser.find_all()

In [None]:
for item in image_links:
    
    # Find the url link string from the 'a' tag and call the 'href' string
    link = item.find('a')['href']
    
    # Combine the root url from above and the link url
    url_4 = url_3 + link
    
    # Open the splinter browser using the url_4 link we just created
    browser.visit(url_4)

    # Let the browser load for 1 seconds before scraping data
    time.sleep(1)
    
    # Soupify the page
    soup = bs(browser.html, 'html.parser')
    
    # Find the link to the image in the 'ul' tag, then the 'a' tag, and then call the 2nd item 'href'
    # Store link string in variable 'image_link_hemi'
    image_link_hemi = soup.find('ul').find_all('a')[0]['href']
    
    # Find the title name using the 'h2' tag and class attribute 'title', and then pull the .text
    # Store title in variable 'title_text'
    title_text = soup.find('h2', class_='title').text
    
    # Append the hemisphere list with a dictionary of the keys and values
    hemisphere_image_list.append({
        'title': title_text, 
        'img_url': image_link_hemi
    })
    
    # Print out success message
    print(f'Scrape of {title_text} COMPLETE')
    time.sleep(1)

In [None]:
# Check to see if data is all there
hemisphere_image_list

In [None]:
# Create a dictionary of all the web scraped data
dict_mars_scrape = {
    'news_title': news_title,
    'news_p': news_p,
    'article_url': article_url,
    'featured_image_url': img_url,
    'html_mars_tbl': html_mars_tbl,
    'hemisphere_image_list': hemisphere_image_list,
    # Add the time of the scrape to the dictionary
    'scrape_time': dt.datetime.now()
}

In [None]:
dict_mars_scrape