# Scrape the NASA Mars News site and collect the latest news title and paragraph text. Assign the text to variables to reference later.

In [1]:
# Dependencies
import shutil
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser

# Print the location of the chromedriver binary found on PATH. A chromedriver
# path is what gets handed to splinter's Chrome driver when the browser is
# created in the next cell. Driver setup notes:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

/usr/local/bin/chromedriver


In [2]:
# Locate the chromedriver binary instead of relying on one machine's layout:
# prefer whatever is on PATH (the same lookup `which chromedriver` performs) and
# fall back to the original hard-coded location when nothing is found there.
chromedriver_path = shutil.which('chromedriver') or '/usr/local/bin/chromedriver'
executable_path = {'executable_path': chromedriver_path}

# headless=True is passed, so Chrome runs without opening a visible window.
# Every scraping cell below reuses this single `browser` session.
browser = Browser('chrome', **executable_path, headless=True)

## NASA Mars News

In [3]:
# Load the NASA Mars News listing page in the shared browser session.
news_url = "https://mars.nasa.gov/news/"
browser.visit(news_url)

# Wait (up to 10 s; the call returns as soon as the element exists) for the
# first article slide, and fail loudly if it never shows up instead of parsing
# a page that does not contain the article list.
if not browser.is_element_present_by_css("ul.item_list li.slide", wait_time=10):
    raise RuntimeError(f"No news article slide was found at {news_url}")

# Parse the page as currently rendered by the browser.
news_html = browser.html
soup = bs(news_html, 'lxml')

# Restrict the lookup to the first article slide so that a `content_title`
# div outside the article list cannot be picked up by mistake.
latest_article = soup.select_one("ul.item_list li.slide")
title_elem = latest_article.find('div', class_='content_title') if latest_article else None
teaser_elem = latest_article.find('div', class_='article_teaser_body') if latest_article else None
if title_elem is None or teaser_elem is None:
    raise RuntimeError(
        "The latest news article is missing its title or teaser paragraph; "
        "the page layout may have changed."
    )

# Strip surrounding whitespace so the stored values are clean for later use.
news_title = title_elem.get_text().strip()
news_p = teaser_elem.get_text().strip()

print(f'Title: {news_title}')
print(f'Paragraph: {news_p}')

Title: Mars 2020 Rover: T-Minus One Year and Counting 
Paragraph: The launch period for NASA's next rover, Mars 2020, opens exactly one year from today, July 17, 2020, and extends through Aug. 5, 2020.


## JPL Mars Space Images - Featured Image

In [4]:
# Load the JPL Mars space-images page in the shared browser session.
jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
jpl_base_url = "https://www.jpl.nasa.gov"
browser.visit(jpl_url)
time.sleep(1)

# Parse the page as currently rendered by the browser.
jpl_html = browser.html
soup = bs(jpl_html, 'lxml')

# --- Medium-size image: read the link stored on the carousel's footer anchor.
carousel_link = soup.select_one(
    'div.carousel_container article footer a[data-fancybox-href]'
)
if carousel_link is None:
    raise RuntimeError(
        "Could not find the featured-image link in the carousel; "
        "the page layout may have changed."
    )
image_link = carousel_link['data-fancybox-href']
featured_image_url_medium = f'{jpl_base_url}{image_link}'
print(f'Medium picture: {featured_image_url_medium}')

# --- Large image: click through "full image" -> "more info" to the detail page.
# Wait for the button to exist rather than sleeping for a fixed time.
if not browser.is_element_present_by_id("full_image", wait_time=10):
    raise RuntimeError("The 'full image' button never appeared on the JPL page.")
full_image_elem = browser.find_by_id("full_image")
full_image_elem.click()

# Keep the original short pause before looking for the "more info" link
# (presumably it gives the overlay opened by the click time to finish showing).
time.sleep(1)
more_info_elem = browser.find_link_by_partial_text('more info')
more_info_elem.click()

# Do not read the page until the detail view's main image is present;
# otherwise browser.html may still be the previous page.
if not browser.is_element_present_by_css('figure.lede a img', wait_time=10):
    raise RuntimeError("The image detail page did not load its main image.")

html = browser.html
img_soup = bs(html, 'lxml')

lede_img = img_soup.select_one('figure.lede a img')
img_url_rel = lede_img.get("src") if lede_img is not None else None
if not img_url_rel:
    # Without this guard a missing src would yield a URL ending in "None".
    raise RuntimeError("The main image on the detail page has no 'src' attribute.")

featured_image_url_large = f'{jpl_base_url}{img_url_rel}'

print(f'Large picture: {featured_image_url_large}')

Medium picture: https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA17652_ip.jpg
Large picture: https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17652_hires.jpg


## Mars Weather

In [5]:
# Load the Mars weather report Twitter timeline in the shared browser session.
weather_url = "https://twitter.com/marswxreport?lang=en"
browser.visit(weather_url)
time.sleep(1)

# Parse the page as currently rendered by the browser.
weather_html = browser.html
soup = bs(weather_html, 'lxml')

# Each tweet body lives in its own container div.
weather_all = soup.find_all('div', class_='js-tweet-text-container')

# Keep only the tweets that mention "InSight", in page order.
weather_list = []
for tweet_container in weather_all:
    tweet_elem = tweet_container.find('p', class_='js-tweet-text')
    if tweet_elem is None:
        # Container without a text paragraph: nothing to read, skip it.
        continue
    tweet_text = tweet_elem.text
    if "InSight" in tweet_text:
        weather_list.append(tweet_text)

# Fail with a clear message instead of an IndexError when nothing matched
# (for example because the page did not load or its markup changed).
if not weather_list:
    raise RuntimeError(f"No tweet mentioning 'InSight' was found at {weather_url}")

# The first match on the page is taken as the current weather report.
mars_weather = weather_list[0]
print(mars_weather)

InSight sol 222 (2019-07-12) low -99.7ºC (-147.5ºF) high -24.8ºC (-12.6ºF)
winds from the SSE at 4.2 m/s (9.4 mph) gusting to 15.6 m/s (34.8 mph)
pressure at 7.60 hPapic.twitter.com/8Q8lyB6SjM


## Mars Facts

In [6]:
# Page that holds the Mars facts tables.
facts_url = "https://space-facts.com/mars/"

# pd.read_html is given the URL and fetches and parses the page itself, so no
# separate browser visit or BeautifulSoup parse is needed for this section.
# Despite its name, `facts_str` is the list of DataFrames that read_html
# returns (one per table found on the page).
facts_str = pd.read_html(facts_url)

# This notebook uses the second table (index 1); make sure it exists so a page
# change produces a clear error rather than an IndexError.
if len(facts_str) < 2:
    raise RuntimeError(
        f"Expected at least 2 tables at {facts_url}, found {len(facts_str)}."
    )
mars_facts_df = facts_str[1]

# Render the table once (no index column, no header row), keep the markup in
# `facts_html`, and save the same markup to facts.html.
# https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.to_html.html
facts_html = mars_facts_df.to_html(index=False, header=False)
with open("facts.html", "w", encoding="utf-8") as facts_file:
    facts_file.write(facts_html)

## Mars Hemispheres

In [43]:
# Load the USGS search results that list the Mars hemisphere products.
hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(hemisphere_url)

# Read each result link's text and href up front, before the loop below
# navigates the browser away from this results page.
image_urls = [(a.text, a['href']) for a in browser.find_by_css('div[class="description"] a')]
if not image_urls:
    raise RuntimeError(f"No hemisphere links were found at {hemisphere_url}")

# One {'title', 'img_url'} record per hemisphere, in results-page order.
hemisphere_image_urls = []

for title, url in image_urls:
    # Open the hemisphere's detail page and wait for its wide image to exist
    # before reading the image address from it.
    browser.visit(url)
    if not browser.is_element_present_by_css('img[class="wide-image"]', wait_time=10):
        raise RuntimeError(f"No wide image was found on the detail page {url}")
    img_url = browser.find_by_css('img[class="wide-image"]')['src']
    hemisphere_image_urls.append({'title': title, 'img_url': img_url})

# Display the collected records as the cell output.
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]