# Mission to Mars
## Step 1 - Scraping

### Initializing splinter & Beautiful Soup

In [32]:
import json
import pprint
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

In [2]:
# Ask the shell where chromedriver is installed; the printed path is the one hardcoded in the next cell.
!which chromedriver

/usr/local/bin/chromedriver


In [3]:
# Keyword arguments for splinter: the chromedriver location reported by `which chromedriver`.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
# Launch a Chrome session driven through that chromedriver binary.
browser = Browser('chrome', **executable_path)

In [4]:
# Load a simple page first — presumably a smoke test that the driver works; confirm it is still needed.
browser.visit('http://google.com')


### Web Scraping

In [5]:
# Open the NASA Mars news listing.
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Check that a news slide is present, allowing up to wait_time seconds for it to appear.
# The boolean result is not assigned; it is only shown as the cell output.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [6]:
# Parse the browser's current page with BeautifulSoup.
# The browser is NOT closed here; later cells keep using it and it is quit at the end of the notebook.
html = browser.html
bs = BeautifulSoup(html, 'html.parser')

# First element matching the news-slide selector; the following cells read its title and teaser.
slide_elem = bs.select_one('ul.item_list li.slide')

In [7]:
# Display the slide's content_title div to inspect its markup.
slide_elem.find("div", class_='content_title')

<div class="content_title"><a href="/news/8359/meet-the-people-behind-nasas-insight-mars-lander/" target="_self">Meet the People Behind NASA's InSight Mars Lander</a></div>

In [8]:
# Headline: the text of the content_title div inside the first slide.
top_title = slide_elem.find("div", class_='content_title').get_text()
top_title

"Meet the People Behind NASA's InSight Mars Lander"

In [9]:
# Teaser: the text of the article_teaser_body div inside the same slide.
top_title_news = slide_elem.find('div', class_="article_teaser_body").get_text()
top_title_news

'A series of NASA videos highlight scientists and engineers leading the next mission to Mars.'

### JPL Space Images Featured Image
##### Visit the url for JPL's Featured Space Image here.

##### Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url.

##### Make sure to find the image url to the full size .jpg image.

##### Make sure to save a complete url string for this image

In [10]:
# Open JPL's space-images page filtered to the Mars category.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [12]:
# Locate the element with id "full_image" (the full-image button) and click it.
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [13]:
# Give the "more info" text up to one second to appear. The result of the presence check is
# discarded, so this line acts only as a short wait before the lookup below.
browser.is_element_present_by_text('more info', wait_time=1)
# NOTE(review): newer splinter releases replace find_link_by_partial_text with
# browser.links.find_by_partial_text — confirm against the installed version.
more_info_elem = browser.find_link_by_partial_text('more info')
more_info_elem.click()

In [14]:
# Parse whatever page the browser is showing after the "more info" click.
html = browser.html
img_soup = BeautifulSoup(html, 'html.parser')

In [15]:
# Read the src attribute of the <img> inside the lede figure's link.
# The displayed value is a site-relative path, which the next cell turns into a full URL.
img_url_rel = img_soup.select_one('figure.lede a img').get("src")
img_url_rel

'/spaceimages/images/largesize/PIA16105_hires.jpg'

In [16]:
# Prefix the JPL site origin to the relative path to get an absolute image URL.
img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16105_hires.jpg'

### Mars Weather

###### Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called mars_weather.

In [27]:
# URL of the Mars Weather twitter account
Mars_Twitter_URL = 'https://twitter.com/marswxreport?lang=en'
# Load the timeline in the splinter browser
browser.visit(Mars_Twitter_URL)

# Start from a known value: without this, a page with no matching tweet would make the
# print below raise NameError on a fresh kernel, or silently show a stale value from an earlier run.
mars_weather = None

# Weather reports are the tweets whose first word is "Sol"; keep the first one found.
for text in browser.find_by_css('.tweet-text'): # every element carrying the tweet-text class
    if text.text.partition(' ')[0] == 'Sol': # first space-delimited word must be exactly "Sol"
        mars_weather = text.text # store the full tweet text
        break
print(mars_weather) # prints None when no weather tweet was found


Sol 2108 (2018-07-12), Sunny, high -24C/-11F, low -65C/-84F, pressure at 8.06 hPa, daylight 05:19-17:27


### Mars Facts
##### Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

In [30]:
# Page holding the Mars facts table
Mars_Facts_URL = 'http://space-facts.com/mars/'
# Let pandas fetch the page and keep the first table whose id is "tablepress-mars"
mars_df = pd.read_html(Mars_Facts_URL, attrs={'id': 'tablepress-mars'})[0]
# Give the two columns readable names, then index the frame by the measurement name
mars_df.columns = ['Measurements', 'Values']
mars_df = mars_df.set_index('Measurements')
# Show the resulting frame as the cell output
mars_df

Unnamed: 0_level_0,Values
Measurements,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [33]:
# Render the facts DataFrame as an HTML <table> string with DataFrame.to_html()
mars_facts_HTML_table_string = mars_df.to_html()
pprint.pprint(mars_facts_HTML_table_string) # pretty-print the string for verification

('<table border="1" class="dataframe">\n'
 '  <thead>\n'
 '    <tr style="text-align: right;">\n'
 '      <th></th>\n'
 '      <th>Values</th>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Measurements</th>\n'
 '      <th></th>\n'
 '    </tr>\n'
 '  </thead>\n'
 '  <tbody>\n'
 '    <tr>\n'
 '      <th>Equatorial Diameter:</th>\n'
 '      <td>6,792 km</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Polar Diameter:</th>\n'
 '      <td>6,752 km</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Mass:</th>\n'
 '      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Moons:</th>\n'
 '      <td>2 (Phobos &amp; Deimos)</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Orbit Distance:</th>\n'
 '      <td>227,943,824 km (1.52 AU)</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Orbit Period:</th>\n'
 '      <td>687 days (1.9 years)</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>Surface Temperature:</th>\n'
 '      <td>-153 to 20 °C</td>\n'
 '    </tr>\n'
 '    <tr

### Mars Hemispheres
Visit the USGS Astrogeology site here to obtain high-resolution images for each of Mars's hemispheres.

You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.

Save both the image URL string for the full-resolution hemisphere image and the hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.

Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [34]:
# Search-results page listing the enhanced Mars hemisphere products
Mars_Astro_URL = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

# Open the page in the splinter browser.
# NOTE(review): the soup below is built from the separate requests response, not from browser.html,
# so this visit does not feed the parsing — confirm whether it is still wanted.
browser.visit(Mars_Astro_URL)
Astro_Response = requests.get(Mars_Astro_URL) # fetch the same URL with the requests module

# Parse the response body with the built-in 'html.parser'
Astro_Soup = BeautifulSoup(Astro_Response.text, 'html.parser') # soup used to locate the hemisphere links

# Collect every <a> tag with the classes "itemLink product-item"; the next cell treats each as one hemisphere
Hemispheres_List = Astro_Soup.find_all('a', class_="itemLink product-item") # list of matching <a> elements
# print (Hemispheres_List) # print the list just for verifications

In [35]:
# One {"Title": ..., "Image_URL": ...} dictionary is appended here per hemisphere.
hemisphere_image_urls = []

# Hemispheres_List (built in the previous cell) holds one <a> tag per hemisphere.
for image in Hemispheres_List:
    # The hemisphere name is the text of the <h3> inside the link.
    image_title = image.find('h3').text
    # Resolve the href against the site root. urljoin gives a well-formed URL whether or not
    # the href starts with "/", where plain concatenation would double the slash for a
    # root-relative href.
    image_link = urljoin("https://astrogeology.usgs.gov/", image['href'])

    # Fetch the hemisphere's detail page, which links to the full-resolution image.
    image_request = requests.get(image_link)
    # Fail with a clear HTTP error instead of an AttributeError while parsing an error page.
    image_request.raise_for_status()
    soup = BeautifulSoup(image_request.text, 'html.parser')
    # The download links sit in the div with class "downloads" (found by inspecting the page).
    image_tag = soup.find('div', class_='downloads')
    # The first <a> in that div carries the image URL in its href.
    image_url = image_tag.find('a')['href']
    # Record this hemisphere's title and image URL.
    hemisphere_image_urls.append({"Title": image_title, "Image_URL": image_url})

# Pretty-print the list of dictionaries
pprint.pprint(hemisphere_image_urls)

[{'Image_URL': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'Title': 'Cerberus Hemisphere Enhanced'},
 {'Image_URL': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'Title': 'Schiaparelli Hemisphere Enhanced'},
 {'Image_URL': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'Title': 'Syrtis Major Hemisphere Enhanced'},
 {'Image_URL': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'Title': 'Valles Marineris Hemisphere Enhanced'}]


#### Close browser

In [36]:
# End the automated browser session; no cell after this one uses `browser`.
browser.quit()