# Step 1 - Scraping

## Declare dependencies

In [1]:
# Dependencies
import time
from urllib.parse import urljoin, urlparse

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser

## Initialize Browser Function

In [2]:
def init_browser(executable_path="/usr/local/bin/chromedriver", headless=False):
    """Create a splinter-controlled Chrome browser.

    Parameters
    ----------
    executable_path : str, optional
        Location of the chromedriver binary. The default matches the path
        this notebook was written against; pass your own path if your
        chromedriver lives elsewhere.
    headless : bool, optional
        Forwarded to ``Browser`` as its ``headless`` keyword. Defaults to
        ``False``, the value this notebook has always used.

    Returns
    -------
    The object returned by ``Browser("chrome", ...)``. The caller is
    responsible for calling ``quit()`` on it when finished.
    """
    return Browser("chrome", executable_path=executable_path, headless=headless)

## Run the Initialize Browser Function

In [3]:
# Intentionally disabled: the scraping sections below open (and quit) their own browser sessions.
# browser = init_browser()

## NASA Mars News

In [4]:
# Scrape the latest headline and teaser paragraph from the NASA Mars News page
url = "https://mars.nasa.gov/news/"

browser = init_browser()
try:
    browser.visit(url)

    # Fixed pause before reading the page source (presumably to let the content load)
    time.sleep(1)

    html = browser.html
finally:
    # Always release the browser, even if the visit above fails
    browser.quit()

# Parse the captured markup
soup = bs(html, "html.parser")

# The first matching elements are taken as the latest article's title and teaser
title_tag = soup.find('div', class_="content_title")
teaser_tag = soup.find('div', class_="article_teaser_body")
if title_tag is None or teaser_tag is None:
    # Fail with a clear message instead of an AttributeError on None
    raise ValueError("Could not locate the news title/teaser on " + url)

news_title = title_tag.text
news_p = teaser_tag.text

print(news_title + "\n" + news_p)

Mars InSight's Mole Has Partially Backed Out of Its Hole
After making progress over the past several weeks digging into the surface of Mars, InSight's mole has backed about halfway out of its hole this past weekend.


## JPL Mars Space Images - Featured Image

In [5]:
# Scrape the full-size featured image URL from the JPL Mars space-images gallery
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

browser = init_browser()
try:
    browser.visit(url)
    time.sleep(1)

    # Open one of the gallery entries (the first link whose text contains "Curiosity")
    # NOTE(review): click_link_by_partial_text is reported as deprecated in newer
    # splinter releases - confirm against the installed version before upgrading.
    browser.click_link_by_partial_text('Curiosity')
    time.sleep(5)

    # Follow the "more info" link to reach the image's detail page
    browser.click_link_by_partial_text('more info')
    time.sleep(1)

    # Reload the detail page by its own URL, then capture its markup
    url = browser.url
    browser.visit(url)
    time.sleep(1)
    html = browser.html
finally:
    # Always release the browser, even if a click or visit above fails
    browser.quit()

# Parse the detail page
soup = bs(html, "html.parser")

main_image = soup.find('img', class_="main_image")
if main_image is None:
    # Fail with a clear message instead of a TypeError on None
    raise ValueError("Could not locate the main image on " + url)

# urljoin resolves the src against the detail-page URL, so root-relative,
# page-relative and already-absolute src values all yield a valid URL
featured_image_url = urljoin(url, main_image["src"])

print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA23378_hires.jpg


## Mars Weather

In [6]:
# Scrape the text of the first tweet found on the Mars weather Twitter page
url = "https://twitter.com/marswxreport?lang=en"

browser = init_browser()
try:
    browser.visit(url)
    time.sleep(2)
    html = browser.html
finally:
    # Always release the browser, even if the visit above fails
    browser.quit()

# Parse the captured markup
soup = bs(html, "html.parser")

# Look the tweet container up once and reuse it
tweet_container = soup.find("div", class_="js-tweet-text-container")
if tweet_container is None:
    # Fail with a clear message instead of an AttributeError on None
    raise ValueError("Could not locate a tweet on " + url)

# Full text of the container, including any trailing link text
mars_weather_p_a = tweet_container.text

# Text of the container's second descendant tag, which is stripped from the
# tweet below (in the recorded run this removed the trailing link text).
# If there is no such descendant, nothing is stripped.
descendants = tweet_container.findChildren()
mars_weather_a = descendants[1].text if len(descendants) > 1 else ""

# Remove that trailing part from the full text
mars_weather = mars_weather_p_a.replace(mars_weather_a, '')

print(mars_weather)


InSight sol 330 (2019-10-31) low -101.8ºC (-151.3ºF) high -24.8ºC (-12.6ºF)
winds from the SSE at 5.4 m/s (12.2 mph) gusting to 20.8 m/s (46.5 mph)
pressure at 7.00 hPa



## Mars Facts

In [7]:
# Scrape the Mars facts table from https://space-facts.com/mars/
url = "https://space-facts.com/mars/"

# pandas downloads and parses the page itself, so no browser session is opened here
tables_df = pd.read_html(url)

# The facts table is the second table returned (index 1); label its two
# columns and use the descriptions as the index
mars_facts_df = (
    tables_df[1]
    .rename(columns={0: "Description", 1: "Value"})
    .set_index("Description")
)

# Convert the dataframe to an HTML table string
mars_facts_html = mars_facts_df.to_html()

print (mars_facts_html)
print (mars_facts_df)


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Value</th>
    </tr>
    <tr>
      <th>Description</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>
                            

## Mars Hemispheres

In [8]:
# Scrape the title and full-resolution image link of each Mars hemisphere
# from the USGS Astrogeology search results
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

# List of {"title": ..., "img_url": ...} dictionaries, one per hemisphere
hemisphere_image_urls = []

browser = init_browser()
try:
    browser.visit(url)
    time.sleep(1)

    # Parse the search-results page
    results_soup = bs(browser.html, "html.parser")
    results = results_soup.find("div", class_="collapsible results")
    if results is None:
        # Fail with a clear message instead of a TypeError when iterating None
        raise ValueError("Could not locate the hemisphere results on " + url)

    # Walk only the direct child tags; non-tag children (e.g. text nodes)
    # can never be result entries
    for result in results.find_all(True, recursive=False):
        description = result.find("div")
        heading = description.h3 if description is not None else None
        page_link = result.a

        # Skip children that do not look like a result entry
        if heading is None or page_link is None or not page_link.get("href"):
            continue

        title = heading.text

        # Resolve the (possibly relative) link to the hemisphere's detail page
        link_parent = urljoin(url, page_link["href"])

        # Visit the detail page and parse it
        browser.visit(link_parent)
        time.sleep(1)
        detail_soup = bs(browser.html, "html.parser")

        # The anchor labelled "Original" points at the full-resolution .tif;
        # the anchor labelled "Sample" would give the .jpg instead
        original_link = detail_soup.find("a", string="Original")
        if original_link is None or not original_link.get("href"):
            print("Skipping " + link_parent + ": no 'Original' download link found")
            continue

        # Record the entry only when a title is available
        if title:
            hemisphere_image_urls.append(
                {"title": title, "img_url": original_link["href"]}
            )
finally:
    # Always release the browser, even if a visit or lookup above fails
    browser.quit()

print (hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]


## Final Result Dictionary

In [9]:
# Collect every scraped value from the sections above into one result mapping
scraping_results = dict(
    news_title=news_title,
    news_p=news_p,
    featured_image_url=featured_image_url,
    mars_weather=mars_weather,
    mars_facts_html=mars_facts_html,
    hemisphere_image_urls=hemisphere_image_urls,
)
scraping_results

{'news_title': "Mars InSight's Mole Has Partially Backed Out of Its Hole",
 'news_p': "After making progress over the past several weeks digging into the surface of Mars, InSight's mole has backed about halfway out of its hole this past weekend.",
 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA23378_hires.jpg',
 'mars_weather': '\nInSight sol 330 (2019-10-31) low -101.8ºC (-151.3ºF) high -24.8ºC (-12.6ºF)\nwinds from the SSE at 5.4 m/s (12.2 mph) gusting to 20.8 m/s (46.5 mph)\npressure at 7.00 hPa\n',
 'mars_facts_html': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Value</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10