## Step 1 - Scraping

In [19]:
#Dependencies
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as bs
import requests
import pymongo
import time

In [20]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [21]:
# Define database and collection
db = client.mars_db
collection = db.mars_data

### Step 1 - NASA Mars News

In [22]:
# may be chromedriver.exe
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [23]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

browser.visit(url)

html = browser.html

# Create BeautifulSoup object; parse with 'html'
soup = bs(html, 'html.parser')

In [24]:
title_header = soup.find('div', class_='list_text')
news_title = title_header.find('div', class_="content_title").text
news_title

'My Culture, My Voice'

In [25]:
news_p = title_header.find('div', class_="article_teaser_body").text
news_p

'In honor of Hispanic Heritage Month, Christina Hernandez, an instrument engineer on the Mars 2020 mission, talks about her childhood and journey to NASA.'

### Step 1 - JRL Mars Space Images

In [26]:
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

browser.visit(url)

#Find full size image after click Full image button
browser.click_link_by_partial_text("FULL IMAGE")

#Move to more info
browser.click_link_by_partial_text("more info")

# img = img_soup.select_one("figure.lede a img")
html = browser.html

# Create BeautifulSoup object; parse with 'html'
soup = bs(html, 'html.parser')

#Find image
img_url_relative_path = soup.find('figure', class_='lede').a['href']
                        

#use base url to create absolute url
featured_img_url = f"https://www.jpl.nasa.gov{img_url_relative_path}"

#     return featured_img_url
featured_img_url



'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17171_hires.jpg'

### Step 1 - Mars Facts

In [27]:
#MARS FACTS Scrapes
url = 'https://space-facts.com/mars/'

In [28]:
#Returning a list of dataframes for any tabular data that Pandas found
tables = pd.read_html(url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:     -87 to -5 °C      -88 to 58°C,
           

In [29]:
type(tables)

list

In [30]:
#Put into a DF so we can slice what we want with normal indexing
mars_df = tables[2]
mars_df.head()

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"


In [31]:
# Convert the DF to HTML to generate HTML tables
html_table = mars_df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    <

In [32]:
#Ability to strip unwanted "newlines" to clean up the table
# html_table.replace

### Step 1 - Mars Hemispheres

In [33]:
# Define and retrieve the page
hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemisphere_url)
time.sleep(1)
html = browser.html
soup = bs(html, "html.parser")

In [34]:
# test = soup.find_all('div', class_='item')
# test

In [35]:
# Blank list to contain the dictionaries
hemisphere_image_urls = []
# Base image url
baseimg_url="https://astrogeology.usgs.gov/"
# Soup object
hemispheres = soup.find_all('div', class_='item')
# Loop to get each title & url
for hemi in hemispheres:
    title = hemi.find('h3').text
    link = 'https://astrogeology.usgs.gov/' + hemi.find('a')['href']
    browser.visit(link)
    img_html = browser.html
    img_soup = bs(img_html, "html.parser")
    imgs_url = img_soup.find("img", class_="wide-image")["src"]
    image_url = baseimg_url+imgs_url
    hemisphere_image_urls.append({"title": title, "img_url": image_url})
# Display final dict
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [37]:
mars_data = {
    "news_title": news_title,
    "news_p": news_p,
    "featured_image": featured_img_url,
    "mars_facts": html_table,
    "hemispheres": hemisphere_image_urls
}
    # Return results
db.mars_data.insert_one(mars_data)
mars_data

{'news_title': 'My Culture, My Voice',
 'news_p': 'In honor of Hispanic Heritage Month, Christina Hernandez, an instrument engineer on the Mars 2020 mission, talks about her childhood and journey to NASA.',
 'featured_image': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17171_hires.jpg',
 'mars_facts': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n  

## Step 2 - Mongo DB and Flask Application