In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
from flask import Flask, render_template, redirect
from flask_pymongo import PyMongo
import requests
from datetime import datetime

# Scraping Tasks

In [2]:
# Use Splinter for all tasks unless specified otherwise 
# This avoids issues with pages built by Javascript
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

### Scrape Mars news from JPL

In [4]:
url = 'https://mars.nasa.gov/news/'

In [6]:
browser.visit(url)
# Display time of visit for record-keeping purposes
print(datetime.now())

2019-05-16 14:00:41.497728


In [7]:
# Confirm receipt of info, html should have a non-zero length
html = browser.html
len(html)

1227153

In [8]:
soup = bs(html, 'html.parser')
# Confirm conversion via length check
len(soup)

2

In [9]:
# Manual examination of page showed news title appear at 1st instance of div class=content_title
news_title = soup.find('div',class_='content_title').text
news_title

"NASA's MRO Completes 60,000 Trips Around Mars"

In [10]:
# Manual examination of page showed news paragraph appeared at 1st instance of div class=article_teaser_body
news_blob = soup.find('div',class_='article_teaser_body').text
news_blob

'The orbiting spacecraft is also about to set a record for data relayed from the Martian surface.'

### Scrape the Featured Image

In [11]:
url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url2)
print(datetime.now())

2019-05-16 14:08:27.593259


In [12]:
html2 = browser.html
len(html2)

245394

In [17]:
soup2 = bs(html2,'html.parser')
len(soup2)

4

In [18]:
# Manual examination of the page showed the hi-res image url is in a section class='grid gallery' with an embedded hyperlink
# of class 'fancybox' with the lcoal image url as the data-fancybox-href attribute.
gallery = soup2.find('section', class_='grid_gallery').find('a',class_='fancybox')['data-fancybox-href']
gallery

'/spaceimages/images/largesize/PIA23220_hires.jpg'

In [19]:
# Manually add url front end -- checked by navigating to image
featured_image_url = 'https://jpl.nasa.gov' + gallery
featured_image_url

'https://jpl.nasa.gov/spaceimages/images/largesize/PIA23220_hires.jpg'

### Scrape Mars Weather Tweet

In [3]:
url3 = 'https://twitter.com/marswxreport?lang=en'

In [4]:
browser.visit(url3)
soup3 = bs(browser.html, 'html.parser')
print(datetime.now())
len(soup3)

2019-05-17 09:37:38.741483


2

In [5]:
# Manual inspection of the page showed tweet content at p class='js-tweet-text' 
# but that the tweet screen name needs to be 'MarsWxReport' inside a div class='tweet'
weather_tweet = soup3.find('div', attrs={'data-screen-name':'MarsWxReport', 'class':'tweet'})\
                     .find('p', class_='js-tweet-text').text
#.find('p',class_ = 'js-tweet-text').text
weather_tweet

'InSight sol 165 (2019-05-15) low -100.3ºC (-148.6ºF) high -18.2ºC (-0.7ºF)\nwinds from the SW at 4.6 m/s (10.4 mph) gusting to 13.7 m/s (30.6 mph)\npressure at 7.50 hPapic.twitter.com/7NMgdAkFA8'

### Scrape Mars Facts Table

In [31]:
url4 = 'http://space-facts.com/mars/'

In [32]:
mars_df = pd.read_html(url4)[0]
mars_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [33]:
fact_string = mars_df.to_html(header=False, index=False)
fact_string

'<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

### Scrape Mars Hemispheres

In [34]:
url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [36]:
browser.visit(url5)
print(datetime.now())

2019-05-16 16:23:32.256720


In [37]:
html5 = browser.html
page_data = bs(html5,'html.parser')
len(page_data)

2

In [38]:
# Build the list of links to visit that contain the images.  Manually inspection of the page
# showed that these links are in <a> tags with class='product-item'
# Because these links are not directly visibile on the page, Splinter cannot just 'click' them for us,
# we have to collect and visit these links ourselves... 
linklist = page_data.find_all('a',class_='product-item')
urllist = [item['href'] for item in linklist]
# The links are in two places per item, so we only need every other one
urllist2 = urllist[::2]
urllist2

['/search/map/Mars/Viking/cerberus_enhanced',
 '/search/map/Mars/Viking/schiaparelli_enhanced',
 '/search/map/Mars/Viking/syrtis_major_enhanced',
 '/search/map/Mars/Viking/valles_marineris_enhanced']

In [41]:
# Manual inspection of the target pages showed the hemisphere names in h2 tags followed by ' Enhanced' 
# so removing the last nine characters provides the hemisphere name as we want it
# These pages do not have Javascript or complex elements so we can just use requests.get
# The target names and images will go into a list of dictionaries
imagedictlist = []
for item in urllist2:
    newurl = 'https://astrogeology.usgs.gov' + item
    soupitem = bs(requests.get(newurl).text,'html.parser')
    hem_name = soupitem.find('h2').text[:-9]
    hem_imgurl = 'https://astrogeology.usgs.gov' + soupitem.find('img', class_ = 'wide-image')['src']
    imagedictlist.append({'title':hem_name,'img_url':hem_imgurl})
imagedictlist

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]