# Scraping using BeautifulSoup, Pandas, and Requests/Splinter

In [1]:
## Dependencies

# Display the value of every top-level expression in a cell, not only the last one.
# Cells below that list several bare names (e.g. news_title / news_p) rely on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


import requests
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
import pandas as pd
import re

from urls_list import * #where all urls and paths are saved

## NASA Mars News

* **Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. Assign the text to variables that we can reference later.**

In [2]:
def scrape_latest_news():
    """Scrape the latest headline and teaser text from the NASA Mars News page.

    A headless browser visits ``nasa_mars_news``; the rendered HTML is parsed
    with BeautifulSoup and the first ``li.slide`` entry is read.

    Returns:
        tuple: ``(news_title, news_p)``. Both stay ``None`` when loading or
        parsing fails; the error is printed instead of being raised.
    """
    news_title, news_p = None, None
    # Configure browser
    browser = Browser(browser_choice, executable_path=executable_path, headless=True)
    try:
        # Visit url
        browser.visit(nasa_mars_news)
        # Fixed pause before reading the HTML. The original author flagged this
        # wait as essential -- presumably the news list is rendered client-side.
        time.sleep(4)
        # bs object with lxml parser
        soup = bs(browser.html, 'lxml')
        news = soup.find('li', class_='slide').div.find(class_='list_text').find_all('div')
        # The 2nd and 3rd <div> inside list_text hold the title and the teaser.
        news_title, news_p = news[1].text, news[2].text
    except Exception as e:
        # Best-effort scrape: report the problem and fall through with None values.
        print(e)
    finally:
        # Always close the browser, even on KeyboardInterrupt, to avoid leaking
        # headless browser processes.
        browser.quit()

    return (news_title, news_p)

In [3]:
# Run the news scraper once and keep both pieces for later reference.
news_title, news_p = scrape_latest_news()

In [4]:
# Both values are echoed because ast_node_interactivity is set to 'all'.
news_title
news_p

"NASA's MAVEN Observes Martian Night Sky Pulsing in Ultraviolet Light"

'Vast areas of the Martian night sky pulse in ultraviolet light, according to images from NASA’s MAVEN spacecraft. The results are being used to illuminate complex circulation patterns in the Martian atmosphere.'

## JPL Mars Space Images - Featured Image

* **Find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url**

In [5]:
def scrape_featured_image_url():
    """Scrape the URL of the current featured image from the NASA JPL site.

    A headless browser visits ``nasa_jpl``, clicks the element with id
    ``full_image``, follows the second button inside the ``fancybox-lock``
    overlay, and reads the ``href`` of the link inside ``figure.lede``.

    Returns:
        The ``href`` of the large-size image link, or ``None`` when any step
        fails; the error is printed instead of being raised.
    """
    featured_image_url = None
    # Configure browser
    browser = Browser(browser_choice, executable_path=executable_path, headless=True)

    try:
        # Visit url
        browser.visit(nasa_jpl)
        # Fixed pause before interacting with the page. The original author
        # flagged this wait as essential -- presumably the page loads slowly.
        time.sleep(4)

        # Click a button "FULL IMAGE"
        browser.find_by_id('full_image', wait_time=1).click()

        # Click more info button (second link in the overlay's button row)
        overlay = browser.find_by_css('[id="fancybox-lock"]')[0]
        overlay.find_by_css('div[class="buttons"] a:nth-child(2)')[0].click()
        time.sleep(1)

        # Take the image link (largesize)
        featured_image_url = browser.find_by_css('figure[class="lede"] a')['href']
    except Exception as e:
        # Best-effort scrape: report the problem and return None.
        print(e)
    finally:
        # Always close the browser, even on KeyboardInterrupt, to avoid leaking
        # headless browser processes.
        browser.quit()

    return featured_image_url

In [6]:

# Run the JPL scraper once and keep the resulting link.
featured_image_url = scrape_featured_image_url()

In [7]:
# Show the scraped featured image URL.
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16613_hires.jpg'

## Mars Weather - from twitter page

* **Visit the Mars Weather Twitter account and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called mars_weather.**

* **Note: Be sure you are not signed in to Twitter, or scraping may become more difficult.**

* **Note: Twitter frequently changes how information is presented on their website.**

In [8]:
def scrape_mars_weather():
    """Scrape the latest Mars weather tweet from the Twitter page.

    A headless browser visits ``mars_twitter_page``; the rendered HTML is
    parsed with BeautifulSoup and the text of the first element marked
    ``data-testid="tweet"`` is extracted through a positional CSS selector.

    Returns:
        str or None: The tweet text, or ``None`` when loading or parsing
        fails; the error is printed instead of being raised.
    """
    mars_weather = None
    # Configure browser
    browser = Browser(browser_choice, executable_path=executable_path, headless=True)

    try:
        # Visit url
        browser.visit(mars_twitter_page)
        # Fixed pause before reading the HTML. The original author flagged this
        # wait as essential -- presumably tweets are rendered client-side.
        time.sleep(4)
        # bs object with lxml parser
        soup = bs(browser.html, 'lxml')
        # Extract the weather info using soup css selector.
        # NOTE(review): the selector is purely positional and will break when
        # the page's div nesting changes.
        first_tweet = soup.find('div', attrs={'data-testid': 'tweet'})
        mars_weather = first_tweet.select(
            'div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > span'
        )[0].text
    except Exception as e:
        # Best-effort scrape: report the problem and return None.
        print(e)
    finally:
        # Always close the browser, even on KeyboardInterrupt, to avoid leaking
        # headless browser processes.
        browser.quit()

    return mars_weather
        

In [9]:
# Store the tweet text in `mars_weather`, as the task description asks, then display it.
mars_weather = scrape_mars_weather()
mars_weather

'InSight sol 605 (2020-08-09) low -92.7ºC (-134.8ºF) high -18.4ºC (-1.1ºF)\nwinds from the WNW at 8.8 m/s (19.7 mph) gusting to 22.5 m/s (50.4 mph)\npressure at 7.90 hPa'

## Mars Facts 

* **Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.**

* **Use Pandas to convert the data to an HTML table string.**


In [10]:
# Read the table with id "tablepress-p-mars" from the facts page and label
# its two positional columns in a single chained expression.
DF = (
    pd.read_html(mars_facts, attrs={'id': 'tablepress-p-mars'})[0]
    .rename(columns={0: 'attributes', 1: 'value'})
)

In [11]:
# Show the facts table with its renamed columns.
DF

Unnamed: 0,attributes,value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


## Mars Hemispheres - from USGS Astrogeology site

* **Visit the USGS Astrogeology site to obtain high resolution images for each of Mars's hemispheres.**

* **Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.**

* **Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.**

In [12]:
def scrape_hemispheres():
    """Scrape high resolution images for each of Mars's hemispheres from the USGS site.

    A headless browser visits ``usgs_search`` and collects every
    ``a.itemLink.product-item`` anchor inside ``#product-section``. Anchors
    that wrap an ``<img>`` are followed (``usgs_base`` + ``href``) to read the
    ``src`` of the ``img.wide-image`` element; anchors without an image supply
    the title text, with the `` Enhanced`` suffix removed.

    Returns:
        list[dict]: One ``{'title': ..., 'img_url': ...}`` dictionary per
        hemisphere, in page order. Entries gathered before a failure are
        still returned; the error is printed instead of being raised.
    """
    hemisphere_image_urls = []
    # Configure browser
    browser = Browser(browser_choice, executable_path=executable_path, headless=True)

    try:
        # Visit url
        browser.visit(usgs_search)
        # Long fixed pause before reading the HTML (the site loads very slowly).
        time.sleep(10)
        # bs object with lxml parser
        soup = bs(browser.html, 'lxml')
        hs_links = soup.find(id='product-section').find_all('a', class_="itemLink product-item")

        # Each hemisphere contributes two anchors (image and title). Collect both
        # halves and emit a record as soon as the pair is complete, rather than
        # relying on the parity of the anchor's position in the list.
        title, img_url = None, None
        for link in hs_links:
            if link.img is None:
                title = re.sub(' Enhanced', '', link.text)
            else:
                browser.visit(usgs_base + link['href'])
                time.sleep(1)
                img_url = browser.find_by_css('img[class="wide-image"]')['src']

            if title is not None and img_url is not None:
                hemisphere_image_urls.append({'title': title, 'img_url': img_url})
                # Reset so the next hemisphere cannot reuse stale values.
                title, img_url = None, None
    except Exception as e:
        # Best-effort scrape: report the problem and return what was collected.
        print(e)
    finally:
        # Always close the browser, even on KeyboardInterrupt, to avoid leaking
        # headless browser processes.
        browser.quit()

    return hemisphere_image_urls

In [13]:
# Run the USGS scraper once and keep the list of title/image dictionaries.
hemisphere_image_urls = scrape_hemispheres()

In [14]:
# Show the collected hemisphere records.
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]