# Scraping using BeautifulSoup, Pandas, and Requests/Splinter

In [154]:
## Dependencies

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


import requests
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time

from urls_list import * #where all urls and paths are saved

## NASA Mars News

* **Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. Assign the text to variables that we can reference later.**

In [155]:
def scrape_latest_news():
    """Scrape the most recent headline and teaser text from the NASA Mars News page.

    A headless splinter browser visits ``nasa_mars_news``; the rendered HTML is
    parsed with BeautifulSoup (lxml parser). The first ``<li class="slide">``
    is located, and the second and third ``<div>`` found inside its
    ``list_text`` container are taken as the title and paragraph.

    Returns:
        tuple: ``(news_title, news_p)``. Both stay ``None`` if visiting or
        parsing fails with an ``Exception``; the error is printed, not raised.
    """
    news_title, news_p = None, None

    # Configure browser
    browser = Browser(browser_choice, executable_path=executable_path, headless=True)
    try:
        browser.visit(nasa_mars_news)
        # Fixed pause before reading the page. Presumably this gives dynamically
        # loaded content time to appear; the original author flagged it as essential.
        time.sleep(4)
        soup = bs(browser.html, 'lxml')
        news = soup.find('li', class_='slide').div.find(class_='list_text').find_all('div')
        news_title, news_p = news[1].text, news[2].text
    except Exception as e:
        # Best-effort scrape: report the problem and fall through with None values.
        print(e)
    finally:
        # Always release the browser, even when the cell is interrupted
        # (KeyboardInterrupt is not an Exception and would otherwise skip this).
        browser.quit()

    return (news_title, news_p)

In [156]:
# Run the news scraper once and keep both results for the cells below.
news_title, news_p = scrape_latest_news()

In [157]:
# Display both values; each bare expression is rendered because
# InteractiveShell.ast_node_interactivity was set to 'all' in the setup cell.
news_title
news_p

"NASA's MAVEN Observes Martian Night Sky Pulsing in Ultraviolet Light"

'Vast areas of the Martian night sky pulse in ultraviolet light, according to images from NASA’s MAVEN spacecraft. The results are being used to illuminate complex circulation patterns in the Martian atmosphere.'

## JPL Mars Space Images - Featured Image

* **Find the image url for the current Featured Mars Image and assign the url string to a variable called featured_image_url**

In [158]:
def scrape_featured_image_url():
    """Scrape the URL of the featured image from the NASA JPL page.

    A headless splinter browser visits ``nasa_jpl``, clicks the element with id
    ``full_image``, follows the second link in the lightbox's ``buttons``
    container, and reads the ``href`` of the link inside ``figure.lede``.

    Returns:
        The ``href`` value of that link, or ``None`` if any step fails with an
        ``Exception``; the error is printed, not raised.
    """
    featured_image_url = None

    # Configure browser
    browser = Browser(browser_choice, executable_path=executable_path, headless=True)
    try:
        browser.visit(nasa_jpl)
        # Fixed pause before interacting with the page. Presumably this gives
        # dynamically loaded content time to appear; flagged as essential by the author.
        time.sleep(4)

        # Click the "FULL IMAGE" button
        browser.find_by_id('full_image', wait_time=1).click()

        # Click the "more info" button (second link in the lightbox button bar)
        browser.find_by_css('[id="fancybox-lock"]')[0].find_by_css('div[class="buttons"] a:nth-child(2)')[0].click()
        time.sleep(1)

        # Take the image link (largesize)
        featured_image_url = browser.find_by_css('figure[class="lede"] a')['href']
    except Exception as e:
        # Best-effort scrape: report the problem and fall through with None.
        print(e)
    finally:
        # Always release the browser, even when the cell is interrupted
        # (KeyboardInterrupt is not an Exception and would otherwise skip this).
        browser.quit()

    return featured_image_url

In [159]:

# Run the featured-image scraper and keep the result for later cells.
featured_image_url = scrape_featured_image_url()

In [160]:
# Display the scraped image URL.
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA19920_hires.jpg'

## Mars Weather - from the Twitter page

* **Visit the Mars Weather Twitter account and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called `mars_weather`.**

* **Note: Be sure you are not signed in to Twitter, or scraping may become more difficult.**

* **Note: Twitter frequently changes how information is presented on their website.**

In [161]:
def scrape_mars_weather():
    """Scrape the text of the first tweet shown on the Mars weather Twitter page.

    A headless splinter browser visits ``mars_twitter_page``; the rendered HTML
    is parsed with BeautifulSoup (lxml parser). The first ``<div>`` whose
    ``data-testid`` is ``tweet`` is located, and the text of the first ``<span>``
    matched by a fixed positional CSS path inside it is returned.

    Returns:
        The span's text, or ``None`` if any step fails with an ``Exception``;
        the error is printed, not raised.
    """
    mars_weather = None

    # Configure browser
    browser = Browser(browser_choice, executable_path=executable_path, headless=True)
    try:
        browser.visit(mars_twitter_page)
        # Fixed pause before reading the page. Presumably this gives dynamically
        # loaded content time to appear; the original author flagged it as essential.
        time.sleep(4)
        soup = bs(browser.html, 'lxml')

        # Extract the weather info using a positional CSS selector inside the first tweet.
        # NOTE(review): this path is tied to the page's current markup and will
        # break silently (returning None) if the layout changes.
        tweet = soup.find('div', attrs={'data-testid':'tweet'})
        text_spans = tweet.select('div:nth-of-type(2) > div:nth-of-type(2) > div:nth-of-type(1) > div:nth-of-type(1) > span')
        mars_weather = text_spans[0].text
    except Exception as e:
        # Best-effort scrape: report the problem and fall through with None.
        print(e)
    finally:
        # Always release the browser, even when the cell is interrupted
        # (KeyboardInterrupt is not an Exception and would otherwise skip this).
        browser.quit()

    return mars_weather
        

In [162]:
# Keep the tweet text in mars_weather (as the task description asks) so later
# cells can reference it, then display it.
mars_weather = scrape_mars_weather()
mars_weather

'InSight sol 603 (2020-08-07) low -91.3ºC (-132.4ºF) high -12.2ºC (10.0ºF)\nwinds from the W at 6.6 m/s (14.8 mph) gusting to 17.2 m/s (38.4 mph)\npressure at 7.90 hPa'