## Article Scraping

In [1]:
# Import Splinter, BeautifulSoup, pandas, and urljoin
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

In [2]:
# Point splinter at the chromedriver binary, then launch the Chrome browser
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', **executable_path)


### Visit the NASA Mars news site

In [3]:
# Visit the NASA Mars news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Optional delay for loading the page: tell the browser to wait a second
# (wait_time=1) for the first news slide before searching for components.
# As the last expression in the cell, the boolean result is displayed.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [4]:
# Set up the HTML parser

# Take the page source currently held by the browser and parse it.
html = browser.html
news_soup = BeautifulSoup(html, 'html.parser')
# Keep the element matched by the CSS selector; the title and summary
# lookups further down search inside it.
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [5]:
# Scraping: locate the title and summary text that later cells store in variables.
# Chaining .find onto the previously assigned "slide_elem" restricts the search to
# that element: "this variable holds a lot of information, so look inside it to
# find this specific data."


# Identify the parent <div> that holds the article title. It is the last
# expression in the cell, so it is displayed rather than stored.

slide_elem.find("div", class_='content_title')

<div class="content_title"><a href="/news/8568/nasas-treasure-map-for-water-ice-on-mars/" target="_self">NASA's Treasure Map for Water Ice on Mars</a></div>

In [6]:
# Only the text of the title element is needed; the surrounding HTML is not.
# Locate the title's parent <div> inside the slide element first.
title_elem = slide_elem.find("div", class_='content_title')

# Strip the HTML tags and attributes with .get_text() and keep the result
# as `news_title`.
news_title = title_elem.get_text()
news_title

"NASA's Treasure Map for Water Ice on Mars"

In [7]:
# Next, add the summary text. The summary sits in the <div> that carries the
# unique class "article_teaser_body", so search the slide element for it.
teaser_elem = slide_elem.find('div', class_="article_teaser_body")

# Keep only the paragraph text of that element
news_p = teaser_elem.get_text()
news_p

'A new study identifies frozen water just below the Martian surface, where astronauts could easily dig it up.'

Important
There are two methods used to find tags and attributes with BeautifulSoup:
.find() returns only the first tag that matches the tag name and attributes we’ve specified.
.find_all() returns every tag that matches them, as a list-like result set.
For example, if we were to use .find_all() instead of .find() when pulling the summary, we would retrieve all of the summaries on the page instead of just the first one.


## Image Scraping

### Featured Images

In [8]:
# Set up the URL of the JPL space images page (Mars category) and visit it.
# This reassigns `url`, which previously held the news site address.

url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [9]:
# Find the full image button by its id and click it
full_image_elem = browser.find_by_id('full_image')      # the browser finds the element by its id and holds it in a variable
full_image_elem.click()                                 # splinter "clicks" the image to view its full size

In [10]:
# Use Splinter's ability to find elements by their text.

# Find the "more info" button using only its text and "click" it.
# The presence check waits (wait_time=1) for the text; its boolean result is not checked.
browser.is_element_present_by_text('more info', wait_time=1)
# NOTE(review): newer splinter releases reportedly replace find_link_by_partial_text
# with browser.links.find_by_partial_text -- confirm against the installed version.
more_info_elem = browser.find_link_by_partial_text('more info')
more_info_elem.click()

In [11]:
# Scrape the full-size image URL
# Parse the resulting HTML with BeautifulSoup. This reassigns `html`, which
# earlier held the news page source.
html = browser.html
img_soup = BeautifulSoup(html, 'html.parser')

In [12]:
# The most recent image's link is nested inside <figure />, <a />, and <img /> tags,
# so that chain of tags forms the CSS selector for the image element.
lede_img = img_soup.select_one('figure.lede a img')

# Read the relative image URL from the tag's src attribute
img_url_rel = lede_img.get("src")
img_url_rel

'/spaceimages/images/largesize/PIA17793_hires.jpg'

In [13]:
# Use the site's base URL to create an absolute URL.
# urljoin resolves the scraped path against the base, so the result stays
# well-formed even if the src lacks a leading slash or is already absolute
# (plain string concatenation would produce a broken URL in those cases).

img_url = urljoin('https://www.jpl.nasa.gov', img_url_rel)
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17793_hires.jpg'

In [14]:
# Scrape an entire table with pandas' .read_html() function: take the first
# table found on the page, label its two columns, and index it by description.
df = pd.read_html('http://space-facts.com/mars/')[0]
df.columns = ['description', 'value']
df = df.set_index('description')
df

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [15]:
# Adding the DataFrame to a web application
# Convert the DataFrame back into HTML-ready code with pandas' .to_html();
# the call is the cell's last expression, so its result is displayed.

df.to_html()

<bound method DataFrame.to_html of                                               value
description                                        
Equatorial Diameter:                       6,792 km
Polar Diameter:                            6,752 km
Mass:                 6.39 × 10^23 kg (0.11 Earths)
Moons:                          2 (Phobos & Deimos)
Orbit Distance:            227,943,824 km (1.38 AU)
Orbit Period:                  687 days (1.9 years)
Surface Temperature:                   -87 to -5 °C
First Record:                     2nd millennium BC
Recorded By:                   Egyptian astronomers>

In [16]:
# End the automated browser session
browser.quit()