In [17]:
#Import our dependencies
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
#Install chromedriver through webdriver_manager (the log output of this cell shows it
#being downloaded and cached) and start a Chrome session that uses it.
#headless=False asks for a non-headless browser.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/96.0.4664.45/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\Origin\.wdm\drivers\chromedriver\win32\96.0.4664.45]


In [3]:
#Visit the Mars news site
url = 'https://redplanetscience.com'
browser.visit(url)
#Check that a div.list_text element is present, allowing wait_time=1 for the page to load
#(presumably seconds -- confirm against the splinter docs). The boolean result is the cell's output.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

The final line of code in the cell above checks that an element with tag `div` and class `list_text` is present on the page, allowing the page a moment to load before we parse it.

In [4]:
#Set up the HTML parser: take the page HTML currently held by the browser and parse it
#with BeautifulSoup using the built-in 'html.parser' backend
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text') #Parent element (div.list_text) that the cells below search inside

In [5]:
#Obtain the title of the first article: look up the div with class content_title inside
#slide_elem. The bare expression displays the whole tag, markup included.
slide_elem.find('div', class_='content_title')

<div class="content_title">NASA InSight's 'Mole' Is Out of Sight</div>

In [7]:
#Filter to get just the text of the title: same lookup as before, but get_text()
#drops the surrounding markup. Store the result in news_title and display it.
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

"NASA InSight's 'Mole' Is Out of Sight"

In [8]:
#Get the summary of the first article: the text of the div with class
#article_teaser_body inside slide_elem. Store it in news_p and display it.
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

"Now that the heat probe is just below the Martian surface, InSight's arm will scoop some additional soil on top to help it keep digging so it can take Mars' temperature."

### Featured images

In [9]:
#Point the same browser session at the space images site.
#Note: this reuses (overwrites) the url variable set for the news site earlier.
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [10]:
#Exploration aid: list every attribute name available on the browser object,
#including its dunder methods. The output is long and is only for reference.
dir(browser)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cookie_manager',
 'attach_file',
 'back',
 'check',
 'choose',
 'click_link_by_href',
 'click_link_by_id',
 'click_link_by_partial_href',
 'click_link_by_partial_text',
 'click_link_by_text',
 'cookies',
 'driver',
 'driver_name',
 'element_class',
 'evaluate_script',
 'execute_script',
 'fill',
 'fill_form',
 'find_by',
 'find_by_css',
 'find_by_id',
 'find_by_name',
 'find_by_tag',
 'find_by_text',
 'find_by_value',
 'find_by_xpath',
 'find_link_by_href',
 'find_link_by_partial_href',
 'find_link_by_partial_text',
 'find_link_by_text',
 'find_option_by_text',
 'find_option_by_value',
 'forwa

In [13]:
#Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1] #Index 1 is the SECOND button element found on the page (indexing is zero-based)
full_image_elem.click()

In [14]:
#Parse the browser's current HTML with BeautifulSoup.
#Note: html is reused here, so it no longer holds the news page markup.
html = browser.html
img_soup = soup(html, 'html.parser')

In [15]:
#Find the relative (featured) image url.
#First locate the <img> tag carrying the fancybox-image class, then read its src
#attribute; get('src') gives the link to the image, relative to the site root here.
featured_img_tag = img_soup.find('img', class_='fancybox-image')
img_url_rel = featured_img_tag.get('src')
img_url_rel

'image/featured/mars2.jpg'

In [16]:
#Resolve the relative image path against the site's base url to get a complete url.
#urljoin is used instead of string formatting because it also copes with a src that
#starts with '/' or is already absolute, which plain concatenation would turn into a
#malformed url.
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url



'https://spaceimages-mars.com/image/featured/mars2.jpg'

In [18]:
#Scrape the Mars facts table from https://galaxyfacts-mars.com/
#pd.read_html returns a list containing one DataFrame per <table> found in the page
df = pd.read_html('https://galaxyfacts-mars.com/')[0]  #[0] keeps the first table in that list
df.columns = ['description', 'Mars', 'Earth']
#Reassign the result rather than mutating with inplace=True, so the step chains cleanly
df = df.set_index('description')
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [19]:
#We can convert the dataframe back to html format: to_html() returns the table as an
#HTML <table> string. Here it is only displayed, not stored in a variable.
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [20]:
#End our session. This is very important: quit() shuts down the automated browser
#that was started for this notebook.
browser.quit()

In [None]:
#We can convert this entire notebook into a .py file by using the file download menu