In [1]:
# Import Dependencies
import requests as req
import pandas as pd
import time
from bs4 import BeautifulSoup as bs
from splinter import Browser


In [2]:
#--- STEP 1 - Scraping ---
# NASA Mars News

In [3]:
executable_path = {"executable_path": "chrome/chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)

mars_url = "https://mars.nasa.gov/news/"
browser.visit(mars_url)

html_browser = browser.html
news_from_mars = bs(html_browser, 'html.parser')

#Use prettify to inspect the html doc 
#print(news_from_mars.prettify())

#Check the elements to know where to find the div element that contains title  
news_title = news_from_mars.find('div', class_='content_title').text
#Print the latest title
print("The latest News Title from Mission to Mars is:",news_title)

#Check the elements to know where to find the div element that contains paragraph  
news_paragraph = news_from_mars.find('div', class_="rollover_description_inner").text
#Print the paragraph from the latest title
print("The content of the latest News Title from Mission to Mars is:",news_paragraph)

The latest News Title from Mission to Mars is: NASA’s First Mission to Study the Interior of Mars Awaits May 5 Launch
The content of the latest News Title from Mission to Mars is: All systems are go for NASA’s next launch to the Red Planet. 


In [4]:
#---JPL MARS Space Images - Featured Image

In [5]:
# Point to JPL's Featured Space Images page. 
jpl_mars_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(jpl_mars_url)

# Scrape the browser to navigate the site and find the image url for the current Featured Mars Image 
html_browser = browser.html
jpl_mars = bs(html_browser, 'html.parser')

#Use prettify to inspect the html doc 
#print(jpl_mars.prettify())


#find the url of the img
img_url = jpl_mars.find('a', {'id': 'full_image', 'data-fancybox-href': True}).get('data-fancybox-href')
#get the base url from the href of the webpage (jpl_logo class)
base_jpl_href = jpl_mars.find_all('div', class_='jpl_logo')
#use BS to create obkect, parse with lxml
jpl_logo_soup = bs(html_browser, 'lxml')
#loop through all the href of the url

# Get all the hrefs of the url
href_list = []
for href in jpl_logo_soup.find_all('a'):
    href_list.append(href.get('href'))
 #print(href_list)

#retrieve the path for jpl.nasa
jpl_nasa_path = href_list[1].strip('/')
#jpl_nasa_path

# Assign the url string to a variable called featured_image_url
featured_image_url = '%s%s%s'%("https://",jpl_nasa_path,img_url)
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA15254_ip.jpg


In [6]:
#---Mars Weather---

In [7]:
# Point to Mars weather twitter page. 
twitter_url = "https://twitter.com/marswxreport?lang=en"
browser.visit(twitter_url)

# Scrape the browser to navigate the site and get the latest Mars weather tweet from the page.
html_browser = browser.html
mars_twitter = bs(html_browser, 'html.parser')

#inspect the weather tweet
weather_tweet = mars_twitter.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')

#Save the tweet text for the weather report as a variable called mars_weather.
mars_weather = weather_tweet.text
print(mars_weather)

Sol 2039 (May 02, 2018), Sunny, high 0C/32F, low -74C/-101F, pressure at 7.28 hPa, daylight 05:23-17:20


In [8]:
#---Mars Facts --

In [9]:
# Point to Mars Facts webpage. 
mars_facts_url = "https://space-facts.com/mars/"
browser.visit(mars_facts_url)

# Scrape the browser to navigate the site and get the Mars facts
html_browser = browser.html
mars_facts = bs(html_browser, 'html.parser')

# Convert the url to a pandas df
mars_facts_pd = pd.read_html(mars_facts_url)
mars_facts_pd
mars_facts_df = pd.DataFrame(mars_facts_pd[0])
mars_facts_df

# Define column names   
mars_facts_df.columns = ['Parameters','Data']
#Set the index
mars_facts_df2 = mars_facts_df.set_index("Parameters")
mars_facts_df2

#Convert the dataframe to an html object and remove unnecesary \n
mars_facts_html=mars_facts_df2.to_html(classes='mars-facts')
mars_facts_html
mars_facts_html_table = mars_facts_html.replace('\n', '')
print(mars_facts_html_table)

<table border="1" class="dataframe mars-facts">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Data</th>    </tr>    <tr>      <th>Parameters</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-153 to 20 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>


In [10]:
#---Mars Hemispheres---

In [11]:
# Point to USGS Astrology webpage. 
mars_hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(mars_hemispheres_url)

# Scrape the browser to navigate the site and get high resolution images for each Mars hemisphere
html_browser = browser.html
mars_hemisphere = bs(html_browser, 'html.parser')

#print(mars_hemispheres.prettify())
#Get the element "div" with the imgs
div_images = mars_hemisphere.find('div', class_='collapsible results')
#print(div_images.prettify())

In [12]:
#Loop through the class="item" by clicking the h3 tag and getting the image title and its url. 

hemispheres_img_url_list = []

for img in range(len(div_images.find_all("div", class_="item"))):
    time.sleep(2)
    img_header = browser.find_by_tag('h3')
    img_header[img].click()
    html_b = browser.html
    html_soup = bs(html_b, 'html.parser')
    h2_title = html_soup.find("h2", class_="title").text
    div = html_soup.find("div", class_="downloads")
    #for li in div:
    a = div.find('a')
    href_url = a.attrs['href']
    hemispheres = {
            'img_title' : h2_title,
            'img_url' : href_url
        }
    hemispheres_img_url_list.append(hemispheres)

print(hemispheres_img_url_list)

[{'img_title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'img_title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'img_title': 'Cerberus Hemisphere Enhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'img_title': 'Valles Marineris Hemisphere Unenhanced', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_unenhanced.tif/full.jpg'}]
