## Scraping

#### NASA Mars News
* Collect the latest news titles and paragraph texts.

In [None]:
# Import libraries
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd

In [None]:
# Connection to Chrome browser through splinter.
# The chromedriver location is hard-coded; adjust the path for the machine
# this notebook runs on.
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# headless=False is passed explicitly, i.e. headless mode is not requested
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Read URL: load the NASA Mars news page in the browser
url_news = 'https://mars.nasa.gov/news/'
browser.visit(url_news)

In [None]:
# Get the HTML currently held by the browser for the news page
html_news = browser.html
# Parse HTML with Beautiful Soup, using the 'html.parser' backend
soup_news = bs(html_news, 'html.parser')

In [None]:
# The webpage was inspected by hand beforehand.
# Collect every <div class="list_text"> element; the title and paragraph of
# each news entry are read from these elements afterwards.
mars_news = soup_news.find_all('div', class_='list_text')

In [None]:
# Walk over the scraped news containers and build two parallel lists:
# one with the titles and one with the matching paragraphs.
news_title = []
news_title_prg = []
for entry in mars_news:
    # The first link of the entry holds the title
    anchor = entry.find('a')
    # Element right after the link tag: the title itself
    title_node = anchor.next
    # Element right after the title: its .text is used as the paragraph
    paragraph = title_node.next.text
    news_title.append(title_node)
    news_title_prg.append(paragraph)

In [None]:
# Show the first scraped title and its paragraph as a quick check
print(news_title[0])
print(news_title_prg[0])

#### JPL Mars Space Images - Featured Image
* Find URL for the latest Featured Mars Image

In [None]:
# URL of the JPL space images search restricted to the Mars category;
# open it in the browser
url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url_jpl)

In [None]:
# Get the HTML of the JPL page from the browser and parse it with
# Beautiful Soup ('html.parser' backend)
html_jpl = browser.html
soup_jpl = bs(html_jpl, "html.parser")

In [None]:
# Find the first <div class="img"> and take the <img> tag nested inside it.
# NOTE(review): find() returns None when nothing matches, so this chained
# call raises AttributeError if the page layout changes.
img_tag = soup_jpl.find('div', class_='img').find('img')

In [None]:
# Read the image path and its caption from the tag's src and title attributes.
# NOTE(review): assumed to be the latest Mars image because it is the first
# match on the page — confirm against the live site.
img_source = img_tag['src']
img_title = img_tag['title']

In [None]:
# Save the link: build the absolute URL of the featured image.
# urljoin is used instead of plain string concatenation so the result is
# still a valid URL when the src is already absolute or lacks a leading slash.
from urllib.parse import urljoin

featured_image_url = urljoin('https://www.jpl.nasa.gov', img_source)
print(featured_image_url)

#### Mars Weather
* Scraping the latest Mars weather tweet.

In [None]:
# Link to the Twitter account whose tweets are scraped for the Mars weather;
# open it in the browser
url_tw = 'https://twitter.com/marswxreport'
browser.visit(url_tw)

In [None]:
# Get the HTML of the Twitter page from the browser and parse it with
# Beautiful Soup ('html.parser' backend)
html_tw = browser.html
soup_tw = bs(html_tw, "html.parser")

In [None]:
# Get the elements that hold the tweet texts, selected by their CSS classes.
# NOTE(review): this class list looks auto-generated by the site and is
# likely to change — verify it still matches before relying on the result.
tweets = soup_tw.find_all(class_="css-901oao r-jwli3a r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")


In [None]:
# Sometimes the latest tweet is not about the Mars weather, so the scraped
# tweets are scanned in order and only the first one whose text starts with
# 'InSight' is kept. The list stays empty when no tweet matches.
list_mars_weather = []
for tweet in tweets:
    weather = tweet.text
    if weather.startswith('InSight'):
        list_mars_weather.append(weather)
        # Only the first matching tweet is wanted
        break

In [None]:
# Display the collected weather tweet (an empty list means none matched)
list_mars_weather

#### Mars Facts Table
* Table containing facts about the planet using the Mars Facts webpage.

In [None]:
# URL of the Mars facts page; pandas fetches it and parses the HTML tables
# it finds, the result is indexed like a list afterwards
url_table = 'https://space-facts.com/mars/'
mars_table = pd.read_html(url_table)

In [None]:
# Pandas read more than one table from the page; only the first is kept
mars_table_one = mars_table[0]
# Name the two columns. No copy was made above, so this also renames the
# columns of the object stored in mars_table[0].
mars_table_one.columns = ['Parameter', 'Value']
# Display the table
mars_table_one

In [None]:
# Export the facts table as an HTML file (no index column, centered headers).
# The target directory is created first so the export does not fail on a
# fresh checkout where 'mars_facts/' does not exist yet.
from pathlib import Path

mars_facts_file = 'mars_facts/mars_facts.html'
Path(mars_facts_file).parent.mkdir(parents=True, exist_ok=True)
mars_table_one.to_html(mars_facts_file, index=False, justify='center')

#### Mars hemispheres
* High-resolution images for each of Mars' hemispheres.

In [None]:
# URL of the USGS Astrogeology search for "hemisphere enhanced" with Mars as
# the target; open it in the browser
url_hmp = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_hmp)

In [None]:
# Get the HTML of the search results from the browser and parse it with
# Beautiful Soup ('html.parser' backend)
html_hmp = browser.html
soup_hmp = bs(html_hmp, "html.parser")

In [None]:
# Collect the <a class="itemLink product-item"> anchors of the result page;
# the image titles are read from the <h3> inside some of them later on
img_hmp = soup_hmp.find_all('a', class_='itemLink product-item')

In [None]:
# Build the full-resolution image link of each hemisphere.
# The paths were looked up by hand on the site; all four live under the same
# Viking download directory, so that shared prefix is built once.
url_img = 'https://astropedia.astrogeology.usgs.gov/'
url_viking = url_img + 'download/Mars/Viking/'
img_src_cer = url_viking + 'cerberus_enhanced.tif/full.jpg'
img_src_sch = url_viking + 'schiaparelli_enhanced.tif/full.jpg'
img_src_syr = url_viking + 'syrtis_major_enhanced.tif/full.jpg'
img_src_val = url_viking + 'valles_marineris_enhanced.tif/full.jpg'

In [None]:
# Save image links: one dict per hemisphere pairing its title with the
# full-resolution image URL.
# Only the anchors at indices 1, 3, 5 and 7 are read.
# NOTE(review): presumably each hemisphere yields two matching anchors and
# only the second one contains the <h3> title — confirm against the page.
title_cer = img_hmp[1].find('h3').text
title_sch = img_hmp[3].find('h3').text
title_syr = img_hmp[5].find('h3').text
title_val = img_hmp[7].find('h3').text

imgs_hmp = [
    {"title_hmp1": title_cer, "src_hmp1": img_src_cer},
    {"title_hmp2": title_sch, "src_hmp2": img_src_sch},
    {"title_hmp3": title_syr, "src_hmp3": img_src_syr},
    {"title_hmp4": title_val, "src_hmp4": img_src_val},
]

#### Gather all data

In [None]:
# List that accumulates every piece of scraped information
mars_data_total = []

In [None]:
# Keep at most the first ten news items, one dict per item.
# The lists are sliced and zipped instead of indexed with range(10), so no
# IndexError is raised when fewer than ten items were scraped.
mars_data_total.extend(
    {'title': title, 'prgph': prgph}
    for title, prgph in zip(news_title[:10], news_title_prg[:10])
)

In [None]:
# Append hemisphere images: the whole list is added as one nested element
mars_data_total.append(imgs_hmp)

In [None]:
# Append weather tweet: the list holding it is added as one nested element
mars_data_total.append(list_mars_weather)

In [None]:
# Append Mars image: a dict with the featured image title and its URL
mars_data_total.append({'img_title': img_title, 'img_src': featured_image_url})

In [None]:
# Display everything gathered in mars_data_total
mars_data_total