# Notebook to develop and test the scraping functions that will be used in the app

In [93]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

## Set up each scraping activity as a function to make it easier to port into the app

In [94]:
def get_last_news():
    """Scrape the first news item listed on https://redplanetscience.com/.

    Opens a visible (non-headless) Chrome window through splinter, loads the
    landing page, pauses five seconds (presumably so that dynamically loaded
    content is present in the HTML), then parses the first ``list_text`` div
    found inside the element with id ``news``.

    Returns:
        tuple: ``(list_date, title, para)`` -- the text of the ``list_date``,
        ``content_title`` and ``article_teaser_body`` divs of that entry.

    Raises:
        AttributeError: if one of the expected elements is not in the page
            (BeautifulSoup returns ``None`` for a failed ``find``).
    """
    url_base = "https://redplanetscience.com/"
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        browser.visit(url_base)
        time.sleep(5)
        html = browser.html
    finally:
        # Always release the Chrome window and driver process, even when the
        # visit fails; the HTML is already a plain string by this point.
        browser.quit()

    soup = bs(html, "html.parser")
    news = soup.find('div', id='news')
    last = news.find("div", class_='list_text')
    title = last.find("div", class_='content_title').text
    para = last.find("div", class_='article_teaser_body').text
    list_date = last.find("div", class_='list_date').text

    return list_date, title, para

In [95]:
def get_mars_image():
    """Return the URL of the header image shown on https://spaceimages-mars.com/.

    Opens a visible (non-headless) Chrome window through splinter, loads the
    landing page, pauses five seconds, and reads the ``src`` attribute of the
    first ``<img class="headerimage">`` element.

    Returns:
        str: the site base URL concatenated with the image's ``src`` value.

    Raises:
        TypeError: if no ``headerimage`` element is present (the lookup
            returns ``None``, which cannot be subscripted).
    """
    url_base = "https://spaceimages-mars.com/"
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        browser.visit(url_base)
        time.sleep(5)
        html = browser.html
    finally:
        # Close the browser on every path so a parsing failure below cannot
        # leave a Chrome window and driver process behind.
        browser.quit()

    soup = bs(html, "html.parser")
    header = soup.find('img', class_="headerimage")
    # src is used as a path relative to the site root (plain concatenation)
    url = url_base + header['src']
    return url

In [96]:
def get_mars_table(file_name):
    """Save the Mars facts table from https://galaxyfacts-mars.com/ as HTML.

    Reads every table on the page whose text matches "Equatorial Diameter",
    takes the first one, and writes it to ``file_name`` without the index or
    header row, tagged with the ``table`` and ``table-striped`` CSS classes.

    Args:
        file_name: destination passed straight to ``DataFrame.to_html``.

    Returns:
        None
    """
    source_url = "https://galaxyfacts-mars.com/"
    # read_html only returns tables that contain the match text
    matching_tables = pd.read_html(source_url, match="Equatorial Diameter")
    facts_table = matching_tables[0]
    facts_table.to_html(
        file_name,
        index=False,
        header=False,
        classes=["table", "table-striped"],
    )

In [102]:
def get_hemispheres():
    """Scrape hemisphere names and original-image URLs from marshemispheres.com.

    Opens a visible (non-headless) Chrome window through splinter, reads the
    landing page to find each hemisphere's detail-page link and name, then
    visits every detail page and picks the download link labelled "Original".

    Each title is kept together with its own link and image, so a page that
    yields no image cannot shift later titles onto the wrong picture. A link
    without an ``<h3>`` heading, or a detail page without a ``downloads`` div
    or an "Original" link, is skipped rather than raising.

    Returns:
        list[dict]: one ``{"title": ..., "img_url": ...}`` dict per hemisphere
        for which an image was found, in landing-page order.
    """
    site_root = "https://marshemispheres.com/"
    url_base = "https://marshemispheres.com/index.html"
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    ret_list = []
    try:
        # open the landing page
        browser.visit(url_base)
        time.sleep(5)
        soup = bs(browser.html, "html.parser")

        # get the (title, link) pairs to visit
        pages = []
        for desc in soup.find_all('div', class_="description"):
            for link in desc.find_all('a'):
                heading = link.find('h3')
                if not link.has_attr('href') or heading is None or not heading.contents:
                    continue
                long_title = heading.contents[0]
                # partition keeps the whole title when " Enhanced" is absent;
                # slicing with str.find() would cut the last character (-1).
                title = long_title.partition(" Enhanced")[0]
                pages.append((title, link['href']))

        # loop over the pages to get the url for each of the images
        for title, link in pages:
            browser.visit(site_root + link)
            time.sleep(5)

            soup_2 = bs(browser.html, "html.parser")
            div = soup_2.find('div', class_="downloads")
            if div is None:
                continue
            for a in div.find_all('a'):
                if a.has_attr('href') and a.contents and a.contents[0] == "Original":
                    ret_list.append({"title": title, "img_url": site_root + a['href']})
                    # one image per hemisphere keeps titles and images 1:1
                    break
    finally:
        # release the Chrome window and driver even if a page fails to load or parse
        browser.quit()

    return ret_list

## Test that the functions work  

In [104]:

# Smoke test: run the hemisphere scraper and print each returned
# {"title": ..., "img_url": ...} dict on its own line.
results = get_hemispheres() 
for hemisphere in results:
    print(hemisphere)



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\alans\.wdm\drivers\chromedriver\win32\92.0.4515.43\chromedriver.exe] found in cache


{'title': 'Cerberus Hemisphere', 'img_url': 'https://marshemispheres.com/images/cerberus_enhanced.tif'}
{'title': 'Schiaparelli Hemisphere', 'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced.tif'}
{'title': 'Syrtis Major Hemisphere', 'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced.tif'}
{'title': 'Valles Marineris Hemisphere', 'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced.tif'}


In [105]:
# Smoke test: write the scraped facts table to a local HTML file.
# get_mars_table returns nothing, so this cell produces no output.
out_file = "test.html"
get_mars_table(out_file)

In [106]:

# Smoke test: fetch the header image URL and print it.
url = get_mars_image() 
print(f"Mars image = {url}")



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\alans\.wdm\drivers\chromedriver\win32\92.0.4515.43\chromedriver.exe] found in cache


Mars image = https://spaceimages-mars.com/image/featured/mars2.jpg


In [108]:
# Smoke test: fetch the first listed news item and print its three fields.
# get_last_news returns them in the order (date, title, teaser paragraph).
list_date,title,para = get_last_news()     
print(f"title = {title}")
print(f"text = {para}")
print(f"date = {list_date}")



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\alans\.wdm\drivers\chromedriver\win32\92.0.4515.43\chromedriver.exe] found in cache


title = NASA Mars Mission Connects With Bosnian and Herzegovinian Town
text = A letter from NASA was presented to the mayor of Jezero, Bosnia-Herzegovina, honoring the connection between the town and Jezero Crater, the Mars 2020 rover landing site.
date = July 28, 2021
