In [None]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import requests
import pymongo
from flask import Flask, render_template, redirect
from flask_pymongo import PyMongo
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Scraping the surface of Mars

In [None]:
# URL of the NASA Mars news listing that will be scraped
mars_news = 'https://mars.nasa.gov/news/'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page.
response = requests.get(mars_news, timeout=30)
response.raise_for_status()

html = response.text

# Parse the markup so the following cells can search it
mars_soup = BeautifulSoup(html, 'html.parser')

# Show only the <title> as a sanity check rather than dumping the whole document
mars_soup.title

In [None]:
# Collect every <div> whose class is "content_title" from the parsed news page
mars_results = mars_soup.find_all(name='div', attrs={'class': "content_title"})

mars_results

In [None]:
# Keep only the title elements that contain an anchor with non-empty text.
# The elements themselves (not their text) are stored; the text is extracted
# in the next cell.
mars_titles = [
    result
    for result in mars_results
    if result.a and result.a.text
]

mars_titles

In [None]:
# Cleaning text to only show titles, stripping the HTML coding syntax.
# str.strip() treats its argument as a set of characters, so strip('\n\n')
# only ever removed newlines. Calling strip() with no argument also removes
# the spaces, tabs and carriage returns that can surround the title text.
stripped_mars_titles = [title.text.strip() for title in mars_titles]

stripped_mars_titles

In [None]:
# Collect the teaser paragraphs: every <div> whose class is
# "rollover_description_inner" in the parsed news page
para_results = mars_soup.find_all(name='div', attrs={'class': "rollover_description_inner"})

para_results

In [None]:
# List for cleaned up text.
# str.strip() treats its argument as a set of characters, so strip('\n\n')
# only ever removed newlines. Calling strip() with no argument also removes
# any other surrounding whitespace left over from the HTML layout.
stripped_paras = [para.text.strip() for para in para_results]

stripped_paras

# Time for the Image!

In [None]:
# URL of the JPL space-images page used for the featured image
urlI = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'

# The timeout prevents an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of parsing an error page.
response = requests.get(urlI, timeout=30)
response.raise_for_status()

html = response.text

# Parsed page, searched for image links in the following cells
mars_image = BeautifulSoup(html, 'html.parser')

In [None]:
# Print an indented rendering of the parsed image page so its structure is
# easier to read when looking for the image links.

print(mars_image.prettify())

In [None]:
# Find the anchors whose class attribute is "showimg fancybox-thumbs"
images = mars_image.find_all(name='a', attrs={'class': "showimg fancybox-thumbs"})
images

In [None]:
# Relative path of the featured image.
# NOTE(review): this path is hard-coded rather than read from the `images`
# anchors found in the previous cell — confirm that is intended.
pic = "image/featured/mars1.jpg"

# Absolute URL = site root + relative image path
jpl_root = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'
feature_image_url = jpl_root + pic
feature_image_url

# Grabbing Table Information

In [None]:
# Mars facts page; `url` is also what the next cell hands to pd.read_html
url = 'https://space-facts.com/mars/'

# The timeout prevents an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of parsing an error page.
response3 = requests.get(url, timeout=30)
response3.raise_for_status()
soup = BeautifulSoup(response3.text, 'html.parser')

In [None]:
# Let pandas fetch the page and parse every <table> it finds into a DataFrame;
# the result is a list, and the first entry is displayed as a preview.
mars_tables = pd.read_html(io=url)
mars_tables[0]

In [None]:
# Take the first parsed table as the working DataFrame.
# .copy() matters: without it mars_df is just another name for mars_tables[0],
# so the in-place edits made in the following cells (renaming columns,
# rewriting the 'Statistic' column) would silently change the raw table too.
mars_df = mars_tables[0].copy()
mars_df

In [None]:
# Give the two columns descriptive labels (assigned in place on mars_df)
column_labels = ['Statistic', 'Measurement']
mars_df.columns = column_labels
mars_df

In [None]:
# Remove the ':' characters from both ends of every statistic label
mars_df['Statistic'] = mars_df['Statistic'].str.strip(':')
mars_df

In [None]:
# setting Statistic as the index
# The guard makes this cell safe to re-run: once 'Statistic' has become the
# index it is no longer a column, so a second set_index() call would raise
# KeyError.
if mars_df.index.name != 'Statistic':
    mars_df = mars_df.set_index('Statistic')
mars_df

In [None]:
# Render the DataFrame as an HTML <table> string. With no arguments, to_html()
# returns the markup instead of writing it anywhere; the string is then shown
# as the cell output.
html_mars_table = mars_df.to_html()
html_mars_table

In [None]:
# Write the same HTML table to the file 'mars_html_table.html'. The path is
# relative, so the file is created in the current working directory.
mars_df.to_html('mars_html_table.html')

# Gathering hemispherical images

In [None]:
# Setting up the Chrome browser that splinter drives.
# ChromeDriverManager().install() downloads a matching chromedriver if needed
# and returns its path, so that path is handed straight to splinter. This
# replaces a hard-coded 'chromedriver.exe' and avoids opening a second, unused
# Selenium Chrome window that was never closed.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [None]:
# Point the splinter-driven browser at the USGS Astrogeology search results
# page; the query string in the URL is 'hemisphere+enhanced' with target Mars.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [None]:
# Collect, for the first four search results, the hemisphere title and the
# URL of its detail page (where the full-size image lives).
nextpage_urls = []
imgtitles = []
base_url = 'https://astrogeology.usgs.gov'

# Snapshot of the page currently loaded in the browser, parsed for searching
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Each result's title and link sit inside a <div class="description">
divs = soup.find_all('div', class_='description')

# Only the first four results are used
for div in divs[:4]:
    anchor = div.find('a')
    detail_path = anchor['href']
    heading = div.a.find('h3').text
    imgtitles.append(heading)
    # hrefs are appended to the site root to form the detail-page URL
    nextpage_urls.append(base_url + detail_path)

print(nextpage_urls)
print(imgtitles)

In [None]:
# Visit each hemisphere detail page and record the URL of its full-size image.
#
# nextpage_urls is deliberately left untouched: resetting it inside the loop
# emptied the input list after the first pass, so re-running this cell
# produced no images and the next cell failed with IndexError.

the_images = []
for nextpage_url in nextpage_urls:
    url = nextpage_url
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # The full-resolution picture is the <img class="wide-image"> element
    link2 = soup.find('img', class_ = "wide-image")
    if link2 is None:
        # Fail with a readable message instead of "'NoneType' is not subscriptable"
        raise ValueError("No 'wide-image' element found on {}".format(url))

    finals = link2['src']
    # src is appended to the site root to form the image URL
    full_image = base_url + finals
    the_images.append(full_image)

the_images

In [None]:
# Creating final list of dictionaries
# keys   - 'title' and 'img_url'
# values - taken pairwise from imgtitles and the_images
#
# Pairing the two lists with zip avoids hard-coded indexes (which raise
# IndexError when fewer than four hemispheres were scraped) and per-hemisphere
# variable names that assume a particular result order.
hemisphere_image_urls = [
    {'title': title, 'img_url': img_url}
    for title, img_url in zip(imgtitles, the_images)
]
print(hemisphere_image_urls)