# Scraping with Pandas

In [None]:
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pymongo
import requests

### NASA Mars News
Scrape the Mars News Site (url) to collect the latest News Titles and Paragraph text. 
Save these to variables to use them later.

Set up URL
Retrieve page with splinter

In [None]:
# Resolve the chromedriver path via webdriver_manager, then start a
# splinter-controlled Chrome with a visible window (headless=False).
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Mars news site to scrape; load it in the splinter-driven browser.
url = 'https://redplanetscience.com/'
browser.visit(url)

In [None]:
# Parse the HTML currently loaded in the browser.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Take the first div with class "col-md-12" and collect every div nested
# inside it (find_all searches all descendants, not just direct children).
sidebar = soup.find('div', class_="col-md-12")
categories = sidebar.find_all('div')

Examine the results and determine which element contains the title and paragraph.

In [None]:
#print(categories)

In [None]:
# Collect the headline and teaser text of each news item.
# `titles[i]` and `paragraphs[i]` always describe the same article.
titles = []
paragraphs = []

# `categories` holds every nested div, so one article can be reached through
# each of its enclosing divs; remember the pairs already recorded.
seen = set()

for category in categories:
    title = getattr(category.find('div', class_='content_title'), 'text', None)
    paragraph = getattr(category.find('div', class_='article_teaser_body'), 'text', None)

    # Skip divs that do not contain both a title and a teaser instead of
    # storing None placeholders in the result lists.
    if not (title and paragraph):
        continue

    # Skip an article that was already captured via another enclosing div.
    if (title, paragraph) in seen:
        continue
    seen.add((title, paragraph))

    titles.append(title)
    paragraphs.append(paragraph)

    # Print results
    print('-------------')
    print(title)
    print(paragraph)

In [None]:
# Close the browser session opened for the news scrape.
browser.quit()

### JPL Mars Space Images - Featured Image
Visit the Featured Space Image site (image_url) to collect images.
Use splinter to navigate to the site, find the image url for the current Featured Mars Image, and assign the url string to a variable called featured_image_url.
Make sure to find the image url to the full size .jpg image.
Make sure to save a complete url string for this image, for example:
featured_image_url = 'https://spaceimages-mars.com/image/featured/mars2.jpg'

#### set up splinter

In [None]:
# Resolve the chromedriver path via webdriver_manager, then start a
# splinter-controlled Chrome with a visible window (headless=False).
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Root of the featured-image site; also used later as the prefix for the
# image's relative href.
image_url = 'https://spaceimages-mars.com/'
browser.visit(image_url)

In [None]:
# Parse the HTML currently loaded in the browser.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Dump the parsed page to locate the element holding the featured image link.
print(soup)

In [None]:
# href of the first <a> whose class attribute is "showimg fancybox-thumbs".
# Raises TypeError if no such link exists, because find() then returns None.
featured_image_url = soup.find('a', class_="showimg fancybox-thumbs")['href']

In [None]:
# Build the absolute image URL from the site root and the scraped href.
# urljoin returns an already-absolute URL unchanged, so re-running this cell
# does not prepend the site root a second time, and it avoids a doubled slash
# when the href starts with '/'.
from urllib.parse import urljoin

featured_image_url = urljoin(image_url, featured_image_url)
print(featured_image_url)

In [None]:
# Close the browser session opened for the featured-image scrape.
browser.quit()

### Mars Facts
Visit the Mars Facts webpage (facts_url) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
Use Pandas to convert the data to an HTML table string.

In [None]:
# Page whose HTML tables are read with pandas.
facts_url = 'https://galaxyfacts-mars.com/'

In [None]:
# Fetch the page and parse each HTML table on it into a DataFrame.
tables = pd.read_html(facts_url)
tables

In [None]:
# read_html returns a list of DataFrames, one for each table pandas found
type(tables)

In [None]:
# Pick the first table by position. This is the same object as tables[0],
# so later in-place edits to facts_df also change tables[0].
facts_df = tables[0]

In [None]:
# Give the three columns descriptive names, then preview the first rows.
facts_df.columns = ['Mars - Earth Comparison','Mars','Earth']
facts_df.head()

In [None]:
# Remove the row labelled 0 in place -- presumably the page's own header row,
# now redundant with the column names (confirm against the table).
# Re-running this cell raises KeyError because label 0 is already gone.
facts_df.drop([0], inplace=True)

In [None]:
# Use the comparison-label column as the row index (modifies facts_df in place).
facts_df.set_index('Mars - Earth Comparison',inplace=True)

In [None]:
# Display the cleaned comparison table.
facts_df

In [None]:
# Pick the second table on the page and name its two columns.
mars_profile_df = tables[1]
mars_profile_df.columns = ['Mars','Planet Profile']

In [None]:
# Use the 'Mars' column as the row index (in place), then display the table.
mars_profile_df.set_index('Mars',inplace=True)
mars_profile_df

#### Use the pandas `to_html` method to generate HTML tables from DataFrames.

In [None]:
# Render each DataFrame as an HTML <table> string.
facts_html_table = facts_df.to_html()
profile_html_table = mars_profile_df.to_html()

#### Strip unwanted newlines to clean up the table.

In [None]:
# str.replace returns a new string and leaves the original untouched, so the
# result must be assigned back for the newlines to actually be removed.
profile_html_table = profile_html_table.replace('\n', '')
facts_html_table = facts_html_table.replace('\n', '')

# Show the cleaned facts table as the cell output.
facts_html_table

### Mars Hemispheres
Visit the astrogeology site (astro_url) to obtain high resolution images for each of Mars's hemispheres.

- Click each of the links to the hemispheres in order to find the image url to the full resolution image.
- Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. 
- Use a Python dictionary to store the data using the keys img_url and title.
- Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [None]:
# Set up splinter: resolve the chromedriver path via webdriver_manager, then
# start a Chrome instance with a visible window (headless=False).
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Hemisphere listing page; load it in the splinter-driven browser.
astro_url = 'https://marshemispheres.com/'
browser.visit(astro_url)

In [None]:
# Parse the HTML currently loaded in the browser.
# NOTE: the 'lxml' parser requires the third-party lxml package to be installed.
html = browser.html
soup = BeautifulSoup(html, 'lxml')

In [None]:
# Collect, for every hemisphere product link, its href and its heading text.
# `images[i]` and `titles[i]` always refer to the same link.
images = []
titles = []

for link in soup.find_all('a', class_='itemLink product-item'):

    image = link.get('href')
    images.append(image)

    # Store the heading's text rather than the bs4 Tag object. A link without
    # an <h3> records None so both lists stay the same length.
    heading = link.find('h3')
    title = heading.get_text(strip=True) if heading is not None else None
    titles.append(title)

In [None]:
# TODO: iterate over the link list, using .click() on each link to pull the high-res image url

In [None]:
# Close the browser session opened for the hemispheres scrape.
browser.quit()