# Scraping with Pandas

In [1]:
from urllib.parse import urljoin

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

### NASA Mars News
Scrape the Mars News Site (url) to collect the latest News Titles and Paragraph text. 
Save these to variables to use them later.

Set up URL
Retrieve page with splinter

In [None]:
# Ask webdriver_manager for a ChromeDriver executable path, then start a
# visible (non-headless) Chrome session controlled by splinter.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Address of the Mars news site; load it in the splinter-controlled browser.
url = 'https://redplanetscience.com/'
browser.visit(url)

In [None]:
# The headlines are presumably rendered client-side (TODO confirm), so give the
# page a few seconds to show at least one title before snapshotting the DOM.
# is_element_present_by_css returns a bool instead of raising, so a timeout
# simply falls through to the explicit check below.
browser.is_element_present_by_css('div.content_title', wait_time=5)

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# First <div class="col-md-12"> on the page; treated as the news container.
sidebar = soup.find('div', class_="col-md-12")
if sidebar is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError("No div.col-md-12 container found at " + url)

# Every <div> nested under the container, at any depth.
categories = sidebar.find_all('div')

Examine the results to determine which element contains the title and paragraph.

In [None]:
# Inspect only the first few candidate <div>s rather than dumping the whole list.
categories[:3]

In [None]:
# Parallel lists: titles[i] and paragraphs[i] describe the same article.
titles = []
paragraphs = []

# `categories` contains nested <div>s, so one article can be reached through
# several enclosing wrappers; track what has already been collected.
seen_articles = set()

for category in categories:
    title_div = category.find('div', class_='content_title')
    teaser_div = category.find('div', class_='article_teaser_body')
    # Skip <div>s that do not contain both parts of an article, so the lists
    # never receive None placeholders.
    if title_div is None or teaser_div is None:
        continue

    title = title_div.text
    paragraph = teaser_div.text
    # Ignore empty text and articles already recorded via another wrapper.
    if not (title and paragraph) or (title, paragraph) in seen_articles:
        continue

    seen_articles.add((title, paragraph))
    titles.append(title)
    paragraphs.append(paragraph)

    # Print results
    print('-------------')
    print(title)
    print(paragraph)

In [None]:
# Shut down the browser session that was opened for the news scrape.
browser.quit()

### JPL Mars Space Images - Featured Image
Visit the Featured Space Image site (image_url) to collect images.
Use splinter to navigate to the site, find the image url for the current Featured Mars Image, and assign the url string to a variable called featured_image_url.
Make sure to find the image url to the full size .jpg image.
Make sure to save a complete url string for this image: for example:
featured_image_url = 'https://spaceimages-mars.com/image/featured/mars2.jpg'

#### set up splinter

In [None]:
# A fresh browser session is needed because the previous one was quit.
# Ask webdriver_manager for a ChromeDriver executable path, then start a
# visible (non-headless) Chrome session controlled by splinter.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Root of the featured-image site; it is also used later as the prefix for the
# image path scraped from the page.
image_url = 'https://spaceimages-mars.com/'
browser.visit(image_url)

In [None]:
# Snapshot the browser's current HTML and parse it with BeautifulSoup.
# Note: this rebinds `html` and `soup` from the news section.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Preview only the start of the parsed page; printing the whole document
# floods the cell output and bloats the saved notebook.
print(soup.prettify()[:2000])

In [None]:
# Anchor whose href points at the featured image file.
featured_link = soup.find('a', class_="showimg fancybox-thumbs")
if featured_link is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of a TypeError from subscripting None.
    raise ValueError("Featured image link not found on " + image_url)

# At this point the value is the href exactly as written in the page.
featured_image_url = featured_link['href']

In [None]:
# Resolve the scraped href against the site root. urljoin returns an
# already-absolute URL unchanged, so re-running this cell does not prepend
# the base a second time (plain string concatenation would).
featured_image_url = urljoin(image_url, featured_image_url)
print(featured_image_url)

In [None]:
# Shut down the browser session that was opened for the featured-image scrape.
browser.quit()

### Mars Facts
Visit the Mars Facts webpage (facts_url) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
Use Pandas to convert the data to an HTML table string.

In [2]:
# Page whose HTML tables hold the Mars facts.
facts_url = 'https://galaxyfacts-mars.com/'

In [3]:
# Fetch the page and parse the tables pandas finds on it into DataFrames,
# then display them all (the output below shows two tables).
tables = pd.read_html(facts_url)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [4]:
# read_html returns a list of DataFrames, one for each table pandas found.
type(tables)

list

In [5]:
# Pick the Mars/Earth comparison table by position. Copy it so the renaming
# and row removal done in later cells do not modify the frame stored in `tables`.
facts_df = tables[0].copy()

In [6]:
# Give the three columns descriptive names. The scraped header text is still
# present as row 0 at this point, as the preview shows.
comparison_columns = ['Mars - Earth Comparison', 'Mars', 'Earth']
facts_df.columns = comparison_columns
facts_df.head()

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"


In [7]:
# Row 0 only repeats the column headings, so remove it. Rebinding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to re-run after the row is already gone.
facts_df = facts_df.drop(index=0, errors='ignore')

In [8]:
# Use the description column as the row index. Once it has become the index it
# is no longer a column, so the membership check makes a second run a no-op
# instead of a KeyError.
index_column = 'Mars - Earth Comparison'
if index_column in facts_df.columns:
    facts_df = facts_df.set_index(index_column)

In [9]:
# Display the comparison table after the clean-up steps.
facts_df

Unnamed: 0_level_0,Mars,Earth
Mars - Earth Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [10]:
# Second table on the page: the Mars planet profile. Copy it so that the
# renaming and re-indexing do not modify the frame stored in `tables`.
mars_profile_df = tables[1].copy()
mars_profile_df.columns = ['Mars','Planet Profile']

In [11]:
# Use the attribute-name column ('Mars') as the row index. The membership
# check makes a re-run a no-op instead of a KeyError once the column has
# already been moved into the index.
if 'Mars' in mars_profile_df.columns:
    mars_profile_df = mars_profile_df.set_index('Mars')
mars_profile_df

Unnamed: 0_level_0,Planet Profile
Mars,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 ( Phobos & Deimos )
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


### Mars Hemispheres
Visit the astrogeology site (astro_url) to obtain high resolution images for each of Mars's hemispheres.

- Click each of the links to the hemispheres in order to find the image url to the full resolution image.
- Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. 
- Use a Python dictionary to store the data using the keys img_url and title.
- Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

That said, in this case, you might find (hint, hint) that you're able to read all links (anchor elements) into a list. In that case, you could simply iterate over that list and click on each link.
You might find it helpful to look into the .click() method.

In [None]:
# Root URL of the Mars hemispheres site.
# NOTE(review): no cell visible here visits this URL or collects the hemisphere
# images yet — TODO implement the scrape described in the section text.
astro_url = 'https://marshemispheres.com/'

## DataFrames as HTML

#### Pandas also has a `to_html` method that we can use to generate HTML tables from DataFrames.

In [None]:
# Render the Mars/Earth facts table as an HTML string. facts_df is used
# because no earlier cell defines a bare `df`, so `df.to_html()` would raise
# NameError on a fresh kernel.
html_table = facts_df.to_html()
html_table

#### You may have to strip unwanted newlines to clean up the table.

In [None]:
# Show the table markup with newline characters removed. The result is only
# displayed, not assigned, so html_table itself is left unchanged.
html_table.replace('\n', '')

You can also save the table directly to a file.

In [None]:
# Write the Mars/Earth facts table to table.html in the working directory.
# facts_df is used because no earlier cell defines a bare `df`.
facts_df.to_html('table.html')

In [None]:
# macOS only: `!` runs a shell command, and `open` asks macOS to open the file
# with its default application. On other systems, locate table.html and open
# it in a browser manually.
!open table.html