# Deliverable 1: web scraping the Mars News
- Scrape titles and preview text from Mars news articles.
- Optionally export the data into a JSON file or a MongoDB database. 

In [1]:
# import dependencies
import pandas as pd
import numpy as np
from splinter import Browser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
# import below when using Chrome browser
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
import time
import json
import os
# Path to a locally installed chromedriver ("~" is expanded to the user's home
# directory); it is passed to selenium's Service in Method 2
chromedrv = os.path.expanduser('~/ucbDA/chromedriver_win32/chromedriver')

## Method 1: splinter's executable_path

In [3]:
# Chrome start-up flags: English UI, maximized window, no notification pop-ups
options = webdriver.ChromeOptions()
for flag in ("--lang=en", "--start-maximized", "--disable-notifications"):
    options.add_argument(flag)
# Launch a visible (non-headless) Chrome through splinter, using the
# chromedriver path returned by ChromeDriverManager().install().
# NOTE(review): the executable_path keyword has been deprecated -- confirm it
# is still accepted by the installed splinter/selenium versions.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, options=options, **executable_path)
# Visit a site defined in the url
def geturl(url, selector='div.list_text', wait_time=1):
    """Open *url* in the splinter browser and wait briefly for its content.

    Args:
        url: Address of the page to load.
        selector: CSS selector of an element whose presence signals that the
            page has rendered. Defaults to the news-article container.
        wait_time: Maximum time, in seconds, to wait for ``selector``.

    The wait is best-effort: if the element never appears, the function
    still returns normally and nothing is raised.
    """
    browser.visit(url)
    # Poll for the element for up to `wait_time` seconds; the boolean result
    # is intentionally ignored so a slow page does not abort the notebook.
    browser.is_element_present_by_css(selector, wait_time=wait_time)

In [4]:
# Open the Mars news site (redplanetscience.com) in the splinter-driven browser
geturl('https://redplanetscience.com')

In [5]:
# Take the page source as currently rendered by splinter and parse it with
# Python's built-in HTML parser
html = browser.html
news_soup = soup(html, features='html.parser')
# First article container on the page (None when nothing matches)
slide_elem = news_soup.select_one('div.list_text')

In [6]:
# Read the headline text from inside the first article container
title = slide_elem.find('div', class_='content_title').get_text()

In [7]:
# Read the teaser paragraph text from inside the same article container
preview = slide_elem.find('div', class_='article_teaser_body').get_text()

In [8]:
# Wrap the scraped fields in a one-item list of dicts so it serializes as a
# JSON array
news_list = [dict(title=title, preview=preview)]
news_list

[{'title': 'NASA Invites Public to Share Excitement of Mars 2020 Perseverance Rover Launch',
  'preview': 'There are lots of ways to participate in the historic event, which is targeted for July 30.'}]

In [9]:
# Export the scraped list of dicts to a JSON file
outfile = './Data/mars_data.json'
# Make sure the target folder exists; open() would otherwise raise
# FileNotFoundError on a fresh checkout
os.makedirs(os.path.dirname(outfile), exist_ok=True)
# The with-block closes the file on exit, so no explicit close() is needed
with open(outfile, 'w', encoding='utf-8') as f:
    json.dump(news_list, f, ensure_ascii=False, indent=4)

In [10]:
# Verify the JSON file by reading it back; the with-block guarantees the
# file handle is closed once the data is loaded
with open(outfile, 'r', encoding='utf-8') as infile:
    mars_data = json.load(infile)
mars_data

[{'title': 'NASA Invites Public to Share Excitement of Mars 2020 Perseverance Rover Launch',
  'preview': 'There are lots of ways to participate in the historic event, which is targeted for July 30.'}]

In [11]:
# Close the splinter-controlled browser now that Method 1 is finished
browser.quit()

## Method 2: selenium's webdriver

In [12]:
# Chrome start-up flags: English UI, maximized window, no notification pop-ups
options = webdriver.ChromeOptions()
for flag in ("--lang=en", "--start-maximized", "--disable-notifications"):
    options.add_argument(flag)
# Start Chrome directly through selenium 4 (no splinter involved), using the
# local chromedriver whose path is stored in `chromedrv`
driver = webdriver.Chrome(options=options, service=Service(chromedrv))
# Visit a site defined in the url
def geturl(url, selector='div.list_text', wait_time=1):
    """Open *url* in the selenium driver and wait briefly for its content.

    Args:
        url: Address of the page to load.
        selector: CSS selector of an element whose presence signals that the
            page has rendered. Defaults to the news-article container.
        wait_time: Maximum time, in seconds, to wait for ``selector``.

    An explicit wait is used instead of ``driver.implicitly_wait``: the
    implicit wait only sets a timeout for later element look-ups and does not
    pause after ``driver.get``, so it gave no protection when the page source
    was read right away. The wait is best-effort: if the element never
    appears, the function still returns normally and nothing is raised.
    """
    # Imported locally so this cell is self-contained
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    driver.get(url)
    try:
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
    except TimeoutException:
        # Deliberately ignored: a slow page should not abort the notebook;
        # the parsing cells will surface a missing element.
        pass

In [13]:
# Open the Mars news site (redplanetscience.com) in the selenium-driven Chrome session
geturl('https://redplanetscience.com')

In [14]:
# Take the page source as currently rendered by selenium and parse it with
# Python's built-in HTML parser
html = driver.page_source
news_soup = soup(html, features='html.parser')
# First article container on the page (None when nothing matches)
slide_elem = news_soup.select_one('div.list_text')

In [15]:
# Read the headline text from inside the first article container
title = slide_elem.find('div', class_='content_title').get_text()

In [16]:
# Read the teaser paragraph text from inside the same article container
preview = slide_elem.find('div', class_='article_teaser_body').get_text()

In [17]:
# Wrap the scraped fields in a one-item list of dicts so it serializes as a
# JSON array
news_list = [dict(title=title, preview=preview)]
news_list

[{'title': "HiRISE Views NASA's InSight and Curiosity on Mars",
  'preview': 'New images taken from space offer the clearest orbital glimpse yet of InSight as well as a view of Curiosity rolling along.'}]

In [18]:
# Export the scraped list of dicts to a JSON file
outfile = './Data/mars_data.json'
# Make sure the target folder exists; open() would otherwise raise
# FileNotFoundError on a fresh checkout
os.makedirs(os.path.dirname(outfile), exist_ok=True)
# The with-block closes the file on exit, so no explicit close() is needed
with open(outfile, 'w', encoding='utf-8') as f:
    json.dump(news_list, f, ensure_ascii=False, indent=4)

In [19]:
# Verify the JSON file by reading it back; the with-block guarantees the
# file handle is closed once the data is loaded
with open(outfile, 'r', encoding='utf-8') as infile:
    mars_data = json.load(infile)
mars_data

[{'title': "HiRISE Views NASA's InSight and Curiosity on Mars",
  'preview': 'New images taken from space offer the clearest orbital glimpse yet of InSight as well as a view of Curiosity rolling along.'}]

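### Create a Mongo Database
Run the following command in a terminal, from the `Data` folder that holds `mars_data.json`, to import the exported JSON into the `news_list` collection of the `mars_data` database: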
```
mongoimport --type json -d mars_data -c news_list --drop --jsonArray mars_data.json
```

In [20]:
from pymongo import MongoClient
# Connect to MongoDB on port 27017 (no host is given, so pymongo's default
# host is used)
mongo = MongoClient(port=27017)
# List the databases on the server to confirm that mars_data is present
mongo.list_database_names()

['admin', 'config', 'local', 'mars_data', 'my_db1', 'petsitly_marketing']

In [21]:
# Select the mars_data database and list the collections it contains
mars_db = mongo['mars_data']
mars_db.list_collection_names()

['news_list', 'customer_list']

In [22]:
# Fetch one document from the news_list collection to spot-check the import.
# Re-binding mars_db repeats the previous cell's assignment; it is redundant
# but harmless.
mars_db = mongo['mars_data']
mars_db['news_list'].find_one()

{'_id': ObjectId('6369157d61942092889c405c'),
 'title': "NASA's Curiosity Takes Selfie With 'Mary Anning' on the Red Planet",
 'preview': 'The Mars rover has drilled three samples of rock in this clay-enriched region since arriving in July.'}

In [23]:
# Close the selenium-controlled Chrome session now that Method 2 is finished
driver.quit()