-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
103 lines (72 loc) · 3.12 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from splinter import Browser
from bs4 import BeautifulSoup
import time
import pandas as pd
from urllib.parse import urlsplit
def init_browser():
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
return Browser("chrome", **executable_path, headless = False)
def scrape():
browser = init_browser()
mars_data = {}
url = "https://mars.nasa.gov/news/"
browser.visit(url)
time.sleep(1)
html = browser.html
soup = BeautifulSoup(html,"html.parser")
news_title = soup.find("div",class_="content_title").text
news_paragraph = soup.find("div", class_="article_teaser_body").text
mars_data['news_title'] = news_title
mars_data['news_paragraph'] = news_paragraph
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
browser.click_link_by_partial_text('FULL IMAGE')
try:
browser.find_by_css('.fancybox-expand').first.click()
except:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
img_location = soup.find('img', class_='fancybox-image')
img_url = img_location['src']
featured_image = f'https://www.jpl.nasa.gov{img_url}'
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
img_location = soup.find('img', class_='fancybox-image')
img_url = img_location['src']
featured_image = f'https://www.jpl.nasa.gov{img_url}'
mars_data["featured_image"] = featured_image
url_weather = "https://twitter.com/marswxreport?lang=en"
browser.visit(url_weather)
html_weather = browser.html
soup = BeautifulSoup(html_weather, "html.parser")
mars_weather = soup.find("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text
mars_data["mars_weather"] = mars_weather
url_facts = "https://space-facts.com/mars/"
time.sleep(2)
table = pd.read_html(url_facts)
table[0]
df_mars_facts = table[0]
df_mars_facts.columns = ["Parameter", "Values"]
clean_table = df_mars_facts.set_index(["Parameter"])
mars_html_table = clean_table.to_html()
mars_html_table = mars_html_table.replace("\n", "")
mars_data["facts_table"] = mars_html_table
url_hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_hemispheres)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
hemispheres = soup.find_all('div', class_='item')
hemisphere_image_urls = []
for hemisphere in hemispheres:
href = hemisphere.find('a', href=True).get('href')
title = hemisphere.find('div', class_='description')
title = hemisphere.find('a').text
browser.visit(f"https://astrogeology.usgs.gov{href}")
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
img_url = soup.find('img', class_="wide-image").get('src')
img_url = f"https://astrogeology.usgs.gov{img_url}"
hemisphere_image_urls.append({"title":title,"img_url":img_url})
mars_data["hemisphere_image_urls"] = hemisphere_image_urls
browser.quit()
return mars_data