Project 1 - Space Weather Reporting

Imports for libraries to be used in this project.

In [35]:
#standard library
import datetime
from io import StringIO
from urllib.error import HTTPError
from urllib.request import urlopen

#html
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

#data
import numpy as np
import pandas as pd

Part 1: Data Scraping and Preparation

Use selenium webdriver to get the URL.

In [36]:
# Load the page in a Chrome session so that `html` holds the page source as
# the browser sees it.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get("https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares.html")
    html = driver.page_source
finally:
    # Always shut the browser down, even when the page load fails; otherwise
    # the Chrome/chromedriver processes are left running.
    driver.quit()
# Errors are deliberately allowed to propagate: catching and printing them
# would leave `html` undefined and only move the failure to a later cell.

Extract the text from the page. Then use BeautifulSoup to read and parse the data, either as html or lxml.

In [37]:
# Parse the downloaded page source with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html, features='html.parser')

Use prettify( ) to view the content and find the appropriate table.

In [38]:
# Pretty-print the parsed document so the table markup can be located by eye.
pretty_html = soup.prettify()
print(pretty_html)

<html data-bs-theme="light" lang="en">
 <head>
  <title>
   Top 50 solar flares | Solar activity | SpaceWeatherLive.com
  </title>
  <meta charset="utf-8"/>
  <meta content="index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1" name="robots"/>
  <meta content="On this page you will find an overview of the strongest solar flares since June 1996 together with links to more information in our archive and a v..." name="description"/>
  <meta content="SpaceWeatherLive, Live, Aurora, Auroral activity, Aurora Australis, Aurora Borealis, northern lights, Solar wind, Kp-index, Space Weather, Space Weather Updates, Aurora forecast, Space Weather Alerts, Solar activity, Solar flares, Sunspots, Aurora alert, Auroral activity, The Sun, SDO, STEREO, EPAM, DSCOVR" name="keywords"/>
  <!-- Facebook meta -->
  <meta content="https://spaceweatherlive.com/en/solar-activity/top-50-solar-flares.html" property="og:url"/>
  <meta content="article" property="og:type"/>
  <meta content=

Use find to save table as a variable.

In [39]:
# Keep the first <table> element in the document.
table = soup.find(name='table')

Use pandas to read in the HTML file into a dataframe.

In [40]:
# pd.read_html expects a path, URL, or file-like object; passing the markup as
# a plain string is deprecated, so wrap it in StringIO.
df = pd.read_html(StringIO(str(table)), header=0, flavor='lxml')[0]

# Name all eight columns explicitly instead of relying on the header row
# parsed from the page.
columns = ['Rank', 'X_Class', 'Date', 'Region', 'Start', 'Maximum', 'Stop', 'View_archive']
df.columns = columns
df

  df = pd.read_html(str(table), header=0, flavor='lxml')[0]


Unnamed: 0,Rank,X_Class,Date,Region,Start,Maximum,Stop,View_archive
0,1,X40+,2003/11/04,486,19:29,19:53,20:06,View archive
1,2,X28.57+,2001/04/02,9393,21:32,21:51,22:03,View archive
2,3,X24.57+,2003/10/28,486,09:51,11:10,11:24,View archive
3,4,X24.42+,2005/09/07,808,17:17,17:40,18:03,View archive
4,5,X20.67+,2001/04/15,9415,13:19,13:50,13:55,View archive
5,6,X14.36,2003/10/29,486,20:37,20:49,21:01,View archive
6,7,X13.37,2017/09/06,2673,11:53,12:02,12:10,View archive
7,8,X12.97,1997/11/06,8100,11:49,11:55,12:01,View archive
8,9,X12.95,2006/12/05,930,10:18,10:35,10:45,View archive
9,10,X11.96,2003/11/02,486,17:03,17:25,17:39,View archive


Part 2 - Tidy up the top 50 solar flare data

Drop the last column of the dataframe.

In [41]:
# Drop the trailing link column by name rather than by position, so re-running
# this cell is a no-op instead of silently removing another column.
df = df.drop(columns=['View_archive'], errors='ignore')
df

Unnamed: 0,Rank,X_Class,Date,Region,Start,Maximum,Stop
0,1,X40+,2003/11/04,486,19:29,19:53,20:06
1,2,X28.57+,2001/04/02,9393,21:32,21:51,22:03
2,3,X24.57+,2003/10/28,486,09:51,11:10,11:24
3,4,X24.42+,2005/09/07,808,17:17,17:40,18:03
4,5,X20.67+,2001/04/15,9415,13:19,13:50,13:55
5,6,X14.36,2003/10/29,486,20:37,20:49,21:01
6,7,X13.37,2017/09/06,2673,11:53,12:02,12:10
7,8,X12.97,1997/11/06,8100,11:49,11:55,12:01
8,9,X12.95,2006/12/05,930,10:18,10:35,10:45
9,10,X11.96,2003/11/02,486,17:03,17:25,17:39


Use pandas `to_datetime` to combine the date with each of the three time columns (start, maximum, stop) into three datetime columns.

In [42]:
# Join the flare date with each clock time and parse the result. Each
# expression already operates on the whole column, so no row loop is needed.
# `assign` returns a new frame, which avoids writing into a slice of the
# scraped table.
df = df.assign(
    Start_DateTime=pd.to_datetime(df['Date'] + ' ' + df['Start']),
    Max_DateTime=pd.to_datetime(df['Date'] + ' ' + df['Maximum']),
    Stop_DateTime=pd.to_datetime(df['Date'] + ' ' + df['Stop']),
)

# Keep only the columns used from here on. The raw Date/Start/Maximum/Stop text
# columns and Region are not carried forward.
df = df[['Rank', 'X_Class', 'Start_DateTime', 'Max_DateTime', 'Stop_DateTime']]
df

Unnamed: 0,Rank,X_Class,Start_DateTime,Max_DateTime,Stop_DateTime
0,1,X40+,2003-11-04 19:29:00,2003-11-04 19:53:00,2003-11-04 20:06:00
1,2,X28.57+,2001-04-02 21:32:00,2001-04-02 21:51:00,2001-04-02 22:03:00
2,3,X24.57+,2003-10-28 09:51:00,2003-10-28 11:10:00,2003-10-28 11:24:00
3,4,X24.42+,2005-09-07 17:17:00,2005-09-07 17:40:00,2005-09-07 18:03:00
4,5,X20.67+,2001-04-15 13:19:00,2001-04-15 13:50:00,2001-04-15 13:55:00
5,6,X14.36,2003-10-29 20:37:00,2003-10-29 20:49:00,2003-10-29 21:01:00
6,7,X13.37,2017-09-06 11:53:00,2017-09-06 12:02:00,2017-09-06 12:10:00
7,8,X12.97,1997-11-06 11:49:00,1997-11-06 11:55:00,1997-11-06 12:01:00
8,9,X12.95,2006-12-05 10:18:00,2006-12-05 10:35:00,2006-12-05 10:45:00
9,10,X11.96,2003-11-02 17:03:00,2003-11-02 17:25:00,2003-11-02 17:39:00


Replace any '-' with NaN

In [44]:
# DataFrame.replace returns a new frame; assign it back so the '-' placeholders
# are actually converted to NaN in `df` rather than only in the displayed output.
df = df.replace('-', np.nan)
df

Unnamed: 0,Rank,X_Class,Start_DateTime,Max_DateTime,Stop_DateTime
0,1,X40+,2003-11-04 19:29:00,2003-11-04 19:53:00,2003-11-04 20:06:00
1,2,X28.57+,2001-04-02 21:32:00,2001-04-02 21:51:00,2001-04-02 22:03:00
2,3,X24.57+,2003-10-28 09:51:00,2003-10-28 11:10:00,2003-10-28 11:24:00
3,4,X24.42+,2005-09-07 17:17:00,2005-09-07 17:40:00,2005-09-07 18:03:00
4,5,X20.67+,2001-04-15 13:19:00,2001-04-15 13:50:00,2001-04-15 13:55:00
5,6,X14.36,2003-10-29 20:37:00,2003-10-29 20:49:00,2003-10-29 21:01:00
6,7,X13.37,2017-09-06 11:53:00,2017-09-06 12:02:00,2017-09-06 12:10:00
7,8,X12.97,1997-11-06 11:49:00,1997-11-06 11:55:00,1997-11-06 12:01:00
8,9,X12.95,2006-12-05 10:18:00,2006-12-05 10:35:00,2006-12-05 10:45:00
9,10,X11.96,2003-11-02 17:03:00,2003-11-02 17:25:00,2003-11-02 17:39:00
