# LASCO CME Catalog Analysis Notebook
This notebook is for analyzing current LASCO CME catalog to look for differences in CMEs.

Want to understand estimated durations of CME events and build a histogram of this 
data.

In [1]:
import requests
import pandas as pd

In [2]:
# pull the cme catalog to local disk
cme_catalog_url = 'https://cdaw.gsfc.nasa.gov/CME_list/UNIVERSAL/text_ver/univ_all.txt'
response = requests.get(cme_catalog_url, allow_redirects=True)
response

<Response [200]>

In [3]:
# cache locally, just in case..
open('../lasco_catalog.txt', 'wb').write(response.content)

3567412

In [4]:
# parse out data into string lines from byte block
raw_lines = response.content.decode('utf-8').split('\n')

In [5]:
#snag field names from top of file. Its not entirely correct, 
# but 'good enough' for now
fields = raw_lines[1].split()
fields

['Date',
 'Time',
 'Central',
 'Width',
 'Linear',
 '2nd',
 'order',
 'speed',
 'Accel',
 'Mass',
 'Kinetic',
 'MPA',
 'Remarks']

In [6]:
# parse remainder of lines into dataframe using regex pattern
# only date and time fields appear to be consistently same format,
# we have to guard against non-numbers in all of the other columns
import re

pattern_str = r"^(\d{4}/\d{2}/\d{2})\s+(\d{2}:\d{2}:\d{2})\s+(\S+)\s+(\S+)\s+(\S+)\s+" +\
              r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+?)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S[\S|\s]+)?$"
pattern = re.compile(pattern_str)

data_arr = { fld:[] for fld in fields }
# parse remainder into columns
for line in raw_lines[4:]:
    m = pattern.match(line)
    if m:
        for grp in range(1,len(m.groups())+1): 
            #print(f"{grp}: %s" % m.group(grp))
            data_arr[fields[grp-1]].append(m.group(grp))
    else:
        print (f"Warning: cannot parse line:\n  {line}")
        
data = pd.DataFrame(data_arr)
data

  2001/10/06  18:06:05    100    104    296     275    318    319      1.1    4.3e+15    1.9e+30     74   ]
  2014/11/23  16:12:05     94     57    601     548    661    647      5.0    1.3e+15    2.3e+30     96   \
  2016/12/30  02:24:05    232     25    333     321    346    354      1.0    9.8e+14    5.5e+29    231   n
  


Unnamed: 0,Date,Time,Central,Width,Linear,2nd,order,speed,Accel,Mass,Kinetic,MPA,Remarks
0,1996/01/11,00:14:36,267,18,499,571,426,0,-64.3*,-------,-------,272,Only C3
1,1996/01/13,22:08:30,265,16,290,278,303,372,2.8*,-------,-------,266,Only C3
2,1996/01/15,07:01:10,262,43,525,600,454,0,-31.1,-------,-------,272,Only C3
3,1996/01/22,03:11:01,105,37,267,401,130,0,-126.3*,7.1e+13*,2.5e+28*,103,Very Poor Event; Only C3
4,1996/01/26,09:16:19,90,27,262,254,271,322,1.9*,3.0e+14,1.0e+29,90,Poor Event; Only C3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30313,2020/05/29,11:24:05,84,52,347,406,288,0,-21.9*,-------,-------,68,Poor Event; Only C2
30314,2020/05/29,15:12:05,83,46,204,215,193,0,-2.8*,-------,-------,74,Poor Event; Only C2
30315,2020/05/29,17:36:05,76,34,210,295,120,0,-19.3*,-------,-------,73,Poor Event; Only C2
30316,2020/05/30,01:25:43,80,39,269,203,339,331,2.8,-------,-------,86,


In [7]:
# sadly, we have all object types because of poor measurements recorded as non-numbers
#
data.dtypes

Date       object
Time       object
Central    object
Width      object
Linear     object
2nd        object
order      object
speed      object
Accel      object
Mass       object
Kinetic    object
MPA        object
Remarks    object
dtype: object