# CDAW CME Catalog Creation Notebook
This notebook is for parsing the online version and creating a CDAW CME catalog we may use. Changes include: we'll calculate duration of events in C2, C3 detectors and write as CSV.

In [1]:
import requests
import pandas as pd

Need_To_Redownload = False

CDAW_Catalog_Filename = '../cdaw_catalog.txt'

All_CME_Catalog_Filename = '../all_cme_event_catalog.csv'  # this one is cleaned of weak/poor CME
CME_Catalog_Filename = '../cme_event_catalog.csv'  # this one is cleaned of weak/poor CME
Raw_CME_Catalog_Filename = '../raw_cme_event_catalog.csv' # this one is uncleaned and useful for calculation of non-CME time intervals

In [2]:
# pull the cme catalog to local disk
if Need_To_Redownload:
    
    cme_catalog_url = 'https://cdaw.gsfc.nasa.gov/CME_list/UNIVERSAL/text_ver/univ_all.txt'
    response = requests.get(cme_catalog_url, allow_redirects=True)
    
    # cache locally
    open(CDAW_Catalog_Filename, 'wb').write(response.content)

    # parse out data into string lines from byte block
    cdaw_raw_lines = response.content.decode('utf-8').split('\n')
    
    # download LASCO catalog now
    
    #lasco_hdr_url = 'https://lasco-www.nrl.navy.mil/lz/img_hdr.txt'   # level 0.5 data
    #lasco_hdr_url = 'https://lasco-www.nrl.navy.mil/lz/level_1/img_hdr.txt'
    #response = requests.get(lasco_hdr_url, allow_redirects=True)
    
    # cache locally
    # LASCO_Catalog_Filename = '../lasco_img_catalog_lvl_1.txt'
    # open(LASCO_Catalog_Filename, 'wb').write(response.content)

else:
    
    with open(CDAW_Catalog_Filename, 'r') as f:
        cdaw_raw_lines = f.readlines()
        
    print(f"Read %s CDAW lines" % len(cdaw_raw_lines))
        

Read 31632 CDAW lines


In [3]:
# parse remainder of lines into dataframe using regex pattern
# only date and time fields appear to be consistently same format,
# we have to guard against non-numbers in all of the other columns
def parse_data(cdaw_raw_lines:list)->pd.DataFrame:
    
    import re

    pattern_str = r"^(\d{4}/\d{2}/\d{2})\s+(\d{2}:\d{2}:\d{2})\s+(\S+)\s+(\S+)\s+(\S+)\s+" +\
                  r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+?)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S[\S|\s]+)?$"
    pattern = re.compile(pattern_str)

    #snag field names from top of file. Its not entirely correct, 
    # but 'good enough' for now
    fields = cdaw_raw_lines[1].split()

    data_arr = { fld:[] for fld in fields }
    # parse remainder into columns
    for line in cdaw_raw_lines[4:]:
        m = pattern.match(line)
        if m:
            for grp in range(1,len(m.groups())+1): 
                #print(f"{grp}: %s" % m.group(grp))
                data_arr[fields[grp-1]].append(m.group(grp))
        else:
            print (f"Warning: cannot parse line:\n  {line}")

    raw_data = pd.DataFrame(data_arr)
    
    return raw_data

In [4]:
# Data Cleaning: filter out Bad/Poor data: Missing Linear speed ('------') and
# 'Very Poor Event' and 'None' in Remarks
#
def clean_data(data:pd.DataFrame)->pd.DataFrame:
    data = raw_data[raw_data['Linear'] != '----']
    #data = data[data.Remarks.str.contains('Very Poor Event') == False]
    data = data[data.Remarks.str.contains('None') == False]
    return data

In [5]:
# fix up data by getting rid of non-numeric chars in some columns, also
# explicitly cast as numbers
def fix_data_format (data:pd.DataFrame)->pd.DataFrame:
    data['width'] = pd.to_numeric(data['Width'])
    #data['central'] = pd.to_numeric(data['Central'])
    
    lspeed = []
    for val in data['Linear']:
        if val == '----': 
            val = 1.e-20

        lspeed.append(val)
    
    data['linear'] = pd.to_numeric(lspeed) # data['Linear'])
    data['mpa'] = pd.to_numeric(data['MPA'])
    #data['cpa_mpa'] = data['central'] - data['mpa']
    data['mass'] = pd.to_numeric([i.replace('*', '').replace('-','0') for i in data['Mass']])
    data['kinetic'] = pd.to_numeric([i.replace('*', '').replace('-','0') for i in data['Kinetic']])
    data['accel'] = pd.to_numeric([i.replace('*', '').replace('--','0') for i in data['Accel']])
    #data[['accel', 'Accel', 'mass', 'Mass','kinetic']]
    return data

### calculate durations for each detector (C2 1.5 - 6 Rsol and C3 is 3.7 - 30 Rsol)

#### T_c2_start == catalog time_start

Our formula for duration:
  700,000 * Det_max_Rsol /linear speed = T_duration_det

#### C2 Duration : 4200000 / speed = T_c2
 
C3 Start time delta (from C2) is C2_Start + (3.7 - 1.5) * R_sol
or
2.2 * R_sol  = T_c3_start_delta

#### C3 Delta Start is 1540000 + T_c2_start

C3 duration is (30 - 3.7) * R_sol / linear speed

#### C3 Duration: 18410000 / speed = T_c3

In [6]:
def add_durations(data:pd.DataFrame)->pd.DataFrame:
    c2_durations = [ 4200000./speed for speed in data['linear']]
    c3_durations = [ 18410000./speed for speed in data['linear']]
    # delta is C3 start time (est) after event start (at Sun).
    c3_start_delta = [1540000./speed for speed in data['linear']]
    
    # add durations to the catalog
    data['Duration_C2'] = c2_durations
    data['Duration_C3'] = c3_durations
    data['StartTime_C3_Delta'] = c3_start_delta
    
    return data

def fix_data(data:pd.DataFrame)->pd.DataFrame:
    
    data = fix_data_format(data)
    return add_durations(data)


## Run Pipeline

In [7]:
# run the pipeline
raw_data = parse_data(cdaw_raw_lines)
cleaned_data = clean_data(raw_data)
all_data = fix_data(cleaned_data)
all_uncleaned_data = fix_data(raw_data)
all_data[:1]

Unnamed: 0,Date,Time,Central,Width,Linear,2nd,order,speed,Accel,Mass,...,Remarks,width,linear,mpa,mass,kinetic,accel,Duration_C2,Duration_C3,StartTime_C3_Delta
0,1996/01/11,00:14:36,267,18,499,571,426,0,-64.3*,-------,...,Only C3\n,18,499,272,0.0,0.0,-64.3,8416.833667,36893.787575,3086.172345


In [8]:
len(all_data)

22051

In [10]:
# look at July 12th, 2000 -- should have CME at 02:49 + 
foo = all_uncleaned_data[all_uncleaned_data['Date']=='2000/07/10']
foo

Unnamed: 0,Date,Time,Central,Width,Linear,2nd,order,speed,Accel,Mass,...,Remarks,width,linear,mpa,mass,kinetic,accel,Duration_C2,Duration_C3,StartTime_C3_Delta
3230,2000/07/10,02:26:05,285,12,610,597,624,703,5.9*,-------,...,,12,610.0,283,0.0,0.0,5.9,6885.245902,30180.327869,2524.590164
3231,2000/07/10,04:50:05,99,59,623,496,756,923,25.5,1.0e+15,...,Difficult to measure the width\n,59,623.0,99,1000000000000000.0,1.9e+30,25.5,6741.573034,29550.561798,2471.910112
3232,2000/07/10,13:27:29,314,43,327,228,425,718,19.6*,5.5e+14,...,,43,327.0,319,550000000000000.0,2.8999999999999996e+29,19.6,12844.036697,56299.69419,4709.480122
3233,2000/07/10,17:50:05,16,31,616,798,422,0,-30.7,4.3e+14,...,,31,616.0,14,430000000000000.0,8.099999999999999e+29,-30.7,6818.181818,29886.363636,2500.0
3234,2000/07/10,20:26:05,24,72,354,211,492,996,39.4,4.3e+15,...,,72,354.0,24,4300000000000000.0,2.7e+30,39.4,11864.40678,52005.649718,4350.282486
3235,2000/07/10,20:50:05,264,49,426,379,472,472,3.3,1.8e+15,...,,49,426.0,262,1800000000000000.0,1.6e+30,3.3,9859.15493,43215.962441,3615.023474
3236,2000/07/10,21:50:06,67,289,1352,1168,1538,1456,35.0,1.1e+16*,...,Uncertain Width; Partial Halo\n,289,1352.0,94,1.1e+16,1e+32,35.0,3106.508876,13616.863905,1139.053254
3237,2000/07/10,23:50:05,236,18,390,371,409,434,2.1,-------,...,,18,390.0,237,0.0,0.0,2.1,10769.230769,47205.128205,3948.717949


In [10]:
# check for July 12 event in cleaned
foo = all_data[all_data['Date'] == '2000/07/12']
foo

Unnamed: 0,Date,Time,Central,Width,Linear,2nd,order,speed,Accel,Mass,...,Remarks,width,linear,mpa,mass,kinetic,accel,Duration_C2,Duration_C3,StartTime_C3_Delta
3244,2000/07/12,11:06:05,20,144,1124,1285,958,0,-151.1*,-------,...,Very Poor Event; Only 3 points; Partial Halo\n,144,1124,49,0.0,0.0,-151.1,3736.654804,16379.003559,1370.106762


## Write Data To disk

In [20]:
# This is both C2 and C3 detectors together, although they may not both observe the same CME
all_data.to_csv(All_CME_Catalog_Filename, index=True, index_label="CDAW_Id")
len(all_data)

22051

In [19]:
# This is both C2 and C3 detectors together, w/o filtering poor CME events out (needed
# for accurate calculation of CME non-times)
all_uncleaned_data.to_csv(Raw_CME_Catalog_Filename, index=True, index_label="CDAW_Id")
len(all_uncleaned_data)

31628

In [13]:
# Split into detector specific datasets
data = all_data
c2_data = data[data.Remarks.str.contains('Only C2')]
c3_data = data[data.Remarks.str.contains('Only C3')]
c2_c3_data = data[data.Remarks.str.contains('Only C2') == False]
c2_c3_data = c2_c3_data[c2_c3_data.Remarks.str.contains('Only C3') == False]
len(c2_data), len(c3_data), len(c2_c3_data)

(10591, 362, 11098)

In [14]:
c2_c3_data[:3]

Unnamed: 0,Date,Time,Central,Width,Linear,2nd,order,speed,Accel,Mass,...,Remarks,width,linear,mpa,mass,kinetic,accel,Duration_C2,Duration_C3,StartTime_C3_Delta
5,1996/01/31,06:52:13,274,47,158,219,99,0,-12.3*,2.5e+14*,...,Poor Event; Only 3 points\n,47,158,272,250000000000000.0,3.2e+28,-12.3,26582.278481,116518.987342,9746.835443
8,1996/02/12,05:47:26,91,53,160,100,211,236,2.0*,1.3e+15,...,Poor Event\n,53,160,92,1300000000000000.0,1.6e+29,2.0,26250.0,115062.5,9625.0
9,1996/02/17,02:06:31,279,73,317,257,378,532,9.9*,3.3e+14*,...,Very Poor Event\n,73,317,274,330000000000000.0,1.6e+29,9.9,13249.211356,58075.709779,4858.044164


In [15]:
# persist dataset for CME where seen in both detectors
c2_c3_data.to_csv(CME_Catalog_Filename, index=True, index_label="CDAW_Id")

In [16]:
len(c2_c3_data)

11098

In [17]:
#jul_cme = c2_c3_data[c2_c3_data["Date"] > '2000/07/01']
#jul_cme[:10]

In [18]:
#mar_cme = c2_c3_data[c2_c3_data["Date"] > '2000/03/20']
#mar_cme[:3]