In [8]:
#working on parsing the data using pandas.read_csv()

import pandas as pd
import glob


def parseRtergHtml(html):
    """Parse one rterg HTML output file into a single-row DataFrame.

    The file is re-read once per section of interest, each time skipping to a
    fixed row and splitting on the delimiter that suits that row.

    Parameters
    ----------
    html : str or path-like
        Location of the file.  It is opened several times, so it must be a
        path rather than an already-open handle.

    Returns
    -------
    pandas.DataFrame
        One row with the columns Eventname, oTime, Lat., Long., Depth, Me,
        Txo, Ehf, Ebb, Mehf, Ehf/Tr^3, Nstats, SRC, iMagType, iMag, TACER_HF,
        TACER_BB, Comment, mTime and iteration.
    """
    # --- Row 1: whitespace-separated event summary -------------------------
    colNames1 = ["Eventname", "oDate", "oTime", "Lat.", "Long.", "Depth", "Me", "Txo",
                 "Ehf", "Ebb", "Mehf", "Ehf/Tr^3", "Nstats", "colon", "SRC", "iMag"]
    # sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
    df1 = pd.read_csv(html, names=colNames1, skiprows=1, nrows=1, sep=r"\s+")
    del df1["colon"]  # the literal ":" token carries no information

    # Characters 5-6 of the SRC token hold the two-letter source code.
    df1["SRC"] = df1.loc[0, "SRC"][5:7]

    # The iMag token looks like "<type>=<value>" followed by two trailing
    # characters that are dropped.
    imag_type, imag_rest = df1.loc[0, "iMag"].split("=")[:2]
    df1.insert(14, "iMagType", imag_type)
    df1["iMag"] = imag_rest[:-2]

    # Merge the separate date and time tokens into one UTC timestamp.
    origin = df1["oDate"] + " " + df1["oTime"]
    del df1["oDate"]
    df1["oTime"] = pd.to_datetime(origin, utc=True)

    # --- Row 2: optional TACER line ----------------------------------------
    colNames2 = ["TACER_HF", "TACER_BB"]
    try:
        df2 = pd.read_csv(html, names=colNames2, skiprows=2, nrows=1,
                          sep=r"\s+", usecols=[7, 10])
        rows_missing = 0
        comment_rows_missing = 0
    except Exception:
        # Best effort: when row 2 cannot supply fields 7 and 10 the file is
        # treated as the shorter layout (presumably files written without a
        # TACER line -- TODO confirm), so later rows sit higher in the file.
        # Catching Exception instead of using a bare ``except`` keeps
        # KeyboardInterrupt/SystemExit propagating.
        df2 = pd.DataFrame(columns=colNames2)
        rows_missing = 2
        comment_rows_missing = 1

    comment_row = 3 - comment_rows_missing
    iteration_row = 8 - rows_missing
    mtime_row = iteration_row - 1

    # --- Comment row: "<label>:<comment>" ----------------------------------
    df3 = pd.read_csv(html, names=["junk", "Comment"], skiprows=comment_row,
                      nrows=1, delimiter=":")
    del df3["junk"]

    # --- Modification-time row ---------------------------------------------
    # "?" is used as the delimiter so the whole row lands in a single column.
    df4 = pd.read_csv(html, names=["mTime"], skiprows=mtime_row, nrows=1, delimiter="?")
    df4["mTime"] = pd.to_datetime(df4["mTime"])

    # --- Iteration row: "<label>=<iteration><trailing markup>" -------------
    df5 = pd.read_csv(html, names=["junk", "iteration"], skiprows=iteration_row,
                      nrows=1, delimiter="=")
    del df5["junk"]
    # A single .loc assignment replaces the chained ``df5["iteration"][0] = ...``,
    # which is not guaranteed to write through to df5.
    df5.loc[0, "iteration"] = df5.loc[0, "iteration"].split("<")[0]

    return pd.concat([df1, df2, df3, df4, df5], axis=1)

In [9]:
def builddf(htmlfiles):
    """Parse every file in ``htmlfiles`` and stack the rows into one DataFrame.

    Parameters
    ----------
    htmlfiles : iterable of str
        Paths handed one by one to ``parseRtergHtml``.

    Returns
    -------
    pandas.DataFrame
        One row per input file, in input order, with a fresh 0..n-1 index.
        An empty DataFrame when ``htmlfiles`` is empty.
    """
    # Collect first and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and growing a frame inside the loop is quadratic anyway.
    frames = [parseRtergHtml(html) for html in htmlfiles]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Gather every file under rterg_html_outs/ whose name is a digit followed by
# seven more characters plus ".html"; sorting keeps the row order reproducible.
# (A hard-coded four-file list used to be assigned here and was immediately
# overwritten by this glob, so it has been dropped.)
htmlfiles = sorted(glob.glob('rterg_html_outs/[0-9]???????.html'))  # creates a sorted version
print(htmlfiles)

df = builddf(htmlfiles)
df.head()

['rterg_html_outs/16063000.html', 'rterg_html_outs/18101000.html', 'rterg_html_outs/21012303.html', 'rterg_html_outs/21021000.html']


Unnamed: 0,Eventname,oTime,Lat.,Long.,Depth,Me,Txo,Ehf,Ebb,Mehf,Ehf/Tr^3,Nstats,SRC,iMagType,iMag,TACER_HF,TACER_BB,Comment,mTime,iteration
0,16063000,2016-06-30 11:30:34+00:00,-16.09,167.42,39.0,5.9,52,10000000000000.0,16000000000000.0,6.25,75000000.0,81,PT,M12,5.9,,,,2016-10-21 19:54:42+00:00,A
1,18101000,2018-10-10 20:48:20+00:00,-5.9,151.4,33.0,6.82,86,120000000000000.0,380000000000000.0,6.94,180000000.0,76,AT,Mi,7.3,,,,2018-10-10 23:53:30+00:00,A
2,21012303,2021-01-23 23:36:55+00:00,-62.0,-55.3,10.0,7.01,53,97000000000000.0,730000000000000.0,6.89,650000000.0,32,AT,M,7.3,27.0,26.0,this is our comment text,2021-01-28 14:25:57+00:00,A
3,21021000,2021-02-10 08:22:31+00:00,-13.65,-111.63,10.0,5.37,36,54000000000.0,2500000000000.0,4.72,1200000.0,4,US,M,5.6,26.0,103.0,"Pacific-Nazca Plate, many missed stations",2021-02-16 19:02:28+00:00,A


In [11]:
# Show the index of the combined frame to check how the rows are labelled.
print(df.index)

RangeIndex(start=0, stop=4, step=1)


In [2]:
# takes in a dataframe and writes its first row to an XML file

import xml.etree.ElementTree as et

def _xml_tag(label):
    """Turn a column label into a well-formed XML element name."""
    text = str(label)
    # Letters, digits, "_", "-" and "." are kept; anything else (for example
    # the "/" and "^" in "Ehf/Tr^3") would make the document unparseable.
    cleaned = "".join(ch if (ch.isalnum() or ch in "_-.") else "_" for ch in text)
    # An element name may not start with a digit, "-" or ".".
    if not cleaned or not (cleaned[0].isalpha() or cleaned[0] == "_"):
        cleaned = "_" + cleaned
    return cleaned


def toXML(df):
    """Write the first row of ``df`` to ``<first value>.xml``.

    The document has the shape ``<root><event>...</event></root>`` with one
    child element per column of ``df``; each child's text is ``str()`` of that
    column's first value.  The file is created in the current working
    directory and named after the first column's first value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least one row and one column.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``df`` has no rows or no columns.
    """
    if df.empty:
        raise ValueError("toXML needs a DataFrame with at least one row and one column")

    columns = list(df.columns)
    name = str(df[columns[0]].iloc[0]) + ".xml"

    root = et.Element("root")
    event = et.SubElement(root, "event")

    # Walk the real column list instead of a hard-coded range(20), so frames
    # of any width are exported completely and never index out of range.
    for column in columns:
        # Read per column (not per row) so each value keeps its own dtype.
        et.SubElement(event, _xml_tag(column)).text = str(df[column].iloc[0])

    et.ElementTree(root).write(name, xml_declaration=True)
    return

# Export row 0 of the combined frame; the output file is named after the
# value in its first column and is written to the current working directory.
toXML(df)

# don't run below here

In [1]:
#builds an empty dataframe

# old testing, do not run.

import pandas as pd

def buildDF():
    """Return a DataFrame that has the sixteen legacy column labels and no rows."""
    event_fields = ["#Eventname", "Date", "Otime", "Lat.", "Long.", "Depth"]
    energy_fields = ["Me", "Tr", "Ehf", "Ebb", "Mehf", "Ehf/Tr^3"]
    trailing_fields = ["Nstats : Comments", "TACER_HF", "TACER_BB", "COMMENTS"]
    return pd.DataFrame(columns=event_fields + energy_fields + trailing_fields)

#uses a helper function to input data into the dataframe

def parseRtergHtml(df, html):
    """Fill the first sixteen columns of ``df`` in place from one HTML file.

    ``readHtml(html)`` supplies the values; value ``i`` becomes the single
    entry of the column at position ``i``.  Nothing is returned.

    NOTE: this legacy two-argument function has the same name as the
    one-argument parser defined earlier in this notebook, so executing this
    cell rebinds the name.
    """
    values = readHtml(html)
    for position in range(16):
        cell = [values[position]]
        target = df.columns[position]
        df[target] = cell

#creates a list of data from the html file that will be
#inputted into the dataframe

def readHtml(html):
    """Read the first four lines of ``html`` and return sixteen string fields.

    Layout consumed:
      * line 1 is skipped;
      * line 2 is split on whitespace, and tokens 12-15 are glued back
        together with single spaces into one field;
      * line 3 is split on whitespace, and tokens 7 and 10 are appended;
      * line 4 contributes everything after its first 15 characters, stripped.

    Parameters
    ----------
    html : str or path-like
        Path of the file to read.

    Returns
    -------
    list of str
        Thirteen fields from line 2, two from line 3 and one from line 4.

    Raises
    ------
    IndexError
        If line 2 has fewer than 16 tokens or line 3 fewer than 11.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed it.
    with open(html, "r") as handle:
        handle.readline()  # line 1 is not used
        dataList = handle.readline().split()
        tempList = handle.readline().split()
        line4 = handle.readline()

    # Pop the three tokens that follow position 12 and fold them into it.
    trailing = [dataList.pop(13) for _ in range(3)]
    dataList[12] = " ".join([dataList[12]] + trailing)

    dataList.append(tempList[7])
    dataList.append(tempList[10])
    dataList.append(line4[15:].strip())
    return dataList