In [1]:
import pandas
import datetime
import numpy
import scipy.optimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
import bs4
import urllib
from matplotlib.colors import ListedColormap
from matplotlib.colors import Normalize
from matplotlib import ticker
%matplotlib inline
from matplotlib import rcParams

In [7]:
# get dst index vals from wdc kyoto website
# create a list of dates with monthly freq
date_dst_arr = []
dst_val = []
dst_time_del = datetime.timedelta(hours = 1)
start_date = datetime.datetime(2011,1,1)
end_date = datetime.datetime(2014,12,31)
daterange = pandas.date_range(start_date, end_date, freq="M")
for dt in daterange:
#     print "dt--->", dt
    if dt.month <= 9:
            monthStr = "0" + str(dt.month)
    else:
        monthStr = str(dt.month)
    if dt.year >= 2015:
        # create the url
        currUrl = "http://wdc.kugi.kyoto-u.ac.jp/" + "dst_realtime" + \
            "/" + str(dt.year) + monthStr + "/index.html"
    elif ( (dt.year > 2011) and (dt.year < 2015) ):
        # create the url
        currUrl = "http://wdc.kugi.kyoto-u.ac.jp/" + "dst_provisional" + \
            "/" + str(dt.year) + monthStr + "/index.html"
    else:
        # create the url
        currUrl = "http://wdc.kugi.kyoto-u.ac.jp/" + "dst_final" + \
            "/" + str(dt.year) + monthStr + "/index.html"
    conn = urllib.urlopen(currUrl)
    htmlSource = conn.read()
    soup = bs4.BeautifulSoup(htmlSource, 'html.parser')
    dataResObj = soup.find("pre", { "class" : "data" })
    # get the data as a list of strings after removing white space
    lines = dataResObj.text.strip().splitlines()
    for line in lines[6:]:
        columns = line.split()
        if len( columns ) > 0. :
            date_dst_arr.append( datetime.datetime( \
                dt.year, dt.month, int(columns[0]), 1 ) )
            for cols in range( len( columns[1:] ) ) :
                try:
                    inNumberFloatTest = float(columns[cols + 1])
                except:
                    # split these cols as well and work on them!
                    try:
                        missedCols = columns[cols + 1].split("-")[1:]
                        if len(missedCols) >= 1:
                            for mcols in missedCols:
                                dst_val.append( -1*float( mcols ) )
                                # now since we added the date earlier we need to be
                                # careful about appending date values
                                if ( len(date_dst_arr) != len(dst_val) ):
                                    date_dst_arr.append ( date_dst_arr[-1] + dst_time_del )
                    except:
                        print "something wrong with messed up vals!-->", columns[cols + 1]
                        continue
                    continue
                # I have to do this because of the messed up way Kyoto puts up the latest dst value..
                # mixed with 9999 (fillers) like if latest dst is 1 then Kyoto puts it as 199999.....
                if len( columns[ cols + 1 ] ) < 5 :
                    dst_val.append( float( columns[ cols + 1 ] ) )
                elif ( len( columns[ cols + 1 ] ) > 5 and columns[ cols + 1 ][0:3] != '999' ) :
                    mixed_messed_dst = ''
                    for jj in range(5) :
                        if columns[ cols + 1 ][jj] != '9' :
                            mixed_messed_dst = mixed_messed_dst + columns[ cols + 1 ][jj]

                    if mixed_messed_dst != '-' :
                        dst_val.append( float( mixed_messed_dst ) )
                    else :
                        dst_val.append( float( 'nan' ) )
                else :
                    dst_val.append( float( 'nan' ) )
                if cols > 0 :
                    date_dst_arr.append ( date_dst_arr[-1] + dst_time_del )
# convert dst data to a dataframe
dstDF = pandas.DataFrame(
    {'dst_date': date_dst_arr,
     'dst_index': dst_val
    })
# Remove dst values that are greater than 10,
# They pull in data which is not looking good 
# when we check the plots
dstDF = dstDF[ dstDF["dst_index"] <= 10. ].reset_index(drop=True)
dstDF["dtStr"] = dstDF["dst_date"].apply(lambda x: x.strftime('%Y%m%d'))
dstDF["hour"] = dstDF["dst_date"].apply(lambda x: x.strftime('%H'))
dstDF.head()

Unnamed: 0,dst_date,dst_index,dtStr,hour
0,2011-01-01 01:00:00,-11.0,20110101,1
1,2011-01-01 02:00:00,-11.0,20110101,2
2,2011-01-01 03:00:00,-9.0,20110101,3
3,2011-01-01 04:00:00,-5.0,20110101,4
4,2011-01-01 05:00:00,-3.0,20110101,5


In [3]:
def convert_to_datetime(row):
        # Given a datestr and a time string convert to a python datetime obj.
        import datetime
        datecolName="dtStr"
        timeColName="tmStr"
        currDateStr = str( int( row[datecolName] ) )
    #     return currDateStr
        if row[timeColName] < 10:
            currTimeStr = "000" + str( int( row[timeColName] ) )
        elif row[timeColName] < 100:
            currTimeStr = "00" + str( int( row[timeColName] ) )
        elif row[timeColName] < 1000:
            currTimeStr = "0" + str( int( row[timeColName] ) )
        else:
            currTimeStr = str( int( row[timeColName] ) )
        return datetime.datetime.strptime( currDateStr\
                        + ":" + currTimeStr, "%Y%m%d:%H%M" )

fitVelFile = "../data/all_fitted_vels.txt"
inpColNames = ["normMLT", "MLAT", "vSaps", "azim",\
                     "vMagnErr", "azimErr", "dtStr", "tmStr"]
velsDataDF = pandas.read_csv(fitVelFile, sep=' ',\
                             header=None, names=inpColNames)
velsDataDF["date"] = velsDataDF.apply( convert_to_datetime, axis=1 )
velsDataDF["dtStr"] = velsDataDF["dtStr"].astype("str")

In [10]:
velsDataDF["hour"] = velsDataDF["date"].apply(lambda x: x.strftime('%H'))
velsDataDF["dt-time-str"] = velsDataDF["dtStr"].astype(str) +\
                                "-" + velsDataDF["hour"].astype(str)
# Now merge the dst and velocity DFs
velsDataDF = pandas.merge( velsDataDF, dstDF,\
                          on=["dtStr", "hour"], how='inner' )
dstBins = [ -150, -75, -50, -25, -10, 10 ]
velsDataDF = pandas.concat( [ velsDataDF, \
                    pandas.cut( velsDataDF["dst_index"], \
                               bins=dstBins ) ], axis=1 )
velsDataDF.columns = [ "normMLT", "MLAT", "vSaps", "azim",\
                         "vMagnErr", "azimErr", "dtStr", "tmStr",\
                         "date", "hour", "dt-time-str", "dst_date",\
                         "dst_index", "dst_bin"]
velsDataDF.head()

Unnamed: 0,normMLT,MLAT,vSaps,azim,vMagnErr,azimErr,dtStr,tmStr,date,hour,dt-time-str,dst_date_x,dst_index_x,dst_index_x.1,dst_date_y,dst_index_y,dst_date,dst_index,dst_index.1
0,-4.0,60.5,634.32,-10.1,7.19,21.54,20130316,500,2013-03-16 05:00:00,5,20130316-05,2013-03-16 05:00:00,-19.0,"(-25, -10]",2013-03-16 05:00:00,-19.0,2013-03-16 05:00:00,-19.0,"(-25, -10]"
1,-3.0,60.5,659.19,-14.6,7.18,18.52,20130316,500,2013-03-16 05:00:00,5,20130316-05,2013-03-16 05:00:00,-19.0,"(-25, -10]",2013-03-16 05:00:00,-19.0,2013-03-16 05:00:00,-19.0,"(-25, -10]"
2,-2.0,60.5,659.19,-14.6,7.18,18.52,20130316,500,2013-03-16 05:00:00,5,20130316-05,2013-03-16 05:00:00,-19.0,"(-25, -10]",2013-03-16 05:00:00,-19.0,2013-03-16 05:00:00,-19.0,"(-25, -10]"
3,-4.0,61.0,1202.45,-15.47,6.97,12.2,20130316,500,2013-03-16 05:00:00,5,20130316-05,2013-03-16 05:00:00,-19.0,"(-25, -10]",2013-03-16 05:00:00,-19.0,2013-03-16 05:00:00,-19.0,"(-25, -10]"
4,-3.0,61.0,799.26,-13.31,8.43,17.93,20130316,500,2013-03-16 05:00:00,5,20130316-05,2013-03-16 05:00:00,-19.0,"(-25, -10]",2013-03-16 05:00:00,-19.0,2013-03-16 05:00:00,-19.0,"(-25, -10]"


In [None]:
testDF = velsDataDF.groupby( ["dst_bin"] )\
                    ["dtStr-hour"].aggregate(lambda x: len(list(set(x)))).reset_index()
testDF.head()