In [1]:
import pandas
import datetime
import numpy
import scipy.optimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
import bs4
import urllib
from matplotlib.colors import ListedColormap
from matplotlib.colors import Normalize
from matplotlib import ticker
%matplotlib inline
from matplotlib import rcParams

In [2]:
# setup some cutoff values we'll use in the analysis
velCutoffUpper = 2000.
velCutoffLower = 0.
numPointsCutoffMLTMLAT = 250

In [3]:
velGmagDF = pandas.read_csv("../data/processed-vels-geomag.txt", sep=' ',\
                            parse_dates=["date", "dst_date"], infer_datetime_format=True)
velGmagDF = velGmagDF.drop('Unnamed: 0', axis=1)
# Filter out some values where number of datapoints are pretty low.
countDF = velGmagDF.groupby([ "normMLT", "MLAT" ]).size().reset_index()
countDF.columns = [ "normMLT", "MLAT", "count" ]
# Choose only columns which have atleast 100 points
countDF = countDF[ countDF["count"] >= numPointsCutoffMLTMLAT ].reset_index(drop=True)
# Merge with velsDataDF to filter out unwanted values
velGmagDF = pandas.merge( velGmagDF, countDF,\
                          on=["normMLT", "MLAT"], how='inner' )
velGmagDF.head()

Unnamed: 0,normMLT,MLAT,vSaps,azim,vMagnErr,azimErr,dtStr,tmStr,date,hour,dst_date,dst_index,dst_bin,AE,AL,AO,AU,minute,count
0,-4,60.5,634.32,-10.1,7.19,21.54,20130316,500,2013-03-16 05:00:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",731,-575,-210,156,0,2677
1,-4,60.5,609.64,-10.92,3.58,9.73,20130316,502,2013-03-16 05:02:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",651,-524,-199,127,2,2677
2,-4,60.5,613.71,-12.2,4.15,8.27,20130316,504,2013-03-16 05:04:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",586,-470,-177,116,4,2677
3,-4,60.5,778.5,-16.19,3.8,4.9,20130316,506,2013-03-16 05:06:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",562,-418,-137,144,6,2677
4,-4,60.5,578.11,-15.4,7.43,12.92,20130316,508,2013-03-16 05:08:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",569,-450,-166,119,8,2677


In [4]:
# get dst index vals from wdc kyoto website
# create a list of dates with monthly freq
date_dst_arr = []
dst_val = []
dst_time_del = datetime.timedelta(hours = 1)
start_date = datetime.datetime(2011,1,1)
end_date = datetime.datetime(2014,12,31)
daterange = pandas.date_range(start_date, end_date, freq="M")
for dt in daterange:
#     print "dt--->", dt
    if dt.month <= 9:
            monthStr = "0" + str(dt.month)
    else:
        monthStr = str(dt.month)
    if dt.year >= 2015:
        # create the url
        currUrl = "http://wdc.kugi.kyoto-u.ac.jp/" + "dst_realtime" + \
            "/" + str(dt.year) + monthStr + "/index.html"
    elif ( (dt.year > 2011) and (dt.year < 2015) ):
        # create the url
        currUrl = "http://wdc.kugi.kyoto-u.ac.jp/" + "dst_provisional" + \
            "/" + str(dt.year) + monthStr + "/index.html"
    else:
        # create the url
        currUrl = "http://wdc.kugi.kyoto-u.ac.jp/" + "dst_final" + \
            "/" + str(dt.year) + monthStr + "/index.html"
    conn = urllib.urlopen(currUrl)
    htmlSource = conn.read()
    soup = bs4.BeautifulSoup(htmlSource, 'html.parser')
    dataResObj = soup.find("pre", { "class" : "data" })
    # get the data as a list of strings after removing white space
    lines = dataResObj.text.strip().splitlines()
    for line in lines[6:]:
        columns = line.split()
        if len( columns ) > 0. :
            date_dst_arr.append( datetime.datetime( \
                dt.year, dt.month, int(columns[0]), 1 ) )
            for cols in range( len( columns[1:] ) ) :
                try:
                    inNumberFloatTest = float(columns[cols + 1])
                except:
                    # split these cols as well and work on them!
                    try:
                        missedCols = columns[cols + 1].split("-")[1:]
                        if len(missedCols) >= 1:
                            for mcols in missedCols:
                                dst_val.append( -1*float( mcols ) )
                                # now since we added the date earlier we need to be
                                # careful about appending date values
                                if ( len(date_dst_arr) != len(dst_val) ):
                                    date_dst_arr.append ( date_dst_arr[-1] + dst_time_del )
                    except:
                        print "something wrong with messed up vals!-->", columns[cols + 1]
                        continue
                    continue
                # I have to do this because of the messed up way Kyoto puts up the latest dst value..
                # mixed with 9999 (fillers) like if latest dst is 1 then Kyoto puts it as 199999.....
                if len( columns[ cols + 1 ] ) < 5 :
                    dst_val.append( float( columns[ cols + 1 ] ) )
                elif ( len( columns[ cols + 1 ] ) > 5 and columns[ cols + 1 ][0:3] != '999' ) :
                    mixed_messed_dst = ''
                    for jj in range(5) :
                        if columns[ cols + 1 ][jj] != '9' :
                            mixed_messed_dst = mixed_messed_dst + columns[ cols + 1 ][jj]

                    if mixed_messed_dst != '-' :
                        dst_val.append( float( mixed_messed_dst ) )
                    else :
                        dst_val.append( float( 'nan' ) )
                else :
                    dst_val.append( float( 'nan' ) )
                if cols > 0 :
                    date_dst_arr.append ( date_dst_arr[-1] + dst_time_del )
# convert dst data to a dataframe
dstDF = pandas.DataFrame(
    {'dst_date': date_dst_arr,
     'dst_index': dst_val
    })
# Remove dst values that are greater than 10,
# They pull in data which is not looking good 
# when we check the plots
dstDF = dstDF[ dstDF["dst_index"] <= 10. ].reset_index(drop=True)
dstDF["dtStr"] = dstDF["dst_date"].apply(lambda x: x.strftime('%Y%m%d'))
dstDF["hour"] = dstDF["dst_date"].apply(lambda x: x.strftime('%H'))
dstDF.head()

Unnamed: 0,dst_date,dst_index,dtStr,hour
0,2011-01-01 01:00:00,-11,20110101,1
1,2011-01-01 02:00:00,-11,20110101,2
2,2011-01-01 03:00:00,-9,20110101,3
3,2011-01-01 04:00:00,-5,20110101,4
4,2011-01-01 05:00:00,-3,20110101,5


In [11]:
# Get season data
def get_season(row):
        currMonth = row["dst_date"].month
        if ( (currMonth >= 11) or (currMonth <= 2) ):
            return -1#"winter"
        elif ( (currMonth >= 5) and (currMonth <= 8) ):
            return 1#"summer"
        else:
            return 0#"equinox"
#         if row[timeColName] < 10:
#             currTimeStr = "000" + str( int( row[timeColName] ) )
#         elif row[timeColName] < 100:
#             currTimeStr = "00" + str( int( row[timeColName] ) )
#         elif row[timeColName] < 1000:
#             currTimeStr = "0" + str( int( row[timeColName] ) )
#         else:
#             currTimeStr = str( int( row[timeColName] ) )
#         return datetime.datetime.strptime( currDateStr\
#                         + ":" + currTimeStr, "%Y%m%d:%H%M" )
dstDF["season"] = dstDF.apply( get_season, axis=1 )
print dstDF.head()

             dst_date dst_index     dtStr hour  season   dst_index
0 2011-01-01 01:00:00       -11  20110101   01      -1  (-25, -10]
1 2011-01-01 02:00:00       -11  20110101   02      -1  (-25, -10]
2 2011-01-01 03:00:00        -9  20110101   03      -1   (-10, 10]
3 2011-01-01 04:00:00        -5  20110101   04      -1   (-10, 10]
4 2011-01-01 05:00:00        -3  20110101   05      -1   (-10, 10]


In [10]:
dstBins = [ -150, -75, -50, -25, -10, 10 ]
dstDF = pandas.concat( [ dstDF, \
                    pandas.cut( dstDF["dst_index"], \
                               bins=dstBins ) ], axis=1 )
dstDF.columns = [ "dst_date", "dst_index", "dtStr", "hour", "season", "dst_bin" ]
dstDF.head()

TypeError: cannot concatenate a non-NDFrame object