In [1]:
import pandas
import datetime
import numpy
import scipy.optimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
import bs4
import urllib
from matplotlib.colors import ListedColormap
from matplotlib.colors import Normalize
from matplotlib import ticker
%matplotlib inline

In [2]:
# Cutoff values shared by the filtering steps of the analysis.
# Velocity window (m/s): the velocity filter keeps vSaps strictly inside it.
velCutoffLower, velCutoffUpper = 0., 2500.
# MLAT window: the latitude filter keeps MLAT strictly inside it.
mlatCutOffLower, mlatCutOffUpper = 53., 65.
# Smallest count/maxCount ratio a (dst_bin, normMLT, MLAT) cell may have
# and still be kept by the occupancy filter.
perCutoffMLTMLAT = 0.15
# Absolute point-count threshold from the earlier count-based filter;
# the active filtering uses perCutoffMLTMLAT instead.
numPointsCutoffMLTMLAT = 25

In [3]:
# READ Dst and AE data
# The Dst file is space-separated; its "dst_date" column is parsed to datetimes.
inpDstFile = "../data/dst_out_file.csv"
dstDF = pandas.read_csv(inpDstFile, sep=' ',
                        infer_datetime_format=True,
                        parse_dates=["dst_date"])
# Keep timestamps strictly between 2010-12-31 00:00 and 2015-01-01 00:00,
# and only rows with dst_index <= 10 (the top edge of the Dst bins used
# when the velocity data are binned).
dstDF = dstDF[ (dstDF["dst_date"] > datetime.datetime(2010,12,31)) &
               (dstDF["dst_date"] < datetime.datetime(2015,1,1)) &
               (dstDF["dst_index"] <= 10.) ].reset_index(drop=True)
# String keys used to join with the velocity data. The vectorized .dt
# accessor replaces a per-row Python lambda and yields the same strings.
dstDF["dtStr"] = dstDF["dst_date"].dt.strftime('%Y%m%d')
dstDF["hour"] = dstDF["dst_date"].dt.strftime('%H')
# Aur Inds
# "datetimeStr" is formatted as YYYYMMDD-HH-MM; parse it once and derive
# the date/hour/minute string keys from the parsed timestamps.
aurDF = pandas.read_csv( "../data/aur_processed.txt", sep=' ' )
aurDF["date"] = pandas.to_datetime(aurDF["datetimeStr"], format='%Y%m%d-%H-%M')
aurDF["hour"] = aurDF["date"].dt.strftime('%H')
aurDF["minute"] = aurDF["date"].dt.strftime('%M')
aurDF["dtStr"] = aurDF["date"].dt.strftime('%Y%m%d')

In [4]:
#### In this block we load Velocity data ####
# Reads the headerless, space-separated fit-results file, applies the
# velocity/MLAT cutoffs, and attaches the Dst and auroral indices by
# matching on date/hour (Dst) and date/hour/minute (auroral indices).
fitVelFile = "../data/fitres-fin.csv"
inpColNames = ["azim", "azimStd", "delMLT", "endPtMLAT",
               "endPtNormMLT","goodFit", "MLAT", "normMLT",
               "vSaps", "velSTD", "date"]
# The "extra"/"fin" result files use a different column order.
if ( ("extra" in fitVelFile) or ("fin" in fitVelFile) ):
    inpColNames = ["azim", "azimStd", "delMLT", "goodFit",
                   "MLAT", "normMLT", "vSaps", "velSTD",
                   "endPtMLAT", "endPtNormMLT", "date"]
velsDataDF = pandas.read_csv(fitVelFile, sep=' ',
                             header=None, names=inpColNames,
                             infer_datetime_format=True,
                             parse_dates=["date"])

# Discard unwanted values: keep only rows whose velocity lies strictly
# between velCutoffLower and velCutoffUpper and whose MLAT lies strictly
# between mlatCutOffLower and mlatCutOffUpper.
velsDataDF = velsDataDF[ (velsDataDF["vSaps"] > velCutoffLower)
                        & (velsDataDF["vSaps"] < velCutoffUpper)
                        & (velsDataDF["MLAT"] > mlatCutOffLower)
                        & (velsDataDF["MLAT"] < mlatCutOffUpper)
                       ].reset_index(drop=True)
# String keys for the merges below (vectorized, no per-row lambda).
velsDataDF["dtStr"] = velsDataDF["date"].dt.strftime('%Y%m%d')
velsDataDF["hour"] = velsDataDF["date"].dt.strftime('%H')
velsDataDF["minute"] = velsDataDF["date"].dt.strftime('%M')

# NOTE(review): the diagnostic prints later in this notebook show identical
# rows repeated three times (on the columns printed). An inner merge repeats
# a left row once per matching right row, so confirm that (dtStr, hour) is
# unique in dstDF and (dtStr, hour, minute) is unique in aurDF, or that the
# fit file itself does not already contain the repeats.

# Now merge the dst and velocity DFs. Only the needed Dst columns are
# passed in, so the result's columns are known by name rather than by position.
velsDataDF = pandas.merge( velsDataDF,
                          dstDF[ ["dtStr", "hour", "dst_date", "dst_index"] ],
                          on=["dtStr", "hour"], how='inner' )
# We generally work with Dst bins, set them up
# (pandas.cut intervals are open on the left and closed on the right).
dstBins = [ -150, -75, -50, -25, -10, 10 ]
velsDataDF["dst_bin"] = pandas.cut( velsDataDF["dst_index"], bins=dstBins )

# Also merge with aurDF. Its own "date" column is left out so that the
# velocity "date" column keeps its name instead of becoming "date_x".
velsDataDF = pandas.merge( velsDataDF,
                          aurDF[ ["dtStr", "hour", "minute", "datetimeStr",
                                  "AE", "AL", "AO", "AU"] ],
                          on=["dtStr", "hour", "minute"], how='inner')
# Final column set and order: the file's own columns followed by the
# merge keys, the Dst columns and the auroral indices.
selColsVels = inpColNames + ['dtStr', 'hour', 'minute', 'dst_date',
                             'dst_index', 'dst_bin', 'datetimeStr',
                             'AE', 'AL', 'AO', 'AU']
velsDataDF = velsDataDF[ selColsVels ]
#### End of the velocity-data loading block ####

In [5]:
# Filter out locations where the relative number of datapoints is low.
# Rather than an absolute count, the data are grouped by Dst bin and each
# (dst_bin, normMLT, MLAT) cell is compared with the most populated cell of
# the same Dst bin; cells whose count/maxCount ratio falls below
# perCutoffMLTMLAT are discarded. This relative filter supersedes the
# earlier absolute-count filter (numPointsCutoffMLTMLAT).

# Drop occupancy columns left over from a previous run of this cell, so
# re-running it does not create suffixed duplicates (count_x, count_y, ...).
occupancyCols = ["count", "maxCount", "probOcc"]
velsDataDF = velsDataDF[ [ col for col in velsDataDF.columns
                          if col not in occupancyCols ] ]

# Number of points in every (dst_bin, normMLT, MLAT) cell.
dstSapsMLTLatCountDF = velsDataDF.groupby(
                        ["dst_bin", "normMLT", "MLAT"]
                        ).size().reset_index(name="count")
# Largest cell count within each Dst bin.
maxCntMLTLatDst = dstSapsMLTLatCountDF.groupby(
                        "dst_bin")["count"].max().reset_index(name="maxCount")
dstSapsMLTLatCountDF = pandas.merge( dstSapsMLTLatCountDF, maxCntMLTLatDst,
                              on=["dst_bin"], how='inner')
dstSapsMLTLatCountDF["probOcc"] = dstSapsMLTLatCountDF["count"]/dstSapsMLTLatCountDF["maxCount"]
# Filter out unwanted values
dstSapsMLTLatCountDF = dstSapsMLTLatCountDF[
                        dstSapsMLTLatCountDF["probOcc"] >= perCutoffMLTMLAT
                        ].reset_index(drop=True)
# The inner merge keeps only velocity rows that fall in a surviving cell and
# attaches that cell's count, maxCount and probOcc to each row.
velsDataDF = pandas.merge( velsDataDF, dstSapsMLTLatCountDF,
                          on=["dst_bin", "normMLT", "MLAT"], how='inner' )
velsDataDF.to_csv("../data/processed-vels-geomag-fin.txt", sep=' ', index=False)
velsDataDF.head()

Unnamed: 0,azim,azimStd,delMLT,goodFit,MLAT,normMLT,vSaps,velSTD,endPtMLAT,endPtNormMLT,...,dst_index,dst_bin,datetimeStr,AE,AL,AO,AU,count,maxCount,probOcc
0,-10.587427,1.794202,0.5,True,62.0,-6.0,306.738497,11.198685,62.06,-6.3,...,-44.0,"(-50, -25]",20121014-01-00,500,-343,-93,157,470,2562,0.18345
1,-13.524308,1.486233,1.0,True,62.0,-6.0,301.958739,8.617099,62.07,-6.29,...,-44.0,"(-50, -25]",20121014-01-02,490,-341,-96,149,470,2562,0.18345
2,-11.704231,1.7711,0.5,True,62.0,-6.0,328.832509,13.365574,62.07,-6.32,...,-44.0,"(-50, -25]",20121014-01-04,527,-371,-108,156,470,2562,0.18345
3,-11.402638,1.525368,1.0,True,62.0,-6.0,357.951508,12.264929,62.07,-6.35,...,-44.0,"(-50, -25]",20121014-01-06,515,-366,-109,149,470,2562,0.18345
4,-11.586116,1.070948,1.0,True,62.0,-6.0,406.549966,10.616775,62.08,-6.4,...,-44.0,"(-50, -25]",20121014-01-08,527,-373,-110,154,470,2562,0.18345


In [25]:
# For 2011-07-23, count the rows with vSaps below 150 at each timestamp
# and list the 20 timestamps that have the most such rows.
velDatesDF = velsDataDF[ (velsDataDF["vSaps"] < 150.) &
                       (velsDataDF["dtStr"] == "20110723")].groupby("datetimeStr").size()
# print(...) with a single argument behaves the same under Python 2 and 3.
print(velDatesDF.sort_values(ascending=False).head(20))

datetimeStr
20110723-07-26    45
20110723-07-24    45
20110723-07-22    36
20110723-07-28    33
20110723-07-00    30
20110723-06-58    27
20110723-09-24    21
20110723-06-54    21
20110723-09-12    18
20110723-09-22    18
20110723-07-18    18
20110723-07-32    18
20110723-07-34    18
20110723-07-36    15
20110723-07-38    15
20110723-07-30    15
20110723-09-26    15
20110723-08-58    15
20110723-06-48    15
20110723-07-04    12
dtype: int64


In [28]:
# Inspect the rows with vSaps below 150 recorded at 2011-07-23 07:26.
# print(...) with a single argument behaves the same under Python 2 and 3.
print(velsDataDF[ (velsDataDF["datetimeStr"] == "20110723-07-26") &
                (velsDataDF["vSaps"] < 150.) ][ ["date", "vSaps", "delMLT", "azim", "MLAT", "normMLT"] ])

                      date       vSaps  delMLT       azim  MLAT  normMLT
97636  2011-07-23 07:26:00  110.717489     1.5  -3.426777  56.0     -2.0
97637  2011-07-23 07:26:00  110.717489     1.5  -3.426777  56.0     -2.0
97638  2011-07-23 07:26:00  110.717489     1.5  -3.426777  56.0     -2.0
99295  2011-07-23 07:26:00  121.606422     1.5  -5.254620  56.5     -2.0
99296  2011-07-23 07:26:00  121.606422     1.5  -5.254620  56.5     -2.0
99297  2011-07-23 07:26:00  121.606422     1.5  -5.254620  56.5     -2.0
99720  2011-07-23 07:26:00  148.858479     NaN  -5.254620  57.5     -2.0
99721  2011-07-23 07:26:00  148.858479     NaN  -5.254620  57.5     -2.0
99722  2011-07-23 07:26:00  148.858479     NaN  -5.254620  57.5     -2.0
101028 2011-07-23 07:26:00  140.400809     NaN  -5.254620  57.0     -2.0
101029 2011-07-23 07:26:00  140.400809     NaN  -5.254620  57.0     -2.0
101030 2011-07-23 07:26:00  140.400809     NaN  -5.254620  57.0     -2.0
101548 2011-07-23 07:26:00  136.977370     NaN  -5.

In [11]:
# Inspect the rows with vSaps below 100 recorded at 2011-09-20 06:24.
# print(...) with a single argument behaves the same under Python 2 and 3.
print(velsDataDF[ (velsDataDF["datetimeStr"] == "20110920-06-24") &
                (velsDataDF["vSaps"] < 100.) ][ ["date", "vSaps", "delMLT", "azim", "MLAT", "normMLT"] ])

                      date      vSaps  delMLT       azim  MLAT  normMLT
98250  2011-09-20 06:24:00  95.297161     NaN -10.698310  59.5     -3.0
98251  2011-09-20 06:24:00  95.297161     NaN -10.698310  59.5     -3.0
98252  2011-09-20 06:24:00  95.297161     NaN -10.698310  59.5     -3.0
100245 2011-09-20 06:24:00  91.034109     NaN -10.698310  59.0     -3.0
100246 2011-09-20 06:24:00  91.034109     NaN -10.698310  59.0     -3.0
100247 2011-09-20 06:24:00  91.034109     NaN -10.698310  59.0     -3.0
178975 2011-09-20 06:24:00  99.553560     0.5  -6.953721  60.0     -1.0
178976 2011-09-20 06:24:00  99.553560     0.5  -6.953721  60.0     -1.0
178977 2011-09-20 06:24:00  99.553560     0.5  -6.953721  60.0     -1.0
182058 2011-09-20 06:24:00  92.427215     0.5  -9.081323  59.5     -1.0
182059 2011-09-20 06:24:00  92.427215     0.5  -9.081323  59.5     -1.0
182060 2011-09-20 06:24:00  92.427215     0.5  -9.081323  59.5     -1.0
183563 2011-09-20 06:24:00  87.486186     0.5  -7.261328  59.0  