In [1]:
import pandas
import datetime
import numpy
import scipy.optimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
import bs4
import urllib
from matplotlib.colors import ListedColormap
from matplotlib.colors import Normalize
from matplotlib import ticker
%matplotlib inline

In [2]:
# Cutoff values shared by the filtering steps of this analysis.

# Bounds applied to the "vSaps" column; a row is kept only when its value
# lies strictly between the two.
velCutoffLower, velCutoffUpper = 0., 2000.

# Bounds applied to the "MLAT" column; a row is kept only when its value
# lies strictly between the two.
mlatCutOffLower, mlatCutOffUpper = 53., 65.

# Smallest count/maxCount ratio a (dst_bin, normMLT, MLAT) cell may have
# and still be retained by the relative-count filter.
perCutoffMLTMLAT = 0.15

# Minimum number of points per (normMLT, MLAT) cell. Within the cells shown
# here it is referenced only by the commented-out absolute-count filter.
numPointsCutoffMLTMLAT = 25

In [3]:
# READ Dst and AE data
# Both frames get string key columns ("dtStr" = YYYYMMDD, "hour" = HH and,
# for the auroral indices, "minute" = MM) that are later used as merge keys.
inpDstFile = "../data/dst_out_file.csv"
# NOTE(review): infer_datetime_format is deprecated since pandas 2.0 (where
# passing it has no effect); it is kept so older pandas parses as before.
dstDF = pandas.read_csv(inpDstFile, sep=' ',
                        infer_datetime_format=True,
                        parse_dates=["dst_date"])
# Keep rows dated strictly after 2010-12-31 00:00 and strictly before
# 2015-01-01 00:00 whose Dst index does not exceed 10.
dstKeepRows = (
    (dstDF["dst_date"] > datetime.datetime(2010, 12, 31))
    & (dstDF["dst_date"] < datetime.datetime(2015, 1, 1))
    & (dstDF["dst_index"] <= 10.)
)
dstDF = dstDF[dstKeepRows].reset_index(drop=True)
# Vectorized formatting through the .dt accessor instead of a row-wise apply.
dstDF["dtStr"] = dstDF["dst_date"].dt.strftime('%Y%m%d')
dstDF["hour"] = dstDF["dst_date"].dt.strftime('%H')
# Aur Inds
aurDF = pandas.read_csv("../data/aur_processed.txt", sep=' ')
aurDF["date"] = pandas.to_datetime(aurDF["datetimeStr"], format='%Y%m%d-%H-%M')
aurDF["hour"] = aurDF["date"].dt.strftime('%H')
aurDF["minute"] = aurDF["date"].dt.strftime('%M')
aurDF["dtStr"] = aurDF["date"].dt.strftime('%Y%m%d')

In [4]:
#### In this block we load Velocity data ####
# Read the fitted velocities, apply the velocity and MLAT cutoffs, then attach
# the Dst index (matched on date + hour) and the auroral indices (matched on
# date + hour + minute). The result is stored in velsDataDF.
fitVelFile = "../data/fitres-extra.csv"
# The column order expected in the file depends on its name: for "extra"
# files the end-point columns are read after velSTD instead of after delMLT.
if "extra" in fitVelFile:
    inpColNames = ["azim", "azimStd", "delMLT", "goodFit",
                   "MLAT", "normMLT", "vSaps", "velSTD",
                   "endPtMLAT", "endPtNormMLT", "date"]
else:
    inpColNames = ["azim", "azimStd", "delMLT", "endPtMLAT",
                   "endPtNormMLT", "goodFit", "MLAT", "normMLT",
                   "vSaps", "velSTD", "date"]
# NOTE(review): infer_datetime_format is deprecated since pandas 2.0 (where
# passing it has no effect); it is kept so older pandas parses as before.
velsDataDF = pandas.read_csv(fitVelFile, sep=' ',
                             header=None, names=inpColNames,
                             infer_datetime_format=True,
                             parse_dates=["date"])

# Discard unwanted values: keep only velocities strictly between
# velCutoffLower and velCutoffUpper, located strictly between
# mlatCutOffLower and mlatCutOffUpper.
velKeepRows = (
    (velsDataDF["vSaps"] > velCutoffLower)
    & (velsDataDF["vSaps"] < velCutoffUpper)
    & (velsDataDF["MLAT"] < mlatCutOffUpper)
    & (velsDataDF["MLAT"] > mlatCutOffLower)
)
velsDataDF = velsDataDF[velKeepRows].reset_index(drop=True)
# String merge keys, built the same way as for the Dst / auroral frames.
velsDataDF["dtStr"] = velsDataDF["date"].dt.strftime('%Y%m%d')
velsDataDF["hour"] = velsDataDF["date"].dt.strftime('%H')
velsDataDF["minute"] = velsDataDF["date"].dt.strftime('%M')

# Now merge the dst and velocity DFs
# NOTE(review): the head() output saved with this notebook shows the same
# velocity record paired with two different dst_index values (e.g. -67 and
# -72 for 20131002-08-26), which suggests (dtStr, hour) is not unique in
# dstDF and this merge duplicates velocity rows. Confirm, and de-duplicate
# the Dst data if that is unintended.
velsDataDF = pandas.merge(velsDataDF, dstDF,
                          on=["dtStr", "hour"], how='inner')
# We generally work with Dst bins, set them up
dstBins = [ -150, -75, -50, -25, -10, 10 ]
# Add the bin as a named column rather than concatenating a second
# "dst_index" column and relabelling every column by position.
velsDataDF["dst_bin"] = pandas.cut(velsDataDF["dst_index"], bins=dstBins)
velCols = list(inpColNames) + ['dtStr', 'hour', 'minute',
                               'dst_date', 'dst_index', 'dst_bin']
velsDataDF = velsDataDF[velCols]

# Also merge with aurDF
velsDataDF = pandas.merge(velsDataDF, aurDF,
                          on=["dtStr", "hour", "minute"], how='inner')
# Discard some unwanted cols. Both frames carry a "date" column, so after the
# merge the velocity timestamp is named "date_x"; keep it (dropping aurDF's
# copy) and restore its original name.
selColsVels = ["date_x" if col == "date" else col for col in velCols] \
              + ['datetimeStr', 'AE', 'AL', 'AO', 'AU']
velsDataDF = velsDataDF[selColsVels].rename(columns={"date_x": "date"})

#### End of the Velocity data block ####

In [5]:
# Filter out locations where the relative number of datapoints is low.
# Rather than an absolute threshold, the data are grouped by Dst bin and each
# (dst_bin, normMLT, MLAT) cell's count is divided by the largest cell count
# found in the same Dst bin ("probOcc"). Cells with probOcc below
# perCutoffMLTMLAT are discarded.
################## NEW METHOD ##################
# Re-running this cell would otherwise merge the count columns a second time
# (producing count_x/count_y), so strip any left over from a previous run.
velsDataDF = velsDataDF[[col for col in velsDataDF.columns
                         if col not in ("count", "maxCount", "probOcc")]]
# Number of points at a given DstBin, MLT, Lat
dstSapsMLTLatCountDF = velsDataDF.groupby(
    ["dst_bin", "normMLT", "MLAT"]).size().reset_index(name="count")
# Max points in any (MLT, Lat) cell of each Dst bin. Selecting the count
# column before .max() avoids the positional-axis drop(..., 1) call, which
# is no longer accepted by pandas >= 2.0.
maxCntMLTLatDst = dstSapsMLTLatCountDF.groupby(
    ["dst_bin"])["count"].max().reset_index()
maxCntMLTLatDst.columns = ["dst_bin", "maxCount"]
dstSapsMLTLatCountDF = pandas.merge(dstSapsMLTLatCountDF, maxCntMLTLatDst,
                                    on=["dst_bin"], how='inner')
dstSapsMLTLatCountDF["probOcc"] = \
    dstSapsMLTLatCountDF["count"] / dstSapsMLTLatCountDF["maxCount"]
# Filter out unwanted values
dstSapsMLTLatCountDF = dstSapsMLTLatCountDF[
    dstSapsMLTLatCountDF["probOcc"] >= perCutoffMLTMLAT
].reset_index(drop=True)
# The inner merge keeps only velocity rows that fall in a retained cell and
# appends the count, maxCount and probOcc columns to them.
velsDataDF = pandas.merge(velsDataDF, dstSapsMLTLatCountDF,
                          on=["dst_bin", "normMLT", "MLAT"], how='inner')
velsDataDF.to_csv("../data/processed-vels-geomag-extra.txt", sep=' ', index=False)
################## NEW METHOD ##################

################## OLD METHOD ##################
# countDF = velsDataDF.groupby([ "normMLT", "MLAT" ]).size().reset_index()
# countDF.columns = [ "normMLT", "MLAT", "count" ]
# # Choose only cells which have at least numPointsCutoffMLTMLAT points
# countDF = countDF[ countDF["count"] >= numPointsCutoffMLTMLAT ].reset_index(drop=True)
# # Merge with velsDataDF to filter out unwanted values
# velsDataDF = pandas.merge( velsDataDF, countDF,\
#                           on=["normMLT", "MLAT"], how='inner' )
# velsDataDF.to_csv("../data/processed-vels-geomag-extra.txt", sep=' ', index=False)
# print velsDataDF.columns.tolist()
################## OLD METHOD ##################
velsDataDF.head()

Unnamed: 0,azim,azimStd,delMLT,goodFit,MLAT,normMLT,vSaps,velSTD,endPtMLAT,endPtNormMLT,...,dst_index,dst_bin,datetimeStr,AE,AL,AO,AU,count,maxCount,probOcc
0,-1.278027,3.056854,1.0,True,56.5,0.0,439.581783,42.43072,56.51,-0.44,...,-67.0,"(-75, -50]",20131002-08-26,552,-285,-9,267,20,133,0.150376
1,-1.278027,3.056854,1.0,True,56.5,0.0,439.581783,42.43072,56.51,-0.44,...,-72.0,"(-75, -50]",20131002-08-26,552,-285,-9,267,20,133,0.150376
2,-13.713678,,,False,56.5,0.0,782.8379,,56.69,-0.76,...,-53.0,"(-75, -50]",20130525-06-54,401,-155,46,246,20,133,0.150376
3,-13.713678,,,False,56.5,0.0,782.8379,,56.69,-0.76,...,-58.0,"(-75, -50]",20130525-06-54,401,-155,46,246,20,133,0.150376
4,11.651055,4.944216,1.0,True,56.5,0.0,501.724761,24.297278,56.4,-0.49,...,-53.0,"(-75, -50]",20130525-06-56,389,-162,33,227,20,133,0.150376
