In [1]:
import pandas
import datetime
import numpy
from scipy.optimize import curve_fit
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.colors import Normalize
from matplotlib import ticker
from matplotlib import rcParams
%matplotlib inline

In [2]:
# setup some cutoff values we'll use in the analysis
velCutoffUpper = 2000.
velCutoffLower = 0.
numPointsCutoffMLTMLAT = 250
mlatCutOffUpper = 70.
probOccCutoff = 0.2

In [3]:
velGmagDF = pandas.read_csv("../data/processed-vels-geomag.txt", sep=' ')
velGmagDF = velGmagDF.drop('Unnamed: 0', axis=1)
# Discard unwanted values
# We'll only consider those velocities 
# which lie between 0 and 2500 m/s
# and located below 70 MLAT
velGmagDF = velGmagDF[ (velGmagDF["vSaps"] > velCutoffLower) \
                        & (velGmagDF["vSaps"] < velCutoffUpper)\
                       ].reset_index(drop=True)
velGmagDF = velGmagDF[ velGmagDF["MLAT"] < mlatCutOffUpper ].reset_index(drop=True)
# Now filter out velocities which have very few rate of occ.
# We calculat the prob and remove every measurement below 0.2 prob of occ.
mlatMLTDstCountDF = velGmagDF.groupby( ["MLAT", "normMLT", "dst_bin"] )["vSaps"].count().reset_index()
mlatMLTDstCountDF.columns = [ "MLAT", "normMLT", "dst_bin", "count" ]
dstMaxCntDF = mlatMLTDstCountDF.groupby( ["dst_bin"] )["count"].max().reset_index()
dstMaxCntDF.columns = [ "dst_bin", "maxCntDst" ]
mlatMLTDstCountDF = pandas.merge( mlatMLTDstCountDF, dstMaxCntDF, on=[ "dst_bin" ] )
mlatMLTDstCountDF["probOcc"] = mlatMLTDstCountDF["count"]/mlatMLTDstCountDF["maxCntDst"]
mlatMLTDstCountDF = mlatMLTDstCountDF[ mlatMLTDstCountDF["probOcc"] > probOccCutoff ].reset_index(drop=True)
# Filter out MLATs and MLTs (at the Dst bins)
# where number of measurements is low. We do
# this by merging the mlatMLTDstCountDF with velDF.
velGmagDF = pandas.merge( velGmagDF,\
                         mlatMLTDstCountDF,\
                         on=[ "MLAT", "normMLT", "dst_bin" ] )
velGmagDF = velGmagDF[ [ "normMLT", "MLAT", "vSaps",\
                        "azim", "dst_bin", "dst_index", "count", "maxCntDst" ] ]
# Divide the velocities into bins
velBins = [ v for v in range(0,int(velCutoffUpper)+100,100) ]
velGmagDF = pandas.concat( [ velGmagDF, \
                    pandas.cut( velGmagDF["vSaps"], \
                               bins=velBins ) ], axis=1 )
velGmagDF.columns = [ "normMLT", "MLAT", "vSaps",\
                        "azim", "dst_bin", "dst_index", "count",\
                         "maxCntDst", "vel_bin" ]
# velGmagDF.head()
# Get a DF with mean Dst in each bin
dstMeanDF = velGmagDF.groupby( ["dst_bin"] ).mean()["dst_index"].astype(int).reset_index()
dstMeanDF.columns = [ "dst_bin", "dst_mean" ]
velGmagDF = pandas.merge( velGmagDF, dstMeanDF, on=["dst_bin"] )
velGmagDF = velGmagDF.sort( ["dst_mean", "vSaps"], ascending=[False,True] ).reset_index(drop=True)
velCatOrderd = ['(0, 100]', '(100, 200]', '(200, 300]', '(300, 400]', '(400, 500]',\
          '(500, 600]', '(600, 700]', '(700, 800]',\
          '(800, 900]', '(900, 1000]', '(1000, 1100]', '(1100, 1200]',\
          '(1200, 1300]', '(1300, 1400]', '(1400, 1500]',\
          '(1500, 1600]', '(1600, 1700]', '(1700, 1800]',\
          '(1800, 1900]', '(1900, 2000]' ]

velGmagDF['vel_bin'] = pandas.Categorical(
    velGmagDF['vel_bin'], 
    categories=velCatOrderd, 
    ordered=True
)
# We need only a few cols
velGmagDF = velGmagDF[ [ "normMLT", "MLAT", "vSaps", "azim", "dst_index",\
                        "vel_bin", "dst_bin", "dst_mean" ] ]
velGmagDF.head()



Unnamed: 0,normMLT,MLAT,vSaps,azim,dst_index,vel_bin,dst_bin,dst_mean
0,1.0,58.5,164.32,-19.01,-7.0,"(100, 200]","(-10, 10]",-3
1,1.0,57.0,171.45,-16.11,-1.0,"(100, 200]","(-10, 10]",-3
2,1.0,59.0,174.58,-14.8,-7.0,"(100, 200]","(-10, 10]",-3
3,1.0,59.0,176.98,-6.09,-7.0,"(100, 200]","(-10, 10]",-3
4,2.0,59.5,178.11,-6.2,3.0,"(100, 200]","(-10, 10]",-3


In [4]:
# # Loop through each dst_bin, mlat and mlt....calculate the KDEs
# allDstBins = velGmagDF["dst_bin"].unique()
# allNormMLts = velGmagDF["normMLT"].unique()
# allMlats = velGmagDF["MLAT"].unique()
# # We'll loop through and calculate KDEs at each MLAT, MLT and Dst_bin
# # Store them in a DF
# velsKdeDF = pandas.DataFrame(columns=["dst_bin", "vel_arr", "pdf", "normMLT", "MLAT", "dst_mean"])
# dstBinArr = []
# dstMeanArr = []
# normMLTArr = []
# mlatArr = []
# velArr = []
# pdfArr = []
# # Define a inpVel arr to calculate kde
# inpVelsArr = numpy.arange(0.,2100.,100.)
# for cDst in allDstBins:
#     for cNmlt in allNormMLts:
#         for cMlat in allMlats:
#             currVelDF = velGmagDF[ (velGmagDF["dst_bin"] == cDst) &\
#                    (velGmagDF["MLAT"] == cMlat) &\
#                    (velGmagDF["normMLT"] == cNmlt)].reset_index(drop=True)
#             if currVelDF.shape[0] == 0:
#                 continue
#             velKernel = stats.gaussian_kde( currVelDF["vSaps"] )
#             pdfKernArr = velKernel.pdf( inpVelsArr )
#             dstBinArr.append( cDst )
#             normMLTArr.append( cNmlt )
#             mlatArr.append( cMlat )
#             velArr.append( inpVelsArr )
#             pdfArr.append( pdfKernArr )
#             dstMeanArr.append( currVelDF["dst_mean"].unique()[0] )
            
# velsKdeDF["dst_bin"] = dstBinArr
# velsKdeDF["vel_arr"] = velArr
# velsKdeDF["pdf"] = pdfArr
# velsKdeDF["normMLT"] = normMLTArr
# velsKdeDF["MLAT"] = mlatArr
# velsKdeDF["dst_mean"] = dstMeanArr
# velsKdeDF.head()

Unnamed: 0,dst_bin,vel_arr,pdf,normMLT,MLAT,dst_mean
0,"(-10, 10]","[0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0...","[6.5283793074e-06, 0.000156475538372, 0.000980...",1.0,58.5,-3
1,"(-10, 10]","[0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0...","[1.47743217211e-05, 0.000374718445351, 0.00185...",1.0,57.0,-3
2,"(-10, 10]","[0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0...","[4.33853910029e-06, 0.00011818432084, 0.000853...",1.0,59.0,-3
3,"(-10, 10]","[0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0...","[4.37155639646e-07, 4.4464326186e-05, 0.000616...",1.0,59.5,-3
4,"(-10, 10]","[0.0, 100.0, 200.0, 300.0, 400.0, 500.0, 600.0...","[1.14815234148e-05, 0.000264571695572, 0.00143...",1.0,57.5,-3


In [17]:
# Loop through each dst_bin, mlat and mlt....calculate the KDEs
allDstBins = velGmagDF["dst_bin"].unique()
allNormMLts = velGmagDF["normMLT"].unique()
allMlats = velGmagDF["MLAT"].unique()
# We'll loop through and calculate KDEs at each MLAT, MLT and Dst_bin
# Store them in a DF
velsKdeDF = pandas.DataFrame(columns=["dst_bin", "vel_arr", "pdf", "normMLT", "MLAT", "dst_mean"])
dstBinArr = []
dstMeanArr = []
normMLTArr = []
mlatArr = []
velArr = []
pdfArr = []
# Define a inpVel arr to calculate kde
inpVelsArr = numpy.arange(0.,2100.,100.)
for cDst in allDstBins:
    for cNmlt in allNormMLts:
        for cMlat in allMlats:
            currVelDF = velGmagDF[ (velGmagDF["dst_bin"] == cDst) &\
                   (velGmagDF["MLAT"] == cMlat) &\
                   (velGmagDF["normMLT"] == cNmlt)].reset_index(drop=True)
            if currVelDF.shape[0] == 0:
                continue
            velKernel = stats.gaussian_kde( currVelDF["vSaps"] )
            pdfKernArr = velKernel.pdf( inpVelsArr )
            for npd in range(pdfKernArr.shape[0]):
                dstBinArr.append( cDst )
                normMLTArr.append( cNmlt )
                mlatArr.append( cMlat )
                velArr.append( inpVelsArr[npd] )
                pdfArr.append( pdfKernArr[npd] )
                dstMeanArr.append( currVelDF["dst_mean"].unique()[0] )
            
velsKdeDF["dst_bin"] = dstBinArr
velsKdeDF["vel_arr"] = velArr
velsKdeDF["pdf"] = pdfArr
velsKdeDF["normMLT"] = normMLTArr
velsKdeDF["MLAT"] = mlatArr
velsKdeDF["dst_mean"] = dstMeanArr
velsKdeDF.head()

Unnamed: 0,dst_bin,vel_arr,pdf,normMLT,MLAT,dst_mean
0,"(-10, 10]",0.0,7e-06,1.0,58.5,-3
1,"(-10, 10]",100.0,0.000156,1.0,58.5,-3
2,"(-10, 10]",200.0,0.00098,1.0,58.5,-3
3,"(-10, 10]",300.0,0.002113,1.0,58.5,-3
4,"(-10, 10]",400.0,0.002154,1.0,58.5,-3


In [19]:
# Define the fitting function
# We know that the velocities are 
# exhibiting a skew normal distribution
def fit_vel_pdf((mlt,mlat, dst, inpVels), a_ascmlt, b_ascmlt, a_bscmlt, b_bscmlt, a_cscmlt, b_cscmlt,\
               a_asclat, b_asclat, a_bsclat, b_bsclat,\
               a_ashmlt, b_ashmlt, a_bshmlt, b_bshmlt,\
               a_ashlat, b_ashlat, a_bshlat, b_bshlat,\
               a_alcmlt, b_alcmlt, a_blcmlt, b_blcmlt,\
               a_alclat, b_alclat, a_blclat, b_blclat):
    
    # model scale parameters
    # mlt
    a_scale_mlt = a_ascmlt + b_ascmlt * dst
    b_scale_mlt = a_bscmlt + b_bscmlt * dst
    c_scale_mlt = a_cscmlt + b_cscmlt * dst
    # mlat
    a_scale_mlat = a_asclat + b_asclat * dst
    b_scale_mlat = a_bsclat + b_bsclat * dst
    # func
    scale = ( a_scale_mlt + b_scale_mlt*(mlt) + c_scale_mlt*(mlt**2) ) * ( a_scale_mlat + b_scale_mlat*(mlat) )
    
    # model shape parameters
    # mlt
    a_shape_mlt = a_ashmlt + b_ashmlt * dst
    b_shape_mlt = a_bshmlt + b_bshmlt * dst
    # mlat
    a_shape_mlat = a_ashlat + b_ashlat * dst
    b_shape_mlat = a_bshlat + b_bshlat * dst
    # func
    shape = ( a_shape_mlt + b_shape_mlt*(mlt) ) * ( a_shape_mlat + b_shape_mlat*(mlat) )
    
    # model loc parameters
    # mlt
    a_loc_mlt = a_alcmlt + b_alcmlt * dst
    b_loc_mlt = a_blcmlt + b_blcmlt * dst
    # malt
    a_loc_mlat = a_alclat + b_alclat * dst
    b_loc_mlat = a_blclat + b_blclat * dst
    # func
    loc = ( a_loc_mlt + b_loc_mlt*(mlt) ) * ( a_loc_mlat*numpy.exp(b_loc_mlat*mlat) )
    
    # we need to adjust the skewnormal distribution
    # to account fot loc and scale parameters
    inpData = (inpVels - loc)/scale
    skNrml = 2*stats.norm.pdf(inpData)*stats.norm.cdf(shape*inpData)
    return skNrml.ravel()

initGuess = ( -1e+4, -1e+4, 1e3, 1e3, -10, -10,\
             100, 100, 10, 10,\
             1, 1, 10, 10,\
            -1, -1, 100, 100,\
            -100, -100, 100, 100,\
            4, 4, 0.1, 0.1)
popt2, pcov2 = curve_fit(fit_vel_pdf, (velsKdeDF['MLAT'].T,velsKdeDF['normMLT'].T,velsKdeDF['dst_mean'].T,\
                                       velsKdeDF['vel_arr'].T), velsKdeDF['pdf'], p0=initGuess)
print popt2



[  1.81279171e+07   1.93652046e+04   1.90879322e+05   2.59587572e+03
   5.08050175e+02   3.90666285e+01  -2.17901502e+05  -2.65142096e+03
   7.34784439e+04   1.44995005e+04  -2.07575366e+01  -1.92963480e+01
  -1.32645784e+03  -8.71622976e+02   8.58992665e+01   1.29636048e+02
  -6.18879148e+01  -9.61897284e+02  -8.28997345e+06  -1.20774415e+05
  -1.74901014e+05  -1.20689409e+03  -8.82786565e+03  -3.77392763e+01
   2.73069706e+02   1.97731228e+00]
