In [1]:
import pandas
import datetime
import numpy
from scipy.optimize import curve_fit
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.colors import Normalize
from matplotlib import ticker
from matplotlib import rcParams
%matplotlib inline

In [2]:
# setup some cutoff values we'll use in the analysis
velCutoffUpper = 2000.
velCutoffLower = 0.
numPointsCutoffMLTMLAT = 250
mlatCutOffUpper = 70.
probOccCutoff = 0.2

In [3]:
velGmagDF = pandas.read_csv("../data/processed-vels-geomag.txt", sep=' ')
velGmagDF = velGmagDF.drop('Unnamed: 0', axis=1)
# Discard unwanted values
# We'll only consider those velocities 
# which lie between 0 and 2500 m/s
# and located below 70 MLAT
velGmagDF = velGmagDF[ (velGmagDF["vSaps"] > velCutoffLower) \
                        & (velGmagDF["vSaps"] < velCutoffUpper)\
                       ].reset_index(drop=True)
velGmagDF = velGmagDF[ velGmagDF["MLAT"] < mlatCutOffUpper ].reset_index(drop=True)
# Now filter out velocities which have very few rate of occ.
# We calculat the prob and remove every measurement below 0.2 prob of occ.
mlatMLTDstCountDF = velGmagDF.groupby( ["MLAT", "normMLT", "dst_bin"] )["vSaps"].count().reset_index()
mlatMLTDstCountDF.columns = [ "MLAT", "normMLT", "dst_bin", "count" ]
dstMaxCntDF = mlatMLTDstCountDF.groupby( ["dst_bin"] )["count"].max().reset_index()
dstMaxCntDF.columns = [ "dst_bin", "maxCntDst" ]
mlatMLTDstCountDF = pandas.merge( mlatMLTDstCountDF, dstMaxCntDF, on=[ "dst_bin" ] )
mlatMLTDstCountDF["probOcc"] = mlatMLTDstCountDF["count"]/mlatMLTDstCountDF["maxCntDst"]
mlatMLTDstCountDF = mlatMLTDstCountDF[ mlatMLTDstCountDF["probOcc"] > probOccCutoff ].reset_index(drop=True)
# Filter out MLATs and MLTs (at the Dst bins)
# where number of measurements is low. We do
# this by merging the mlatMLTDstCountDF with velDF.
velGmagDF = pandas.merge( velGmagDF,\
                         mlatMLTDstCountDF,\
                         on=[ "MLAT", "normMLT", "dst_bin" ] )
velGmagDF = velGmagDF[ [ "normMLT", "MLAT", "vSaps",\
                        "azim", "dst_bin", "dst_index", "count", "maxCntDst" ] ]
# Divide the velocities into bins
velBins = [ v for v in range(0,int(velCutoffUpper)+100,100) ]
velGmagDF = pandas.concat( [ velGmagDF, \
                    pandas.cut( velGmagDF["vSaps"], \
                               bins=velBins ) ], axis=1 )
velGmagDF.columns = [ "normMLT", "MLAT", "vSaps",\
                        "azim", "dst_bin", "dst_index", "count",\
                         "maxCntDst", "vel_bin" ]
# velGmagDF.head()
# Get a DF with mean Dst in each bin
dstMeanDF = velGmagDF.groupby( ["dst_bin"] ).mean()["dst_index"].astype(int).reset_index()
dstMeanDF.columns = [ "dst_bin", "dst_mean" ]
velGmagDF = pandas.merge( velGmagDF, dstMeanDF, on=["dst_bin"] )
velGmagDF = velGmagDF.sort( ["dst_mean"], ascending=False ).reset_index(drop=True)
velGmagDF.head()



Unnamed: 0,normMLT,MLAT,vSaps,azim,dst_bin,dst_index,count,maxCntDst,vel_bin,dst_mean
0,1.0,58.0,927.34,-12.52,"(-10, 10]",-6.0,293,508,"(900, 1000]",-3
1,-3.0,61.0,755.23,-17.91,"(-10, 10]",-8.0,277,508,"(700, 800]",-3
2,-3.0,61.0,743.78,-17.19,"(-10, 10]",-8.0,277,508,"(700, 800]",-3
3,-3.0,61.0,800.52,-14.51,"(-10, 10]",-8.0,277,508,"(800, 900]",-3
4,-3.0,61.0,535.88,-19.86,"(-10, 10]",-8.0,277,508,"(500, 600]",-3


In [11]:
# Define the fitting function
# We know that the velocities are 
# exhibiting a skew normal distribution
def fit_vel_pdf((mlt,mlat, dst, inpVels), a_ascmlt, b_ascmlt, a_bscmlt, b_bscmlt, a_cscmlt, b_cscmlt,\
               a_asclat, b_asclat, a_bsclat, b_bsclat,\
               a_ashmlt, b_ashmlt, a_bshmlt, b_bshmlt,\
               a_ashlat, b_ashlat, a_bshlat, b_bshlat,\
               a_alcmlt, b_alcmlt, a_blcmlt, b_blcmlt,\
               a_alclat, b_alclat, a_blclat, b_blclat):
    
    # model scale parameters
    # mlt
    a_scale_mlt = a_ascmlt + b_ascmlt * dst
    b_scale_mlt = a_bscmlt + b_bscmlt * dst
    c_scale_mlt = a_cscmlt + b_cscmlt * dst
    # mlat
    a_scale_mlat = a_asclat + b_asclat * dst
    b_scale_mlat = a_bsclat + b_bsclat * dst
    # func
    scale = ( a_scale_mlt + b_scale_mlt*(mlt) + c_scale_mlt*(mlt**2) ) * ( a_scale_mlat + b_scale_mlat*(mlat) )
    
    # model shape parameters
    # mlt
    a_shape_mlt = a_ashmlt + b_ashmlt * dst
    b_shape_mlt = a_bshmlt + b_bshmlt * dst
    # mlat
    a_shape_mlat = a_ashlat + b_ashlat * dst
    b_shape_mlat = a_bshlat + b_bshlat * dst
    # func
    shape = ( a_shape_mlt + b_shape_mlt*(mlt) ) * ( a_shape_mlat + b_shape_mlat*(mlat) )
    
    # model loc parameters
    # mlt
    a_loc_mlt = a_alcmlt + b_alcmlt * dst
    b_loc_mlt = a_blcmlt + b_blcmlt * dst
    # malt
    a_loc_mlat = a_alclat + b_alclat * dst
    b_loc_mlat = a_blclat + b_blclat * dst
    # func
    loc = ( a_loc_mlt + b_loc_mlt*(mlt) ) * ( a_loc_mlat*numpy.exp(b_loc_mlat*mlat) )
    
    # we need to adjust the skewnormal distribution
    # to account fot loc and scale parameters
    inpData = (inpVels - loc)/scale
    skNrml = 2*stats.norm.pdf(inpData)*stats.norm.cdf(shape*inpData)
     
    return skNrml.ravel()

initGuess = ( -1e+4, -1e+4, 1e3, 1e3, -10, -10,\
             100, 100, 10, 10,\
             1, 1, 10, 10,\
            -1, -1, 100, 100,\
            -100, -100, 100, 100,\
            4, 4, 0.1, 0.1)
popt2, pcov2 = curve_fit(fit_vel_pdf, (velGmagDF['MLAT'].T,velGmagDF['normMLT'].T,velGmagDF['dst_index'].T,\
                                       velGmagDF['vSaps'].T), velGmagDF['vSaps'], p0=initGuess)
print popt2

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0


[ -1.00000000e+04  -1.00000000e+04   1.00000000e+03   1.00000000e+03
  -1.00000000e+01  -1.00000000e+01   1.00000000e+02   1.00000000e+02
   1.00000000e+01   1.00000000e+01   1.00000000e+00   1.00000000e+00
   1.00000000e+01   1.00000000e+01  -1.00000000e+00  -1.00000000e+00
   1.00000000e+02   1.00000000e+02  -1.00000000e+02  -1.00000000e+02
   1.00000000e+02   1.00000000e+02   4.00000000e+00   4.00000000e+00
   1.00000000e-01   1.00000000e-01]




In [9]:
aa = [ -1,1,2,2,2,3,4 ]
print stats.norm.cdf(aa)


[ 0.15865525  0.84134475  0.97724987  0.97724987  0.97724987  0.9986501
  0.99996833]
