In [7]:
import pandas
import datetime
import numpy
from scipy.optimize import curve_fit
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib import ticker
%matplotlib inline

In [8]:
# Load the processed SAPS observations (space-delimited text) and tag every
# row with the Dst-index bin it falls into.
datFileName = "../data/processedSaps-new.txt"
sapsDataDF = pandas.read_csv(datFileName, sep=' ')
# add dst_bins
# Bin edges for dst_index. pandas.cut builds right-closed intervals such as
# (-25, -10], so values outside (-150, 10] end up with no bin (NaN).
dstBins = [ -150, -75, -50, -25, -10, 10 ]
sapsDataDF = pandas.concat( [ sapsDataDF, \
                    pandas.cut( sapsDataDF["dst_index"], \
                               bins=dstBins ) ], axis=1 )
# Positional rename: the list must name every column in order, i.e. the
# columns read from the file followed by the bin column appended above.
# It has to be a FLAT list -- a nested list makes recent pandas versions
# build a MultiIndex, which breaks the plain string lookups used later on.
sapsDataDF.columns = [ "dateStr", "hour","sapsLat", \
                     "sapsMLT", "sapsVel", "radId", "poesLat",\
                     "poesMLT", "dst_date", "dst_index", "time", "dst_bin" ]
# Columns not needed by the rest of the analysis. `axis` is passed by keyword
# because the positional form is rejected by current pandas releases.
sapsDataDF = sapsDataDF.drop(["hour", "radId", "poesLat", "poesMLT", "dst_date"], axis=1)
sapsDataDF.head()

Unnamed: 0,dateStr,sapsLat,sapsMLT,sapsVel,dst_index,time,dst_bin
0,20110107,56.5,17.7543,308.2077,-18.0,0,"(-25, -10]"
1,20110107,55.5,18.0147,224.1588,-18.0,0,"(-25, -10]"
2,20110107,56.5,17.8749,307.4328,-18.0,0,"(-25, -10]"
3,20110107,55.5,18.1324,222.4787,-18.0,0,"(-25, -10]"
4,20110107,56.5,17.9955,305.4201,-18.0,0,"(-25, -10]"


In [9]:
# Calculate the median dst in each dst bin
# Only dst_index is aggregated, so the result does not depend on which other
# columns sapsDataDF carries when this cell is (re-)run; later cells add
# columns to sapsDataDF in place.
dstBinMedians = sapsDataDF.groupby(["dst_bin"])["dst_index"].median()
# Series indexed by dst_bin -> frame with a fresh 0..n-1 index and the
# column order [dst_median, dst_bin].
dstBinMedians = dstBinMedians.to_frame("dst_median").reset_index()
dstBinMedians = dstBinMedians[ ["dst_median", "dst_bin"] ]
dstBinMedians.head()

Unnamed: 0,dst_median,dst_bin
0,-95.0,"(-150, -75]"
1,-57.0,"(-75, -50]"
2,-36.0,"(-50, -25]"
3,-18.0,"(-25, -10]"
4,-5.0,"(-10, 10]"


In [12]:
# calculate prob of occ by dst_bin, MLT, Lat
# Result: one row per observed (dst_bin, MLT hour, latitude) cell with
#   dataCount - number of non-null sapsVel values in that cell
#   maxCount  - largest dataCount found within the same dst_bin
#   probOcc   - dataCount / maxCount (so the busiest cell of a bin is 1.0)
#   dst_median - median Dst of the bin, taken from dstBinMedians
# NOTE(review): rounding can produce hour 24, which is kept as a separate
# sapsMLTRounded value although it maps to normMLT 0 exactly like hour 0 --
# confirm whether the two should be pooled.
sapsDataDF["sapsMLTRounded"] = sapsDataDF["sapsMLT"].map(lambda x: round(x) )
# get a normalized form of MLT where
# if MLT >= 12: MLT = MLT - 24, else MLT = MLT
roundedMLT = sapsDataDF["sapsMLTRounded"]
sapsDataDF['normMLT'] = roundedMLT.where( roundedMLT < 12, roundedMLT - 24 )
# Latitude expressed as an offset from 57.5
# (NOTE(review): the choice of 57.5 as reference is not explained here).
sapsDataDF['normLAT'] = sapsDataDF['sapsLat'] - 57.5
# Get max points at a given Lat, MLT, DstBin
# observed=True keeps only key combinations that occur in the data. dst_bin
# is categorical, and without it current pandas expands the result to the
# cartesian product of all key levels, adding zero-count rows.
# (Requires a pandas version that supports the `observed` keyword.)
grpCols = ["dst_bin", "sapsMLTRounded", "sapsLat", "normMLT", "normLAT"]
dstGrps = sapsDataDF.groupby(grpCols, observed=True)
dstSapsMLTLatCountDF = dstGrps["sapsVel"].count().reset_index()
# Name the columns explicitly instead of relying on their position.
dstSapsMLTLatCountDF = dstSapsMLTLatCountDF.rename(
    columns={ "sapsMLTRounded" : "sapsMLT", "sapsVel" : "dataCount" } )
# Largest cell count within each dst bin, used as the normalisation factor.
maxCntMLTLatDst = dstSapsMLTLatCountDF.groupby(
    ["dst_bin"], observed=True )["dataCount"].max().reset_index()
maxCntMLTLatDst.columns = ["dst_bin", "maxCount"]
dstSapsMLTLatCountDF = pandas.merge( dstSapsMLTLatCountDF, maxCntMLTLatDst, \
                              on=["dst_bin"], how='inner')
# String label of the MLT hour, e.g. "20".
dstSapsMLTLatCountDF["MLT"] = dstSapsMLTLatCountDF["sapsMLT"].map(lambda x: str(int(x)) )
dstSapsMLTLatCountDF["probOcc"] = dstSapsMLTLatCountDF["dataCount"]/dstSapsMLTLatCountDF["maxCount"]
# Also assign a median dst value for each dst bin
dstSapsMLTLatCountDF = pandas.merge( dstBinMedians, dstSapsMLTLatCountDF, on = "dst_bin")
dstSapsMLTLatCountDF.head()

Unnamed: 0,dst_median,dst_bin,sapsMLT,sapsLat,normMLT,normLAT,dataCount,maxCount,MLT,probOcc
0,-95.0,"(-150, -75]",0.0,51.5,0.0,-6.0,1,333,0,0.003003
1,-95.0,"(-150, -75]",0.0,52.5,0.0,-5.0,22,333,0,0.066066
2,-95.0,"(-150, -75]",0.0,53.5,0.0,-4.0,26,333,0,0.078078
3,-95.0,"(-150, -75]",0.0,54.5,0.0,-3.0,36,333,0,0.108108
4,-95.0,"(-150, -75]",0.0,55.5,0.0,-2.0,50,333,0,0.15015


In [5]:
# Build a complete (dst_bin, normLAT, normMLT) grid and attach the measured
# occurrence probabilities to it; grid points without data get a small
# default probability instead of NaN.
# "probSAPS", "MLT" and "Lat" are empty placeholder columns. The placeholder
# "MLT" collides with the "MLT" column of dstSapsMLTLatCountDF, so the merge
# below yields "MLT_x" (placeholder) and "MLT_y" (hour label).
sapsModelDF = pandas.DataFrame(columns=["normMLT", "normLAT", "probSAPS", "MLT", "Lat"])
# Loop-invariant lookups, computed once instead of on every iteration.
dstBinValues = dstSapsMLTLatCountDF["dst_bin"].unique()
normLatRange = range( int(dstSapsMLTLatCountDF["normLAT"].min()), \
                      int(dstSapsMLTLatCountDF["normLAT"].max()) + 1 )
normMltRange = range( int(dstSapsMLTLatCountDF["normMLT"].min()), \
                      int(dstSapsMLTLatCountDF["normMLT"].max()) + 1 )
latArr = []
mltArr = []
dstBinArr = []
# Row order: dst bin (outer), then latitude offset, then MLT offset (inner).
for dstBin in dstBinValues:
    for normLat in normLatRange:
        for normMlt in normMltRange:
            latArr.append(normLat)
            mltArr.append(normMlt)
            dstBinArr.append(dstBin)
sapsModelDF["normMLT"] = mltArr
sapsModelDF["normLAT"] = latArr
sapsModelDF["dst_bin"] = dstBinArr
# Outer merge keeps every grid point as well as every measured cell.
sapsModelDF = pandas.merge( sapsModelDF, dstSapsMLTLatCountDF, on=["normMLT", "normLAT", "dst_bin"], how="outer" )
# Probability assigned to grid points that have no observations
# (NOTE(review): the rationale for 0.02 is not documented here). Only
# probOcc is filled; the other columns coming from dstSapsMLTLatCountDF
# stay NaN for those rows.
defaultProbOcc = 0.02
sapsModelDF["probOcc"] = sapsModelDF["probOcc"].fillna(defaultProbOcc)
# Preview of the well-populated cells.
sapsModelDF[ sapsModelDF["probOcc"] > 0.5 ].head()

Unnamed: 0,normMLT,normLAT,probSAPS,MLT_x,Lat,dst_bin,sapsMLT,sapsLat,dataCount,maxCount,MLT_y,probOcc
74,-4,-4,,,,"(-150, -75]",20.0,53.5,181.0,333.0,20,0.543544
75,-3,-4,,,,"(-150, -75]",21.0,53.5,178.0,333.0,21,0.534535
95,-5,-3,,,,"(-150, -75]",19.0,54.5,182.0,333.0,19,0.546547
96,-4,-3,,,,"(-150, -75]",20.0,54.5,328.0,333.0,20,0.984985
97,-3,-3,,,,"(-150, -75]",21.0,54.5,268.0,333.0,21,0.804805
