In [1]:
import pandas
import datetime
import numpy
from scipy.optimize import curve_fit
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib import ticker
import os
%matplotlib inline

In [13]:
# Load the processed SAPS observations, tag every row with a Dst bin and
# export a reduced copy that is used to look up the grid file data.
datFileName = "../data/processedSaps-new.txt"
sapsDataDF = pandas.read_csv(datFileName, sep=' ')
# add dst_bins
# pandas.cut builds right-closed intervals such as (-25, -10]; rows whose
# dst_index falls outside (-150, 10] end up with a NaN bin.
dstBins = [ -150, -75, -50, -25, -10, 10 ]
dstBinCol = pandas.cut( sapsDataDF["dst_index"], bins=dstBins )
sapsDataDF = pandas.concat( [ sapsDataDF, dstBinCol ], axis=1 )
# The labels have to be a FLAT list: wrapping them in another list makes
# pandas build a one-level MultiIndex, which breaks the plain column
# selection below and the merge on these names further down.
sapsDataDF.columns = [
    "dateStr", "hour", "sapsLat",
    "sapsMLT", "sapsVel", "radId", "poesLat",
    "poesMLT", "dst_date", "dst_index", "time", "dst_bin"
]
# get a csv file with fewer columns for getting grid file data
sapsDataCsv = sapsDataDF[ ["dateStr", "sapsLat", "sapsMLT", "radId", "time"] ]
sapsDataCsv.to_csv("../data/data_for_grid.txt", sep=' ', index=False)
sapsDataDF.head()

Unnamed: 0,dateStr,hour,sapsLat,sapsMLT,sapsVel,radId,poesLat,poesMLT,dst_date,dst_index,time,dst_bin
0,20110107,0,56.5,17.7543,308.2077,33.0,62.0082,18.0,2011-01-07 00:00:00,-18.0,0,"(-25, -10]"
1,20110107,0,55.5,18.0147,224.1588,33.0,62.0082,18.0,2011-01-07 00:00:00,-18.0,0,"(-25, -10]"
2,20110107,0,56.5,17.8749,307.4328,33.0,62.0082,18.0,2011-01-07 00:00:00,-18.0,0,"(-25, -10]"
3,20110107,0,55.5,18.1324,222.4787,33.0,62.0082,18.0,2011-01-07 00:00:00,-18.0,0,"(-25, -10]"
4,20110107,0,56.5,17.9955,305.4201,33.0,62.0082,18.0,2011-01-07 00:00:00,-18.0,0,"(-25, -10]"


In [14]:
# Read the gridded velocity vectors. The file has no header row, so the
# column names are supplied explicitly.
grdFileName = "../data/gridVecs.txt"
prcsdGrdFileName = "../data/gridVecs-processed.txt"
# The processed file is read instead of the raw one; the disabled command
# below squeezes repeated spaces in the raw file:
# os.system( 'cat ' + grdFileName + ' | tr -s "  " > ' + "../data/gridVecs-processed.txt" )
inpColNames = [
    "dateStr", "time", "sapsLat", "sapsLon",
    "grdVelMagn", "grdVelAzim", "radId", "sapsMLT",
]
gridVecsDF = pandas.read_csv(
    prcsdGrdFileName,
    sep=' ',
    header=None,
    names=inpColNames,
)
gridVecsDF.head()

Unnamed: 0,dateStr,time,sapsLat,sapsLon,grdVelMagn,grdVelAzim,radId,sapsMLT
0,20110107,0,56.5,344.6231,83.6037,-47.4307,33,17.7543
1,20110107,0,55.5,348.5294,96.6077,-42.6488,33,18.0147
2,20110107,0,56.5,346.4322,84.6154,-44.5228,33,17.8749
3,20110107,0,55.5,350.2941,95.6795,-35.1175,33,18.1324
4,20110107,0,56.5,348.2412,84.616,-39.3513,33,17.9955


In [15]:
# Merge the grid vectors DF and saps data DF
# The default (inner) merge keeps only rows present in both frames.
# NOTE(review): "sapsLat" and "sapsMLT" are float keys, so rows only match
# when the values are exactly equal in both files - confirm that the rows
# dropped by the merge are expected.
# NOTE(review): this cell overwrites gridVecsDF, so it should be run only
# once per kernel session (re-running would merge the merged frame again).
# Single-argument print(...) produces the same text on Python 2 and 3.
print("saps DF size--> {}".format(sapsDataDF.shape))
print("grid DF size before merge--> {}".format(gridVecsDF.shape))
mergeKeys = [ "dateStr", "time", "sapsLat", "radId", "sapsMLT" ]
gridVecsDF = pandas.merge( gridVecsDF, sapsDataDF, on=mergeKeys )
print("grid DF size after merge--> {}".format(gridVecsDF.shape))

saps DF size--> (173136, 12)
grid DF size before merge--> (167653, 8)
grid DF size after merge--> (163444, 15)


In [16]:
# Assuming a perfectly westward flow for SAPS (azimuth of -90 deg),
# estimate the velocities inside the channel by dividing the gridded
# velocity magnitude by the cosine of the angle between the assumed flow
# direction and the gridded vector azimuth.
sapsAzim = -90.
# thresholds used by the two filters below (m/s)
minGrdVelMagn = 100.
maxEstSapsVel = 2500.
azimDiffRad = numpy.deg2rad( sapsAzim - gridVecsDF["grdVelAzim"] )
gridVecsDF["estSapsVels"] = numpy.abs(
    gridVecsDF["grdVelMagn"] / numpy.cos( azimDiffRad ) )
# only consider gridded (L-o-S) velocities of at least 100 m/s
gridVecsDF = gridVecsDF[
    gridVecsDF["grdVelMagn"] >= minGrdVelMagn ].reset_index(drop=True)
# MLT rounded to the nearest hour, used later as a grouping key
gridVecsDF["sapsMLTRounded"] = gridVecsDF["sapsMLT"].map(round)
# filter out all vectors where the estimated velocity exceeds 2500 m/s;
# this also removes the inf/NaN estimates produced when the cosine is ~0
gridVecsDF = gridVecsDF[
    gridVecsDF["estSapsVels"] <= maxEstSapsVel ].reset_index(drop=True)
gridVecsDF.head()

Unnamed: 0,dateStr,time,sapsLat,sapsLon,grdVelMagn,grdVelAzim,radId,sapsMLT,hour,sapsVel,poesLat,poesMLT,dst_date,dst_index,dst_bin,estSapsVels,sapsMLTRounded
0,20110107,0,59.5,343.2787,517.6003,-43.2727,33,17.6647,0,404.4858,62.0082,18.0,2011-01-07 00:00:00,-18.0,"(-25, -10]",755.101113,18.0
1,20110107,0,57.5,349.7409,104.1324,-30.7649,33,18.0955,0,368.7867,62.0082,18.0,2011-01-07 00:00:00,-18.0,"(-25, -10]",203.575813,18.0
2,20110107,0,59.5,345.2459,519.254,-37.6651,33,17.7958,0,405.7251,62.0082,18.0,2011-01-07 00:00:00,-18.0,"(-25, -10]",849.780327,18.0
3,20110107,0,58.5,349.4681,108.4252,-28.0684,33,18.0773,0,404.4406,62.0082,18.0,2011-01-07 00:00:00,-18.0,"(-25, -10]",230.43437,18.0
4,20110107,0,57.5,351.6062,124.2545,-23.8893,33,18.2199,0,363.6906,62.0082,18.0,2011-01-07 00:00:00,-18.0,"(-25, -10]",306.823324,18.0


In [17]:
# Relative probability of occurrence for every
# (dst_bin, rounded MLT, latitude) cell: the number of vectors in the cell
# divided by the largest cell count found within the same dst_bin.
# No rows are removed here; cells with a very low probability of
# occurrence are dropped once this frame is merged with the velocity
# statistics and thresholded on "probOcc".
dstGrps = gridVecsDF.groupby(["dst_bin", "sapsMLTRounded", "sapsLat"])
# "sapsVel" is only used as a column to count the rows of each group
dstSapsMLTLatCountDF = dstGrps["sapsVel"].count().reset_index()
dstSapsMLTLatCountDF.columns = ["dst_bin", "sapsMLT", "sapsLat", "dataCount"]
# largest cell count per dst_bin; taking the max of the count column
# directly avoids aggregating (and then dropping) the location columns
maxCntMLTLatDst = dstSapsMLTLatCountDF.groupby(
                        "dst_bin")["dataCount"].max().reset_index()
maxCntMLTLatDst.columns = ["dst_bin", "maxCount"]
dstSapsMLTLatCountDF = pandas.merge( dstSapsMLTLatCountDF, maxCntMLTLatDst,
                              on=["dst_bin"], how='inner' )
dstSapsMLTLatCountDF["probOcc"] = \
    dstSapsMLTLatCountDF["dataCount"] / dstSapsMLTLatCountDF["maxCount"]
dstSapsMLTLatCountDF.head()

Unnamed: 0,dst_bin,sapsMLT,sapsLat,dataCount,maxCount,probOcc
0,"(-10, 10]",0.0,50.5,1,737,0.001357
1,"(-10, 10]",0.0,54.5,2,737,0.002714
2,"(-10, 10]",0.0,55.5,10,737,0.013569
3,"(-10, 10]",0.0,56.5,26,737,0.035278
4,"(-10, 10]",0.0,57.5,79,737,0.107191


In [18]:
# Velocity statistics (max, median, mean, std) of the estimated SAPS
# velocities for every (dst_bin, rounded MLT, latitude) cell.
velLocGrps = gridVecsDF.groupby(["dst_bin", "sapsMLTRounded", "sapsLat"])
estVelsByLoc = velLocGrps["estSapsVels"]
maxVelLoc = estVelsByLoc.max()
medianVelLoc = estVelsByLoc.median()
meanVelLoc = estVelsByLoc.mean()
stdVelLoc = estVelsByLoc.std()
velLocStats = [ maxVelLoc, medianVelLoc, meanVelLoc, stdVelLoc ]
velLocDF = pandas.concat( velLocStats, axis=1 ).reset_index()
velLocDF.columns = [
    "dst_bin", "sapsMLT", "sapsLat",
    "max_vel", "med_vel", "mean_vel", "std_vel",
]
# MLT values of 12 and above are shifted down by 24, the rest are kept
mltVals = velLocDF['sapsMLT']
velLocDF['normMLT'] = numpy.where( mltVals >= 12, mltVals - 24, mltVals )
# attach the occurrence statistics of each cell, so that the cells can be
# selected on their probability of occurrence
velLocDF = pandas.merge(
    velLocDF,
    dstSapsMLTLatCountDF,
    on=["dst_bin", "sapsMLT", "sapsLat"],
)
# Only choose locations where probOcc > 0.25
isLikelyLoc = velLocDF["probOcc"] > 0.25
velLocDF = velLocDF[ isLikelyLoc ]
velLocDF.head()

Unnamed: 0,dst_bin,sapsMLT,sapsLat,max_vel,med_vel,mean_vel,std_vel,normMLT,dataCount,maxCount,probOcc
6,"(-10, 10]",0.0,59.5,1172.481298,301.334863,370.851056,201.813069,0.0,226,737,0.306649
7,"(-10, 10]",0.0,60.5,1683.578338,301.017709,381.930692,252.742325,0.0,268,737,0.363636
8,"(-10, 10]",0.0,61.5,1976.069233,315.40623,425.870125,310.188966,0.0,291,737,0.394844
9,"(-10, 10]",0.0,62.5,2267.615551,346.869158,447.443326,300.229211,0.0,254,737,0.34464
17,"(-10, 10]",1.0,57.5,1692.86801,302.584951,367.155872,221.731595,1.0,216,737,0.29308
