In [1]:
import pandas
import datetime
import numpy
import scipy.optimize
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
import bs4
import urllib
from matplotlib.colors import ListedColormap
from matplotlib.colors import Normalize
from matplotlib import ticker
%matplotlib inline
from matplotlib import rcParams

In [2]:
# setup some cutoff values we'll use in the analysis
velCutoffUpper = 2000.
velCutoffLower = 0.
numPointsCutoffMLTMLAT = 250
mlatCutOffUpper = 70.

In [3]:
velGmagDF = pandas.read_csv("../data/processed-vels-geomag.txt", sep=' ',\
                            parse_dates=["date", "dst_date"], infer_datetime_format=True)
velGmagDF = velGmagDF.drop('Unnamed: 0', axis=1)
# Filter out some values where number of datapoints are pretty low.
countDF = velGmagDF.groupby([ "normMLT", "MLAT" ]).size().reset_index()
countDF.columns = [ "normMLT", "MLAT", "count" ]
# Choose only columns which have atleast 100 points
countDF = countDF[ countDF["count"] >= numPointsCutoffMLTMLAT ].reset_index(drop=True)
# Merge with velsDataDF to filter out unwanted values
velGmagDF = pandas.merge( velGmagDF, countDF,\
                          on=["normMLT", "MLAT"], how='inner' )
# Discard unwanted values
# We'll only consider those velocities 
# which lie between 0 and 2500 m/s
# and located below 70 MLAT
velGmagDF = velGmagDF[ (velGmagDF["vSaps"] > velCutoffLower) \
                        & (velGmagDF["vSaps"] < velCutoffUpper)\
                       ].reset_index(drop=True)
velGmagDF = velGmagDF[ velGmagDF["MLAT"] < mlatCutOffUpper ].reset_index(drop=True)
velGmagDF.head()

Unnamed: 0,normMLT,MLAT,vSaps,azim,vMagnErr,azimErr,dtStr,tmStr,date,hour,dst_date,dst_index,dst_bin,AE,AL,AO,AU,minute,count
0,-4,60.5,634.32,-10.1,7.19,21.54,20130316,500,2013-03-16 05:00:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",731,-575,-210,156,0,2677
1,-4,60.5,609.64,-10.92,3.58,9.73,20130316,502,2013-03-16 05:02:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",651,-524,-199,127,2,2677
2,-4,60.5,613.71,-12.2,4.15,8.27,20130316,504,2013-03-16 05:04:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",586,-470,-177,116,4,2677
3,-4,60.5,778.5,-16.19,3.8,4.9,20130316,506,2013-03-16 05:06:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",562,-418,-137,144,6,2677
4,-4,60.5,578.11,-15.4,7.43,12.92,20130316,508,2013-03-16 05:08:00,5,2013-03-16 05:00:00,-19,"(-25, -10]",569,-450,-166,119,8,2677


In [4]:
# Lets get max, min and std.dev across each MLT for each time
# this way we'll try to isolate SAID channels
maxVelDF = velGmagDF.groupby( ["date", "normMLT"] )[ "vSaps", "azim", "MLAT" ].max().reset_index()
maxVelDF.rename(columns={'vSaps': 'vSapsMax','azim': 'azimMax','MLAT': 'mlatMax'}, inplace=True)

minVelDF = velGmagDF.groupby( ["date", "normMLT"] )[ "vSaps", "azim", "MLAT" ].min().reset_index()
minVelDF.rename(columns={'vSaps': 'vSapsMin','azim': 'azimMin','MLAT': 'mlatMin'}, inplace=True)

medianVelDF = velGmagDF.groupby( ["date", "normMLT"] )[ "vSaps", "azim", "MLAT" ].median().reset_index()
medianVelDF.rename(columns={'vSaps': 'vSapsMedian','azim': 'azimMedian','MLAT': 'mlatMedian'}, inplace=True)

stdVelDF = velGmagDF.groupby( ["date", "normMLT"] )[ "vSaps", "azim", "MLAT" ].std().reset_index()
stdVelDF.rename(columns={'vSaps': 'vSapsStd','azim': 'azimStd','MLAT': 'mlatStd'}, inplace=True)

sapsDF = maxVelDF.merge( minVelDF, on=["date","normMLT"] ).\
                merge( medianVelDF, on=["date","normMLT"] ).\
                merge( stdVelDF, on=["date","normMLT"] )

sapsDF.head()

Unnamed: 0,date,normMLT,vSapsMax,azimMax,mlatMax,vSapsMin,azimMin,mlatMin,vSapsMedian,azimMedian,mlatMedian,vSapsStd,azimStd,mlatStd
0,2011-02-05 02:30:00,-5,704.08,-7.26,63.5,227.38,-9.81,59.0,580.705,-8.355,61.25,146.690838,0.949695,1.513825
1,2011-02-05 02:30:00,-4,704.08,-7.26,63.0,543.42,-9.81,59.5,606.97,-7.96,61.25,58.850929,0.951502,1.224745
2,2011-02-05 02:30:00,-3,635.56,-5.38,63.0,531.04,-7.26,59.5,612.11,-6.355,61.25,36.448429,0.667014,1.224745
3,2011-02-05 02:32:00,-5,615.93,-4.76,63.5,213.15,-8.53,59.0,555.81,-7.26,61.25,161.409982,1.01458,1.513825
4,2011-02-05 02:32:00,-4,615.93,-4.76,63.0,498.32,-8.53,59.5,586.435,-6.955,61.25,45.000333,1.301498,1.224745


In [5]:
# We'll set up a few criteria for isolating SAID channels
# 1) SAID datetimes have max velocities greater than 1000 m/s
sapsDF = sapsDF[ sapsDF["vSapsMax"] >= 1000. ].reset_index(drop=True)
# 2) Latitude difference (max-min) is greater than 2 degrees
sapsDF = sapsDF[ (sapsDF["mlatMax"] - sapsDF["mlatMin"]) > 2. ].reset_index(drop=True)
# 3) Stdev in Velocities should be greater than 250 m/s
sapsDF = sapsDF[ sapsDF["vSapsStd"] > 250. ].reset_index(drop=True)
sapsDF.head()

Unnamed: 0,date,normMLT,vSapsMax,azimMax,mlatMax,vSapsMin,azimMin,mlatMin,vSapsMedian,azimMedian,mlatMedian,vSapsStd,azimStd,mlatStd
0,2011-03-02 02:26:00,-2,1923.54,-5.61,62.5,928.08,-12.97,59.0,1312.33,-7.7,60.5,341.606848,3.153589,1.205148
1,2011-03-02 02:28:00,-2,1594.03,-5.81,61.5,877.95,-15.31,59.0,1259.895,-8.625,60.25,292.061807,3.848966,0.935414
2,2011-03-12 00:50:00,-5,1179.78,4.11,61.0,188.28,2.68,58.5,942.67,3.205,59.75,346.923597,0.727225,0.935414
3,2011-03-12 00:50:00,-4,1128.93,4.25,61.0,489.12,2.68,58.0,1004.24,4.11,59.5,263.770741,0.803818,1.080123
4,2011-03-12 00:52:00,-4,1185.96,4.02,60.5,465.18,3.73,58.0,1024.165,3.735,59.25,250.021061,0.116862,0.935414


In [6]:
# Now merge the sapsDF dates with velGmagDF
sapsDF = pandas.merge( sapsDF, velGmagDF, on=["date", "normMLT"] )
# Now we need to select columns which have velocities close to vMax
# I'll set this to be within 100 m/s of vMax
sapsDF = sapsDF[ (sapsDF["vSapsMax"] - sapsDF["vSaps"] ) <= 100. ].reset_index()
selCols = [ "date", "normMLT", "vSaps", "vSapsMax", "vSapsMin", "vSapsMedian", "MLAT"]


In [7]:
dateMLTCountDF = sapsDF.groupby( ["date"] )["normMLT"].nunique()
# We'll consider only those dates where SAID channel is present for more than 1 MLT
# This will prevent any stray values from entering the results
selDateList = dateMLTCountDF[dateMLTCountDF>1].index.tolist()
# Select only those dates which are in the list
sapsDF = sapsDF[ sapsDF["date"].isin(selDateList) ].reset_index(drop=True)
print sapsDF[selCols]

                    date  normMLT    vSaps  vSapsMax  vSapsMin  vSapsMedian  \
0    2011-03-12 00:50:00       -5  1179.78   1179.78    188.28      942.670   
1    2011-03-12 00:50:00       -4  1092.67   1128.93    489.12     1004.240   
2    2011-03-12 00:50:00       -4  1128.93   1128.93    489.12     1004.240   
3    2011-03-12 00:50:00       -4  1092.67   1128.93    489.12     1004.240   
4    2011-04-03 04:08:00       -4  1234.03   1252.99    425.48     1101.470   
5    2011-04-03 04:08:00       -4  1244.73   1252.99    425.48     1101.470   
6    2011-04-03 04:08:00       -4  1252.99   1252.99    425.48     1101.470   
7    2011-04-03 04:08:00       -4  1252.99   1252.99    425.48     1101.470   
8    2011-04-03 04:08:00       -4  1245.52   1252.99    425.48     1101.470   
9    2011-04-03 04:08:00       -3  1450.65   1450.65    529.48      882.170   
10   2011-04-03 04:12:00       -5  1271.42   1353.90    535.96     1108.390   
11   2011-04-03 04:12:00       -5  1353.90   1353.90

In [8]:
# sapsDF[ sapsDF["date"] == datetime.datetime(2011,4,3,4,26) ][selCols]
latWidthDF = sapsDF.groupby( ["date", "normMLT"] )["MLAT"].aggregate(lambda x: max(x)-min(x)).reset_index()
selLatWidthDates = latWidthDF[ latWidthDF["MLAT"] <= 2. ]["date"].tolist()
saidDF = sapsDF[ sapsDF["date"].isin(selLatWidthDates) ].reset_index(drop=True) 
saidDF.head()

Unnamed: 0,index,date,normMLT,vSapsMax,azimMax,mlatMax,vSapsMin,azimMin,mlatMin,vSapsMedian,...,hour,dst_date,dst_index,dst_bin,AE,AL,AO,AU,minute,count
0,13,2011-03-12 00:50:00,-5,1179.78,4.11,61,188.28,2.68,58.5,942.67,...,0,2011-03-12 00:00:00,-60,"(-75, -50]",538,-333,-64,205,50,2066
1,21,2011-03-12 00:50:00,-4,1128.93,4.25,61,489.12,2.68,58.0,1004.24,...,0,2011-03-12 00:00:00,-60,"(-75, -50]",538,-333,-64,205,50,3091
2,22,2011-03-12 00:50:00,-4,1128.93,4.25,61,489.12,2.68,58.0,1004.24,...,0,2011-03-12 00:00:00,-60,"(-75, -50]",538,-333,-64,205,50,2919
3,23,2011-03-12 00:50:00,-4,1128.93,4.25,61,489.12,2.68,58.0,1004.24,...,0,2011-03-12 00:00:00,-60,"(-75, -50]",538,-333,-64,205,50,3120
4,70,2011-04-03 04:08:00,-4,1252.99,-13.03,65,425.48,-19.7,59.5,1101.47,...,4,2011-04-03 04:00:00,-35,"(-50, -25]",322,-175,-14,147,8,1960


In [9]:
saidDF["dateStr"] = saidDF["date"].map(lambda x: x.strftime('%Y%m%d'))
saidDF["timeStr"] = saidDF["date"].map(lambda x: x.strftime('%H%M'))
timesDF = saidDF.groupby( ["dateStr"] )["timeStr"].aggregate(lambda x: len(list(set(x)))).reset_index()
# Select only those days where we have atleast 10 unique times of SAPS measurements
selMultiTimeDates = timesDF[ timesDF["timeStr"] >= 10 ]["dateStr"].tolist()
saidDF = saidDF[ saidDF["dateStr"].isin(selMultiTimeDates) ].reset_index(drop=True) 
saidDF.head()

Unnamed: 0,index,date,normMLT,vSapsMax,azimMax,mlatMax,vSapsMin,azimMin,mlatMin,vSapsMedian,...,dst_index,dst_bin,AE,AL,AO,AU,minute,count,dateStr,timeStr
0,70,2011-04-03 04:08:00,-4,1252.99,-13.03,65,425.48,-19.7,59.5,1101.47,...,-35,"(-50, -25]",322,-175,-14,147,8,1960,20110403,408
1,71,2011-04-03 04:08:00,-4,1252.99,-13.03,65,425.48,-19.7,59.5,1101.47,...,-35,"(-50, -25]",322,-175,-14,147,8,1671,20110403,408
2,72,2011-04-03 04:08:00,-4,1252.99,-13.03,65,425.48,-19.7,59.5,1101.47,...,-35,"(-50, -25]",322,-175,-14,147,8,1282,20110403,408
3,75,2011-04-03 04:08:00,-4,1252.99,-13.03,65,425.48,-19.7,59.5,1101.47,...,-35,"(-50, -25]",322,-175,-14,147,8,959,20110403,408
4,76,2011-04-03 04:08:00,-4,1252.99,-13.03,65,425.48,-19.7,59.5,1101.47,...,-35,"(-50, -25]",322,-175,-14,147,8,708,20110403,408


In [10]:
testDF = saidDF.groupby( ["dst_bin"] ).mean()
printCols = [ "normMLT", "vSaps", "vSapsMax", "vSapsMin", "vSapsMedian", "MLAT"]
testDF[printCols].head()

Unnamed: 0_level_0,normMLT,vSaps,vSapsMax,vSapsMin,vSapsMedian,MLAT
dst_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(-10, 10]",-1.445344,1354.061862,1371.701296,368.617004,867.161559,62.888664
"(-150, -75]",-4.518253,1456.834739,1476.956943,517.407275,1005.997464,57.079205
"(-25, -10]",-3.1522,1288.095089,1307.992152,370.997729,720.152955,61.939358
"(-50, -25]",-3.70309,1299.428893,1320.245742,347.136952,767.369534,60.28764
"(-75, -50]",-3.998545,1367.125578,1387.12904,417.925556,817.524651,58.714909


In [11]:
saidDF[ saidDF["dst_bin"] == "(-10, 10]" ][selCols].head(10)

Unnamed: 0,date,normMLT,vSaps,vSapsMax,vSapsMin,vSapsMedian,MLAT
510,2011-07-20 06:18:00,-3,1093.8,1154.5,500.4,558.3,63.0
511,2011-07-20 06:18:00,-3,1154.5,1154.5,500.4,558.3,63.5
512,2011-07-20 06:18:00,-3,1154.5,1154.5,500.4,558.3,64.0
513,2011-07-20 06:18:00,-2,1093.8,1154.5,459.78,496.835,63.0
514,2011-07-20 06:18:00,-2,1154.5,1154.5,459.78,496.835,63.5
515,2011-07-20 06:18:00,-2,1154.5,1154.5,459.78,496.835,64.0
516,2011-07-20 06:18:00,0,1013.18,1094.52,260.02,765.535,60.5
517,2011-07-20 06:18:00,0,1094.52,1094.52,260.02,765.535,61.0
518,2011-07-20 06:22:00,-1,1090.48,1090.48,347.34,448.54,61.0
519,2011-07-20 06:22:00,-1,1051.69,1090.48,347.34,448.54,61.5
