In [102]:
from os.path import expanduser
import pandas as pd
import simplejson
import numpy as np
import urllib
import json
import glob

# Generate Metadata File for 2008 to 2015

In [100]:
!ls ../data/meta

[34m2008[m[m [34m2009[m[m [34m2010[m[m [34m2011[m[m [34m2012[m[m [34m2013[m[m [34m2014[m[m [34m2015[m[m [34m2016[m[m


In [104]:
meta_dir = '../data/meta/*/d11/*text_meta_*.txt'
meta_files = glob.glob(meta_dir)
meta_file_list = []
for meta_file in meta_files:
    date = str('_'.join(meta_file.split('_')[4:7])).split('.')[0]
    df = pd.read_table(meta_file, index_col=None, header=0)
    date_col = pd.Series([date] * len(df))
    df['file_date'] = date_col
    meta_file_list.append(df)
    
print meta_files[0:5]
meta_frame = pd.concat(meta_file_list)

['../data/meta/2008/d11/d11_text_meta_2007_09_21.txt', '../data/meta/2008/d11/d11_text_meta_2008_03_06.txt', '../data/meta/2008/d11/d11_text_meta_2008_04_15.txt', '../data/meta/2008/d11/d11_text_meta_2008_04_16.txt', '../data/meta/2008/d11/d11_text_meta_2008_04_18.txt']


In [105]:
# the type in the meta data are just the detector types.  Need to analyze the "change" that cohort 1 referred to
no_dup_keep_last = meta_frame.drop_duplicates(subset='ID', keep='last') # TODO: assuming meta and 5min agree on freeway type...check?
print "unique count of stations: %s" % no_dup_keep_last.shape[0]

print "\ndistribution of Types of stations"
no_dup_keep_last.Type.value_counts()

unique count of stations: 1783

distribution of Types of stations


ML    979
OR    344
FR    263
HV    106
FF     81
CH      7
CD      3
Name: Type, dtype: int64

In [106]:
no_dup_keep_last.to_csv('../data/meta_2008_2015.csv')

In [107]:
no_dup_keep_last.Type.unique()

array(['ML', 'OR', 'FR', 'FF', 'HV', 'CD', 'CH'], dtype=object)

In [231]:
def create_freeway_vectors(frame_to_use, columns_to_select=['ID', 'Latitude', 'Longitude', 'Abs_PM', 'Lanes']):
    frame_to_use = frame_to_use[frame_to_use.Type == 'ML']
    to_loop = frame_to_use.groupby(['Fwy', 'Dir'])['ID'].count().reset_index()[['Fwy', 'Dir']].values

    ret = {}
    for Fwy, Dir in to_loop:
        if Dir == "N":
            sort_order = ('Abs_PM', True)        
        elif Dir == "S":
            sort_order = ('Abs_PM', True)        
        elif Dir == "E":
            sort_order = ('Abs_PM', True)        
        elif Dir == "W":
            sort_order = ('Abs_PM', True)
        
        
        tmp = frame_to_use[(frame_to_use.Fwy == Fwy) & (frame_to_use.Dir == Dir)]\
            .sort_values(by=sort_order[0], ascending=sort_order[1])[columns_to_select] # .drop_duplicates()
        tmp['order'] = pd.Series(index=tmp.index, data=sorted(range(0, len(tmp.ID)), reverse=(not sort_order[1])))
        ret["%s_%s" % (Fwy, Dir)] = tmp
    return ret 

In [206]:
def sorted_func(x):
    values = x.split('_')
    Fwy = int(values[0])
    Dir = values[1]
    if Dir == 'N' or Dir == 'E':
        dir_weight = 0
    else:
        dir_weight = 1
    return Fwy + dir_weight

In [277]:
freeway_vectors_update = create_freeway_vectors(
    no_dup_keep_last, [u'ID', u'Fwy', u'Dir', u'Abs_PM', u'Latitude', u'Longitude', u'Lanes', u'Name', 'Type'])

In [278]:
freeway_keys = sorted(freeway_vectors_update.keys(), key=sorted_func)
freeway_keys

['5_N',
 '5_S',
 '8_E',
 '8_W',
 '15_N',
 '15_S',
 '52_E',
 '52_W',
 '54_E',
 '54_W',
 '56_E',
 '56_W',
 '78_E',
 '78_W',
 '94_E',
 '94_W',
 '125_N',
 '125_S',
 '163_N',
 '163_S',
 '805_N',
 '805_S',
 '905_E',
 '905_W']

In [279]:
for ind, i in freeway_vectors_update['5_N'].iterrows():
    print i
    print i['ID']
    break

ID                        1114091
Fwy                             5
Dir                             N
Abs_PM                      0.057
Latitude                  32.5428
Longitude                 -117.03
Lanes                           6
Name         N/O CMNO DE LA PLAZA
Type                           ML
order                           0
Name: 730, dtype: object
1114091


# Create geojson using 2008 through 2015 data

In [235]:
# data for all districts if decide to upscale
# from pyspark.sql.functions import hour, mean,minute, stddev, count,max as psmax,min as psmin, date_format, \
#     split, explode

# from pyspark.sql import SQLContext
# from pyspark.sql import Row
# from pyspark.sql.types import *
# from pyspark.sql import DataFrameReader

In [236]:
# spark_df = spark.read.format("com.databricks.spark.csv").option("header", "true") \
#     .option("mode", "DROPMALFORMED") \
#     .load('../data/stats_2008_2015_d11.csv');

# spark_df.show()

In [237]:
df_new = pd.read_csv('../data/weekday_stats_2008_2015_d11.csv', usecols=range(1,5))
df_new.columns

Index([u'station', u'hour', u'minute', u'flow_mean'], dtype='object')

In [238]:
no_dup_keep_last.columns

Index([u'ID', u'Fwy', u'Dir', u'District', u'County', u'City', u'State_PM',
       u'Abs_PM', u'Latitude', u'Longitude', u'Length', u'Type', u'Lanes',
       u'Name', u'User_ID_1', u'User_ID_2', u'User_ID_3', u'User_ID_4',
       u'file_date'],
      dtype='object')

In [239]:
print no_dup_keep_last.Fwy.unique()

[ 94  78   5 805 125   8 163  15  52 905  56  54  67]


In [240]:
df_new.columns

Index([u'station', u'hour', u'minute', u'flow_mean'], dtype='object')

In [257]:
df_new['Time'] = pd.to_datetime(df_new['hour'].astype('str') + ':' + df_new['minute'].astype('str'),
                                format='%H:%M').dt.time

In [276]:
test = df_new[['station', 'Time', 'flow_mean']].as_matrix()
test

array([1108341, datetime.time(0, 0), 51.48754789270001], dtype=object)

In [250]:
# complete_with_meta = pd.merge(df_new, no_dup_keep_last[['ID', 'District', 'County', 'City', 'State_PM', 'Abs_PM',
#                                                       'Latitude', 'Longitude', 'Name', 'Lanes', 'Type', 'Fwy',
#                                                       'Dir']], how='left', left_on='station',
#                               right_on='ID')
# complete_with_meta['Time'] = pd.to_datetime(complete_with_meta['hour'].astype('str') + ':' + \
#                                             complete_with_meta['minute'].astype('str'),
#                                 format='%H:%M').dt.time

In [217]:
# average_day = []
# for key in freeway_keys:
#     Fwy, Dir = key.split('_')
#     tmp = complete_with_meta[(complete_with_meta.Fwy == int(Fwy)) & (complete_with_meta.Dir == Dir)]
#     average_day.append(tmp.groupby('ID')['flow_mean'].mean())
# df_avg = pd.concat(average_day)

In [218]:
# complete_with_meta_avg = pd.merge(pd.DataFrame(df_avg).reset_index(),
#                                   no_dup_keep_last[['ID', 'District', 'County', 'City', 'State_PM', 'Abs_PM',
#                                                     'Latitude', 'Longitude', 'Name', 'Lanes', 'Type', 'Fwy',
#                                                     'Dir']], how='left', left_on='ID',
#                               right_on='ID')

In [219]:
# complete_with_meta_avg.columns

Index([u'ID', u'flow_mean', u'District', u'County', u'City', u'State_PM',
       u'Abs_PM', u'Latitude', u'Longitude', u'Name', u'Lanes', u'Type',
       u'Fwy', u'Dir'],
      dtype='object')

In [251]:
complete_with_meta.Fwy.unique()

array([ 94,   8,   5, 805,  15,  78,  52, 163, 125,  56, 905,  54,  67])

In [282]:
df_new.columns

Index([u'station', u'hour', u'minute', u'flow_mean', u'Time'], dtype='object')

In [254]:
freeway_vectors_update_2008_2015['5_N'].count()

ID           38880
Fwy          38880
Dir          38880
Abs_PM       38880
Latitude     38880
Longitude    38880
Lanes        38880
Name         38880
Type         38880
flow_mean    38880
Time         38880
order        38880
dtype: int64

In [255]:
freeway_vectors_update_2008_2015['5_N'].columns

Index([u'ID', u'Fwy', u'Dir', u'Abs_PM', u'Latitude', u'Longitude', u'Lanes',
       u'Name', u'Type', u'flow_mean', u'Time', u'order'],
      dtype='object')

In [280]:
freeway_vectors_update[key].columns

Index([u'ID', u'Fwy', u'Dir', u'Abs_PM', u'Latitude', u'Longitude', u'Lanes',
       u'Name', u'Type', u'order'],
      dtype='object')

In [290]:
# example
temp = df_new[df_new.station == row['ID']][['Time', 'flow_mean']].sort_values(by='Time').set_index('Time')
print temp.head()
temp.as_matrix()[0:5]

          flow_mean
Time               
00:00:00  43.073009
00:05:00  45.417035
00:10:00  44.585177
00:15:00  44.297566
00:20:00  43.984513


array([[ 43.07300885],
       [ 45.4170354 ],
       [ 44.58517699],
       [ 44.29756637],
       [ 43.98451327]])

In [298]:
# update ML file for 2008 to 2015
final = {}

for key in freeway_keys:
    print key
    new_geojson = {'type': 'FeatureCollection', 'features': []}

    # freeway_vectors_update has all of the metadata info
    df = freeway_vectors_update[key]
    for idx, row in df.iterrows():
        properties = {'key': key,
                      'ID': row['ID'],
                      'Lanes': row['Lanes'],
                      'Name': row['Name'],
                      'Abs_PM': np.round(row['Abs_PM'], decimals=1),
                      'Order': row['order'],
                      'Type': row['Type'],
                     }
        flow_data = df_new[df_new.station == row['ID']][['Time', 'flow_mean']].sort_values(by='Time').set_index('Time')
        properties['Flow'] = flow_data.flow_mean.tolist()
        geometry = {'type': "Point", "coordinates": [row['Longitude'], row['Latitude']]}
        temp = {'type': 'Feature', 'properties': properties, "geometry": geometry}
        new_geojson['features'].append(temp)
#         break
    print "geojson len: %s" % len(new_geojson['features'])
    final[key] = {'visible': False, 'data': new_geojson}
#     print final
#     break
json.dump(final, open('../data/2015_to_2008_ML_d11_geojson_points2.json', 'w'))

5_N
geojson len: 135
5_S
geojson len: 119
8_E
geojson len: 49
8_W
geojson len: 49
15_N
geojson len: 87
15_S
geojson len: 84
52_E
geojson len: 27
52_W
geojson len: 29
54_E
geojson len: 3
54_W
geojson len: 3
56_E
geojson len: 17
56_W
geojson len: 14
78_E
geojson len: 19
78_W
geojson len: 26
94_E
geojson len: 16
94_W
geojson len: 23
125_N
geojson len: 36
125_S
geojson len: 38
163_N
geojson len: 15
163_S
geojson len: 17
805_N
geojson len: 73
805_S
geojson len: 76
905_E
geojson len: 11
905_W
geojson len: 13


## Calculate midpoint

source: http://www.movable-type.co.uk/scripts/latlong.html

var Bx = Math.cos(φ2) * Math.cos(λ2-λ1);

var By = Math.cos(φ2) * Math.sin(λ2-λ1);

var φ3 = Math.atan2(Math.sin(φ1) + Math.sin(φ2),
                    Math.sqrt( (Math.cos(φ1)+Bx)*(Math.cos(φ1)+Bx) + By*By ) );

var λ3 = λ1 + Math.atan2(By, Math.cos(φ1) + Bx);


In [343]:
# source:
# http://stackoverflow.com/questions/5895832/python-lat-long-midpoint-calculation-gives-wrong-result-when-longitude-90
import math

def midpoint(lat1, lon1, lat2, lon2, debug=False):
    if debug:
        print lat1, lon1
        print lat2, lon2
    lonA = math.radians(lon1)
    lonB = math.radians(lon2)
    latA = math.radians(lat1)
    latB = math.radians(lat2)

    dLon = lonB - lonA

    Bx = math.cos(latB) * math.cos(dLon)
    By = math.cos(latB) * math.sin(dLon)

    latC = math.atan2(math.sin(latA) + math.sin(latB),
                  math.sqrt((math.cos(latA) + Bx) * (math.cos(latA) + Bx) + By * By))
    lonC = lonA + math.atan2(By, math.cos(latA) + Bx)
    lonC = (lonC + 3 * math.pi) % (2 * math.pi) - math.pi

    return math.degrees(latC), math.degrees(lonC)

In [344]:
# test
midpoint(32.542842, -117.030331, 32.551690, -117.045725)

(32.54726623446039, -117.03802762069233)

In [346]:
# prototype
# shifted = freeway_vectors_update['5_N'].shift(-1)
# result = []
# final = []
# total = len(freeway_vectors_update['5_N'])
# print "total: %s" % total
# index = 0
# for idx, item in freeway_vectors_update['5_N'].iterrows():
# #     print item['order']
# #     print index
#     if item['order'] != (total - 1):
#         result.append(midpoint(item['Latitude'], item['Longitude'], shifted.iloc[index]['Latitude'],
#                                shifted.iloc[index]['Longitude']))
#         final.append([item['Latitude'], item['Longitude'], result[index][0], result[index][1]])
#     else:
#         final.append([result[index-1][0], result[index-1][1], item['Latitude'], item['Longitude']])
#     index += 1


total: 135


In [347]:
# example format for geojson for the line
{ 
    "type": "Feature",
    "properties":
    {
        "id": 2,
        "elevation": 50
    },
    "geometry":
    {
        "type": "LineString",
        "coordinates": 
        [
            [ 11.836395263671875, 47.75317468890147 ],
            [ 11.865234375, 47.73193447949174 ]
        ]
    }
}

{'geometry': {'coordinates': [[11.836395263671875, 47.75317468890147],
   [11.865234375, 47.73193447949174]],
  'type': 'LineString'},
 'properties': {'elevation': 50, 'id': 2},
 'type': 'Feature'}

In [None]:
[ 
{ "type": "Feature", "properties": { "id": 2, "elevation": 50 }, "geometry": { "type": "LineString", "coordinates": [ [ 11.836395263671875, 47.75317468890147 ], [ 11.865234375, 47.73193447949174 ] ] } },
{ "type": "Feature", "properties": { "id": 1, "elevation": 750 }, "geometry": { "type": "LineString", "coordinates": [ [ 11.865234375,47.73193447949174 ], [ 11.881027221679688, 47.700520033704954 ] ] } },
{ "type": "Feature", "properties": { "id": 0, "elevation": 1700 }, "geometry": { "type": "LineString", "coordinates": [ [ 11.881027221679688, 47.700520033704954 ], [ 11.923599243164062, 47.706527200903395 ] ] } },
{ "type": "Feature", "properties": { "id": 0, "elevation": 3000 }, "geometry": { "type": "LineString", "coordinates": [ [ 11.923599243164062, 47.706527200903395 ], [ 11.881027221679688, 47.700520033704954 ], ] } }
]

In [475]:
def calculate_segments(freeway_df, wig_dat):
    """
    This function will calculate the segments from the ordered stations and return
    an array of midpoints
    """
    shifted = freeway_df.shift(-1)
    result = []
    final = []
    data = []
    stations = []
    total = len(freeway_df)
    print "total: %s" % total
    index = 0
    for idx, item in freeway_df.iterrows():
    #     print item['order']
    #     print index
#         print item
        station = item['ID']
        stations.append(station)
        points = wig_dat[wig_dat.ID == station].reset_index().T.iloc[2:1442].T.ix[0].tolist()
#         print points
        data.append(points)
        if item['order'] == 0:
            result.append(midpoint(item['Latitude'], item['Longitude'], shifted.iloc[index]['Latitude'],
                                   shifted.iloc[index]['Longitude']))
            final.append([[item['Longitude'], item['Latitude']], [result[index][1], result[index][0]]])            
        elif item['order'] != (total - 1):
            result.append(midpoint(item['Latitude'], item['Longitude'], shifted.iloc[index]['Latitude'],
                                   shifted.iloc[index]['Longitude']))
            final.append([[result[index-1][1], result[index-1][0]], [result[index][1], result[index][0]]])
        else:
            final.append([[result[index-1][1], result[index-1][0]], [item['Longitude'], item['Latitude']]])
        index += 1
    return final, data, stations

In [476]:
example = pd.read_csv('../vis/WiggleVis/data/heatmaps/wiggle_analysis_%s_%s.csv' % (5, 'N'))
# print example.head()
segments, wiggles, stations = calculate_segments(freeway_vectors_update['5_N'], example)

total: 135


In [477]:
# update ML file for 2008 to 2015
final = {}

for key in freeway_keys:
    print key
    Fwy, Dir = key.split('_')
    wiggle_data = pd.read_csv('../vis/WiggleVis/data/heatmaps/wiggle_analysis_%s_%s.csv' % (Fwy, Dir))
    segments, wiggles, stations = calculate_segments(freeway_vectors_update[key], wiggle_data)

    data_to_store = {'type': "FeatureCollection", 'features': []}
    for seg, wig, stat in zip(segments, wiggles, stations):
        new_geojson = {'type': 'Feature'}
        properties = {'wiggles': wig, 'ID': stat}
        geometry = {'type': "LineString", "coordinates": seg}
        new_geojson['geometry'] = geometry
        new_geojson['properties'] = properties
#         print new_geojson
        data_to_store['features'].append(new_geojson)
    final[key] = data_to_store
#     print final
# print final
json_string = json.dumps(final)
final_string = 'var segment_data = ' + json_string
with open('../data/2015_to_2008_ML_d11_geojson_lines.js', 'w') as outfile:
    outfile.write(final_string)

5_N
total: 135
5_S
total: 119
8_E
total: 49
8_W
total: 49
15_N
total: 87
15_S
total: 84
52_E
total: 27
52_W
total: 29
54_E
total: 3
54_W
total: 3
56_E
total: 17
56_W
total: 14
78_E
total: 19
78_W
total: 26
94_E
total: 16
94_W
total: 23
125_N
total: 36
125_S
total: 38
163_N
total: 15
163_S
total: 17
805_N
total: 73
805_S
total: 76
905_E
total: 11
905_W
total: 13


In [470]:
final['125_S'][0]

{'geometry': {'coordinates': [[32.620053000000006, -116.97154099999999],
   [32.620053000000006, -116.97154099999996]],
  'type': 'LineString'},
 'properties': {'ID': 1119067,
  'wiggles': [-1.0,
   0.057515290842,
   0.26753338188000003,
   0.4659019436,
   0.643183579758,
   0.79104435687,
   0.902698912439,
   0.973263966957,
   1.0,
   0.9824281104070001,
   0.922317067161,
   0.8235437145100001,
   0.691837666898,
   0.534428103826,
   0.359616017343,
   0.17629914848499997,
   -0.00942421576635,
   -0.26045302519200003,
   -0.48702655272600004,
   -0.679604742244,
   -0.830812640189,
   -0.935731129271,
   -0.992038250494,
   -1.0,
   -0.962318471734,
   -0.883853255194,
   -0.7712386398190001,
   -0.63242409206,
   -0.476168456997,
   -0.311519327251,
   -0.14730808068,
   0.017659649508700002,
   0.31500368626200004,
   0.566935107575,
   0.764725240124,
   0.903257640584,
   0.981038196246,
   1.0,
   0.96512767379,
   0.8839347169239999,
   0.765833860174,
   0.621444097026,
