In [38]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [39]:
parking = sqlContext.read.json("../data/sf_parking/sf_parking_clean.json")

In [40]:
parking.printSchema()

root
 |-- address: string (nullable = true)
 |-- garorlot: string (nullable = true)
 |-- landusetyp: string (nullable = true)
 |-- location_1: struct (nullable = true)
 |    |-- latitude: string (nullable = true)
 |    |-- longitude: string (nullable = true)
 |    |-- needs_recoding: boolean (nullable = true)
 |-- mccap: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- primetype: string (nullable = true)
 |-- regcap: string (nullable = true)
 |-- secondtype: string (nullable = true)
 |-- valetcap: string (nullable = true)



In [41]:
parking.first()

Row(address=u'2110 Market St', garorlot=u'L', landusetyp=u'restaurant', location_1=Row(latitude=u'37.767378', longitude=u'-122.429344', needs_recoding=False), mccap=u'0', owner=u'Private', primetype=u'PPA', regcap=u'13', secondtype=u' ', valetcap=u'0')

In [42]:
parking.registerTempTable("parking")

In [43]:
parking.show()

+--------------------+--------+----------+--------------------+-----+----------+---------+------+----------+--------+
|             address|garorlot|landusetyp|          location_1|mccap|     owner|primetype|regcap|secondtype|valetcap|
+--------------------+--------+----------+--------------------+-----+----------+---------+------+----------+--------+
|      2110 Market St|       L|restaurant|[37.767378,-122.4...|    0|   Private|      PPA|    13|          |       0|
|         993 Potrero|       L|          |[37.757272,-122.4...|    0|     SFMTA|      PPA|    34|          |       0|
|601 Terry A Franc...|       L|          |[37.770135,-122.3...|    0|Port of SF|      PPA|    72|          |       0|
|   11 SOUTH VAN NESS|       G|          |[37.77415,-122.41...|    0|   Private|      PHO|   130|       CPO|       0|
|   101 CALIFORNIA ST|       G|          |[37.793243,-122.3...|    0|   Private|      PPA|   250|          |       0|
|        2000 POST ST|       G|          |[37.785078,-12

In [44]:
aggr_by_type = sqlContext.sql("SELECT primetype, secondtype, count(1) AS count, round(avg(regcap), 0) AS avg_spaces " +
                              "FROM parking " +
                              "GROUP BY primetype, secondtype " +
                              "HAVING trim(primetype) != '' " +
                              "ORDER BY count DESC")

In [45]:
aggr_by_type.show()

+---------+----------+-----+----------+
|primetype|secondtype|count|avg_spaces|
+---------+----------+-----+----------+
|      PPA|          |  462|     210.0|
|      PHO|          |  300|      69.0|
|      CPO|          |  163|      53.0|
|      CGO|          |   49|     135.0|
|      PPA|       PHO|   19|     178.0|
|      PPA|       CPO|    2|     263.0|
|      PHO|       CPO|    1|     130.0|
|      PPA|       RPO|    1|      87.0|
|      CPO|       PPA|    1|      12.0|
+---------+----------+-----+----------+



In [46]:
from pyspark.sql import functions as F

aggr_by_type = parking.select("primetype", "secondtype", "regcap") \
                     .where("trim(primetype) != ''") \
                     .groupBy("primetype", "secondtype") \
                     .agg(
                        F.count("*").alias("count"),
                        F.round(F.avg("regcap"), 0).alias("avg_spaces")
                      ).sort("count", ascending=False)
                     

In [47]:
aggr_by_type.show()

+---------+----------+-----+----------+
|primetype|secondtype|count|avg_spaces|
+---------+----------+-----+----------+
|      PPA|          |  462|     210.0|
|      PHO|          |  300|      69.0|
|      CPO|          |  163|      53.0|
|      CGO|          |   49|     135.0|
|      PPA|       PHO|   19|     178.0|
|      PPA|       CPO|    2|     263.0|
|      PHO|       CPO|    1|     130.0|
|      PPA|       RPO|    1|      87.0|
|      CPO|       PPA|    1|      12.0|
+---------+----------+-----+----------+



In [48]:
parking.describe("regcap", "valetcap", "mccap").show()

+-------+------------------+------------------+------------------+
|summary|            regcap|          valetcap|             mccap|
+-------+------------------+------------------+------------------+
|  count|              1000|              1000|              1000|
|   mean|           137.294|             3.297|             0.184|
| stddev|361.05120902655824|22.624824279398823|1.9015151221485882|
|    min|                 0|                 0|                 0|
|    max|               998|                96|                 8|
+-------+------------------+------------------+------------------+



In [49]:
parking.stat.crosstab("owner", "primetype").show()

+-------------------+---+---+---+---+---+
|    owner_primetype|PPA|PHO|CPO|CGO|   |
+-------------------+---+---+---+---+---+
|         Port of SF|  7|  7|  0|  4|  0|
|               SFPD|  0|  3|  0|  6|  0|
|              SFMTA| 42| 14|  0|  0|  0|
|GG Bridge Authority|  2|  0|  0|  0|  0|
|               SFSU|  2|  6|  0|  0|  0|
|               SFRA|  2|  0|  0|  0|  0|
|                LHH|  0|  5|  0|  0|  0|
|                DMV|  0|  0|  1|  0|  0|
|           Caltrans|  0|  0|  0|  1|  0|
|           Presidio|  5|  1|  1|  2|  0|
|              SFPUC|  0|  0|  0|  5|  0|
|       City College|  0|  7|  0|  0|  0|
|                 UC|  1|  0|  0|  0|  0|
|                RPD|  3|  1|  0|  4|  0|
|              SFUSD|  0|  5|  0|  0|  0|
|        State of CA|  0|  1|  0|  0|  0|
|               USPS|  0|  0|  0|  4|  0|
|            Customs|  0|  0|  0|  1|  0|
|              GGNRA|  1|  0|  1|  0|  0|
|               UCSF| 13|  8|  0|  0|  0|
+-------------------+---+---+---+-

In [50]:
parking = parking.withColumnRenamed('regcap', 'regcap_old')
parking = parking.withColumn('regcap', parking['regcap_old'].cast('int'))
parking = parking.drop('regcap_old')
parking.printSchema()

root
 |-- address: string (nullable = true)
 |-- garorlot: string (nullable = true)
 |-- landusetyp: string (nullable = true)
 |-- location_1: struct (nullable = true)
 |    |-- latitude: string (nullable = true)
 |    |-- longitude: string (nullable = true)
 |    |-- needs_recoding: boolean (nullable = true)
 |-- mccap: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- primetype: string (nullable = true)
 |-- secondtype: string (nullable = true)
 |-- valetcap: string (nullable = true)
 |-- regcap: integer (nullable = true)



In [51]:
def convert_column(df, col, new_type):
    old_col = '%s_old' % col
    df = df.withColumnRenamed(col, old_col)
    df = df.withColumn(col, df[old_col].cast(new_type))
    df.drop(old_col)
    return df

In [52]:
parking = convert_column(parking, 'valetcap', 'int')
parking = convert_column(parking, 'mccap', 'int')
parking.printSchema()

root
 |-- address: string (nullable = true)
 |-- garorlot: string (nullable = true)
 |-- landusetyp: string (nullable = true)
 |-- location_1: struct (nullable = true)
 |    |-- latitude: string (nullable = true)
 |    |-- longitude: string (nullable = true)
 |    |-- needs_recoding: boolean (nullable = true)
 |-- mccap_old: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- primetype: string (nullable = true)
 |-- secondtype: string (nullable = true)
 |-- valetcap_old: string (nullable = true)
 |-- regcap: integer (nullable = true)
 |-- valetcap: integer (nullable = true)
 |-- mccap: integer (nullable = true)



In [79]:
import requests

def to_neighborhood(location):
    """
    Uses Google's Geocoding API to perform a reverse-lookup on latitude and longitude
    https://developers.google.com/maps/documentation/geocoding/intro#reverse-example
    """
    name = 'N/A'
    lat = location.latitude
    long = location.longitude
    
    r = requests.get('https://maps.googleapis.com/maps/api/geocode/json?latlng=%s,%s' % (lat, long))
    if r.status_code == 200:
        content = r.json()
        places = content['results']  # results is a list of matching places
        neighborhoods = [p['formatted_address'] for p in places if 'neighborhood' in p['types']]
        if neighborhoods:
            # Addresses are formatted as Japantown, San Francisco, CA
            # so split on comma and just return neighborhood name
            name = neighborhoods[0].split(',')[0]
    
    return name

In [80]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
    
location_to_neighborhood=udf(to_neighborhood, StringType())

parking.filter(parking.owner == 'SFMTA') \
       .select("location_1", "primetype", "garorlot", "regcap") \
       .withColumn("location_1", location_to_neighborhood("location_1")).show()

+------------------+---------+--------+------+
|        location_1|primetype|garorlot|regcap|
+------------------+---------+--------+------+
|  Mission District|      PPA|       L|    34|
|     Westwood Park|      PPA|       L|    20|
|        Tenderloin|      PPA|       G|   925|
|      Outer Sunset|      PPA|       L|    24|
|      Inner Sunset|      PPA|       L|    41|
|      Inner Sunset|      PPA|       L|    36|
|        Polk Gulch|      PPA|       G|   129|
|         Japantown|      PPA|       G|   747|
|   South of Market|      PHO|       G|    59|
|         Japantown|      PPA|       G|   177|
|       West Portal|      PPA|       L|    19|
|   Mission Terrace|      PPA|       L|    28|
|        Cow Hollow|      PPA|       G|   205|
|   Pacific Heights|      PPA|       L|    48|
|           Portola|      PPA|       L|    10|
|Financial District|      PPA|       G|  1095|
|  Mission District|      PPA|       G|   807|
|          Lakeside|      PPA|       L|    20|
|  Mission Di