In [1]:
spark.conf.set("fs.azure.account.key.bikedatacabattag.dfs.core.windows.net", dbutils.secrets.get(scope = "bikedata", key = "storagekey"))
spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true")
dbutils.fs.ls("abfss://bike@bikedatacabattag.dfs.core.windows.net/")
spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "false")

In [2]:
configs = {"fs.azure.account.auth.type": "OAuth",
           "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
           "fs.azure.account.oauth2.client.id": "f989fd61-edef-44fc-9980-a3643b2eaa54",
           "fs.azure.account.oauth2.client.secret": dbutils.secrets.get(scope = "bikedata", key = "oauthkey"),
           "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/72f988bf-86f1-41af-91ab-2d7cd011db47/oauth2/token"}

# Optionally, you can add <your-directory-name> to the source URI of your mount point.
dbutils.fs.mount(
  source = "abfss://bike@bikedatacabattag.dfs.core.windows.net/",
  mount_point = "/mnt/bike",
  extra_configs = configs)

In [3]:
display(dbutils.fs.ls("/mnt/bike/"))

path,name,size
dbfs:/mnt/bike/201306-citibike-tripdata.csv,201306-citibike-tripdata.csv,100938999
dbfs:/mnt/bike/201307-citibike-tripdata.csv,201307-citibike-tripdata.csv,164438561
dbfs:/mnt/bike/201308-citibike-tripdata.csv,201308-citibike-tripdata.csv,195523200
dbfs:/mnt/bike/201309-citibike-tripdata.csv,201309-citibike-tripdata.csv,201965642
dbfs:/mnt/bike/201310-citibike-tripdata.csv,201310-citibike-tripdata.csv,202728202
dbfs:/mnt/bike/201311-citibike-tripdata.csv,201311-citibike-tripdata.csv,131891356
dbfs:/mnt/bike/201312-citibike-tripdata.csv,201312-citibike-tripdata.csv,86622375
dbfs:/mnt/bike/201401-citibike-tripdata.csv,201401-citibike-tripdata.csv,58633836
dbfs:/mnt/bike/201402-citibike-tripdata.csv,201402-citibike-tripdata.csv,43899524
dbfs:/mnt/bike/201403-citibike-tripdata.csv,201403-citibike-tripdata.csv,85757372


In [4]:
from pyspark.sql.types import *

schema = StructType([
    StructField("tripduration", IntegerType()),
    StructField("starttime", DateType()),
    StructField("stoptime", DateType()),
    StructField("start station id", IntegerType()),
    StructField("start station name", StringType()),
    StructField("start station latitude", DecimalType(11, 9)),
    StructField("start station longitude", DecimalType(11, 9)),
    StructField("end station id", IntegerType()),
    StructField("end station name", StringType()),
    StructField("end station latitude", DecimalType(11, 9)),
    StructField("end station longitude", DecimalType(11, 9)),
    StructField("bikeid", IntegerType()),
    StructField("usertype", StringType()),
    StructField("birth year", IntegerType()),
    StructField("gender", IntegerType())
])

bike = sqlContext.read.format('csv').options(header='true', nullValue='\\N').schema(schema).load('/mnt/bike/')

display(bike)

tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
634,2013-07-01,2013-07-01,164,E 47 St & 2 Ave,40.75323098,-73.97032517,504,1 Ave & E 15 St,40.73221853,-73.98165557,16950,Customer,,0
1547,2013-07-01,2013-07-01,388,W 26 St & 10 Ave,40.749717753,-74.002950346,459,W 20 St & 11 Ave,40.746745,-74.007756,19816,Customer,,0
178,2013-07-01,2013-07-01,293,Lafayette St & E 8 St,40.73028666,-73.9907647,237,E 11 St & 2 Ave,40.73047309,-73.98672378,14548,Subscriber,1980.0,2
1580,2013-07-01,2013-07-01,531,Forsyth St & Broome St,40.71893904,-73.99266288,499,Broadway & W 60 St,40.76915505,-73.98191841,16063,Customer,,0
757,2013-07-01,2013-07-01,382,University Pl & E 14 St,40.73492695,-73.99200509,410,Suffolk St & Stanton St,40.72066442,-73.98517977,19213,Subscriber,1986.0,1
861,2013-07-01,2013-07-01,511,E 14 St & Avenue B,40.72938685,-73.97772429,454,E 51 St & 1 Ave,40.75455731,-73.96592976,16223,Subscriber,1988.0,1
550,2013-07-01,2013-07-01,293,Lafayette St & E 8 St,40.73028666,-73.9907647,394,E 9 St & Avenue C,40.72521311,-73.97768752,16746,Customer,,0
288,2013-07-01,2013-07-01,224,Spruce St & Nassau St,40.71146364,-74.00552427,376,John St & William St,40.70862144,-74.00722156,16062,Subscriber,1985.0,2
766,2013-07-01,2013-07-01,432,E 7 St & Avenue A,40.72621788,-73.98379855,336,Sullivan St & Washington Sq,40.73047747,-73.99906065,17963,Subscriber,1980.0,2
773,2013-07-01,2013-07-01,173,Broadway & W 49 St,40.76064679,-73.98442659,479,9 Ave & W 45 St,40.76019252,-73.9912551,19365,Subscriber,1989.0,1


In [5]:
from pyspark.sql.functions import mean, min, max

bike.select([mean('tripduration'), min('tripduration'), max('tripduration')]).show()

In [6]:
# Stolen blatentnly from https://stackoverflow.com/a/51633111/404006 
from pyspark import *
from pyspark.sql import functions as F
# Get people that are only 100 years old or younger and have specified their gender
filteredbike = bike.where((F.col("birth year") >= 1919) & (F.col("gender") > 0))
aggregatebike = filteredbike.groupBy("birth year", "gender").agg(F.mean("tripduration"), F.count("tripduration")).sort("birth year")
display(aggregatebike)

birth year,gender,avg(tripduration),count(tripduration)
1919,2,1703.76,75
1920,1,921.0215827338128,139
1920,2,1948.25,4
1921,1,377.5046979865772,745
1921,2,625.3592233009708,103
1922,1,610.0185950413223,484
1923,2,1324.7314814814815,108
1923,1,976.3149779735684,2270
1924,2,639.0823529411765,85
1924,1,490.8333333333333,456
