In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, DateType
from pyspark.sql.functions import expr,count
import time, random

spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("estaciones_viajes_notebook") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "hdfs:///spark/logs/history") \
    .config("spark.history.fs.logDirectory", "hdfs:///spark/logs/history") \
    .getOrCreate()

sc = spark.sparkContext

In [37]:
station_schema = StructType([
    StructField("station_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("long", DoubleType(), True),
    StructField("dockcount", IntegerType(), True),
    StructField("landmark", StringType(), True),
    StructField("installation", DateType(), True)
])

trip_schema = StructType([
   StructField("Trip Id", IntegerType(), True),
   StructField("Duration", IntegerType(), True),
   StructField("Start Date", DateType(), True),
   StructField("Start Station", StringType(), True),
   StructField("Start Terminal", IntegerType(), True),
   StructField("End Date", DateType(), True),
   StructField("End Station", StringType(), True),
   StructField("End Terminal", IntegerType(), True),
   StructField("Bike", IntegerType(), True),
   StructField("Subscriber Type", StringType(), True),
   StructField("Zip Code", StringType(), True)
])

In [38]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

stations_df = spark.read.csv("/user/jovyan/data/bike-data/201508_station_data.csv", header=True, schema=station_schema,dateFormat="M/D/yyyy")
trips_df = spark.read.csv("/user/jovyan/data/bike-data/201508_trip_data.csv", header=True, schema=trip_schema,dateFormat="M/D/yyyy")

stations_df.show(10)
trips_df.show(10)

+----------+--------------------+---------+-----------+---------+--------+------------+
|station_id|                name|      lat|       long|dockcount|landmark|installation|
+----------+--------------------+---------+-----------+---------+--------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|San Jose|  2013-01-06|
|         3|San Jose Civic Ce...|37.330698|-121.888979|       15|San Jose|  2013-01-05|
|         4|Santa Clara at Al...|37.333988|-121.894902|       11|San Jose|  2013-01-06|
|         5|    Adobe on Almaden|37.331415|  -121.8932|       19|San Jose|  2013-01-05|
|         6|    San Pedro Square|37.336721|-121.894074|       15|San Jose|  2013-01-07|
|         7|Paseo de San Antonio|37.333798|-121.886943|       15|San Jose|  2013-01-07|
|         8| San Salvador at 1st|37.330165|-121.885831|       15|San Jose|  2013-01-05|
|         9|           Japantown|37.348742|-121.894715|       15|San Jose|  2013-01-05|
|        10|  San Jose City Hall

In [26]:
# Contar cuántos viajes empiezan en cada estación
start_counts = trips_df.groupBy("Start Station").agg(count("*").alias("Numero_viajes_empiezan"))

# Contar cuántos viajes terminan en cada estación
end_counts = trips_df.groupBy("End Station").agg(count("*").alias("Numero_viajes_acaban"))

In [34]:
aux_df = stations_df.join(start_counts, stations_df["name"] == start_counts["Start Station"], "left")
aux_df.show()
join_df = aux_df.join(end_counts, stations_df["name"] == end_counts["End Station"], "left")
join_df.show()
join_df.printSchema()
result_df = join_df.selectExpr("name as Estacion", "lat as Latitud", "long as Longitud", "Numero_viajes_empiezan", "Numero_viajes_acaban")
result_df.show()

                                                                                

+----------+--------------------+---------+-----------+---------+-------------+------------+--------------------+----------------------+
|station_id|                name|      lat|       long|dockcount|     landmark|installation|       Start Station|Numero_viajes_empiezan|
+----------+--------------------+---------+-----------+---------+-------------+------------+--------------------+----------------------+
|        11|         MLK Library|37.335885| -121.88566|       19|     San Jose|  2013-01-06|         MLK Library|                  1099|
|         9|           Japantown|37.348742|-121.894715|       15|     San Jose|  2013-01-05|           Japantown|                   885|
|         7|Paseo de San Antonio|37.333798|-121.886943|       15|     San Jose|  2013-01-07|Paseo de San Antonio|                   856|
|         8| San Salvador at 1st|37.330165|-121.885831|       15|     San Jose|  2013-01-05| San Salvador at 1st|                   495|
|         2|San Jose Diridon ...|37.32973

                                                                                

+----------+--------------------+---------+-----------+---------+-------------+------------+--------------------+----------------------+--------------------+--------------------+
|station_id|                name|      lat|       long|dockcount|     landmark|installation|       Start Station|Numero_viajes_empiezan|         End Station|Numero_viajes_acaban|
+----------+--------------------+---------+-----------+---------+-------------+------------+--------------------+----------------------+--------------------+--------------------+
|        11|         MLK Library|37.335885| -121.88566|       19|     San Jose|  2013-01-06|         MLK Library|                  1099|         MLK Library|                 960|
|         9|           Japantown|37.348742|-121.894715|       15|     San Jose|  2013-01-05|           Japantown|                   885|           Japantown|                 976|
|         7|Paseo de San Antonio|37.333798|-121.886943|       15|     San Jose|  2013-01-07|Paseo de San 

                                                                                

+--------------------+---------+-----------+----------------------+--------------------+
|            Estacion|  Latitud|   Longitud|Numero_viajes_empiezan|Numero_viajes_acaban|
+--------------------+---------+-----------+----------------------+--------------------+
|         MLK Library|37.335885| -121.88566|                  1099|                 960|
|           Japantown|37.348742|-121.894715|                   885|                 976|
|Paseo de San Antonio|37.333798|-121.886943|                   856|                1073|
| San Salvador at 1st|37.330165|-121.885831|                   495|                 547|
|San Jose Diridon ...|37.329732|-121.901782|                  4968|                5045|
|San Jose Civic Ce...|37.330698|-121.888979|                   774|                 909|
|SJSU - San Salvad...|37.333955|-121.877349|                   494|                 445|
|Mountain View Cit...|37.389218|-122.081896|                  1583|                1724|
|Redwood City Calt...

In [35]:
result_df.write.mode("overwrite").csv("/user/jovyan/data/salida245", header=True)

                                                                                