In [1]:
from pyspark import SparkContext
sc = SparkContext("local", "Simple App")

sc

## Initialisieren des SQLContext Objektes und laden der Uber Daten

In [3]:
sqlContext = SQLContext(sc)

In [4]:
df = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true') \
    .load('data/Uber-Jan-Feb-FOIL.csv')

df

DataFrame[dispatching_base_number: string, date: string, active_vehicles: int, trips: int]

## Ausgabe des automatisch generierten Schemas

In [6]:
df.printSchema()

root
 |-- dispatching_base_number: string (nullable = true)
 |-- date: string (nullable = true)
 |-- active_vehicles: integer (nullable = true)
 |-- trips: integer (nullable = true)



## Mit DataFrames arbeiten

In [78]:
df.corr(col1= 'active_vehicles', col2 = 'trips')

0.9804925397246566

In [79]:
df.approxQuantile(['active_vehicles', 'trips'], [0.01, 0.5, 0.99], 0.05)

[[112.0, 1072.0, 4395.0], [629.0, 9546.0, 45858.0]]

In [82]:
df.groupBy('dispatching_base_number').max('active_vehicles').show()

+-----------------------+--------------------+
|dispatching_base_number|max(active_vehicles)|
+-----------------------+--------------------+
|                 B02512|                 281|
|                 B02598|                1216|
|                 B02682|                1523|
|                 B02765|                 786|
|                 B02617|                1590|
|                 B02764|                4395|
+-----------------------+--------------------+



In [74]:
df.select("dispatching_base_number").distinct().show()

+-----------------------+
|dispatching_base_number|
+-----------------------+
|                 B02512|
|                 B02598|
|                 B02682|
|                 B02765|
|                 B02617|
|                 B02764|
+-----------------------+



In [89]:
data_per_day = df.groupBy('date').sum('active_vehicles', 'trips')
data_per_day.orderBy('date').show()

+---------+--------------------+----------+
|     date|sum(active_vehicles)|sum(trips)|
+---------+--------------------+----------+
| 1/1/2015|                6885|     56437|
|1/10/2015|                7346|     78484|
|1/11/2015|                6571|     57026|
|1/12/2015|                7364|     57646|
|1/13/2015|                7559|     63499|
|1/14/2015|                7849|     63329|
|1/15/2015|                8080|     66729|
|1/16/2015|                8273|     72480|
|1/17/2015|                7527|     73363|
|1/18/2015|                6863|     72473|
|1/19/2015|                5945|     45755|
| 1/2/2015|                6330|     39189|
|1/20/2015|                7592|     54772|
|1/21/2015|                7948|     59147|
|1/22/2015|                8267|     65602|
|1/23/2015|                8490|     71819|
|1/24/2015|                7643|     80709|
|1/25/2015|                6787|     57895|
|1/26/2015|                6533|     43395|
|1/27/2015|                3496|

In [90]:
data_per_day.approxQuantile(['sum(active_vehicles)', 'sum(trips)'], [0.01, 0.5, 0.99], 0.05)

[[3496.0, 7999.0, 9649.0], [25244.0, 70296.0, 100915.0]]

## Mit SparkSQL arbeiten

In [7]:
df.registerTempTable("uber")

In [70]:
distinct_bases = sqlContext.sql("select distinct dispatching_base_number from uber")
distinct_bases.show()

+-----------------------+
|dispatching_base_number|
+-----------------------+
|                 B02512|
|                 B02598|
|                 B02682|
|                 B02765|
|                 B02617|
|                 B02764|
+-----------------------+



In [31]:
as_df = sqlContext.sql("select * from uber").toDF('dispatching_base_number', 'date', 'active_vehicles', 'trips')

# as_df.groupBy('dispatching_base_number')

In [76]:
as_df.sample(fraction = 0.05, withReplacement = False).show()

+-----------------------+---------+---------------+-----+
|dispatching_base_number|     date|active_vehicles|trips|
+-----------------------+---------+---------------+-----+
|                 B02598| 1/2/2015|            785| 4768|
|                 B02598| 1/6/2015|            933| 6816|
|                 B02512| 1/8/2015|            238| 1772|
|                 B02598|1/10/2015|            949|10287|
|                 B02598|1/28/2015|           1011| 8071|
|                 B02682|1/29/2015|           1316|11485|
|                 B02512|1/30/2015|            256| 2016|
|                 B02682| 2/2/2015|           1152|11981|
|                 B02765| 2/2/2015|            275| 2607|
|                 B02512| 2/2/2015|            227| 1904|
|                 B02682| 2/7/2015|           1300|13450|
|                 B02512|2/11/2015|            255| 1831|
|                 B02598|2/25/2015|           1076| 9405|
|                 B02682|2/27/2015|           1510|14975|
+-------------

In [11]:
sqlContext.sql("""select dispatching_base_number, sum(`trips`) as cnt 
                                from uber group by `dispatching_base_number` 
                                order by cnt desc""").show()

+-----------------------+-------+
|dispatching_base_number|    cnt|
+-----------------------+-------+
|                 B02764|1914449|
|                 B02617| 725025|
|                 B02682| 662509|
|                 B02598| 540791|
|                 B02765| 193670|
|                 B02512|  93786|
+-----------------------+-------+



In [12]:
sqlContext.sql("""select `date`, sum(`trips`) as cnt 
                    from uber 
                    group by `date` 
                    order by cnt desc 
                    limit 5""").show()

+---------+------+
|     date|   cnt|
+---------+------+
|2/20/2015|100915|
|2/14/2015|100345|
|2/21/2015| 98380|
|2/13/2015| 98024|
|1/31/2015| 92257|
+---------+------+



In [None]:
sqlContext.sql("""select `date`, sum(`trips`) as cnt 
                    from uber 
                    group by `date` 
                    order by cnt desc 
                    limit 5""").show()

In [None]:
sc.stop()