In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types
from pyspark.sql import functions as F
import pandas as pd
import sys
import os

In [3]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [4]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [5]:
schema = types.StructType([
    types.StructField('Affiliated_base_number', types.StringType(), True),
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True)
])

In [6]:
df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv('../data/fhv_tripdata_2019-10_new.csv.gz')

In [7]:
df.count()

1897493

In [8]:
df.show(10)

+----------------------+--------------------+-------------------+----------------+------------+------------+---------------+
|Affiliated_base_number|dispatching_base_num|    pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|        SR_Flag|
+----------------------+--------------------+-------------------+----------------+------------+------------+---------------+
|                B00009| 2019-10-01 00:23:00|2019-10-01 00:35:00|            null|         264|        null|         B00009|
|                B00013| 2019-10-01 00:11:29|2019-10-01 00:13:22|            null|         264|        null|         B00013|
|                B00014| 2019-10-01 00:11:43|2019-10-01 00:37:20|            null|         264|        null|         B00014|
|                B00014| 2019-10-01 00:56:29|2019-10-01 00:57:47|            null|         264|        null|         B00014|
|                B00014| 2019-10-01 00:23:09|2019-10-01 00:28:27|            null|         264|        null|         B00014|


In [9]:
df \
    .withColumn('pickup_date', F.to_date('pickup_datetime')) \
    .filter(F.col("pickup_date") == "2019-10-15") \
    .count()

62295

____

In [10]:
df_pandas = pd.read_csv('../data/fhv_tripdata_2019-10.csv')

In [11]:
df_pandas.shape

(1897493, 7)

In [12]:
assert df_pandas.shape[0] == df.count()

In [13]:
df_pandas.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2019-10-01 00:23:00,2019-10-01 00:35:00,264.0,264.0,,B00009
1,B00013,2019-10-01 00:11:29,2019-10-01 00:13:22,264.0,264.0,,B00013
2,B00014,2019-10-01 00:11:43,2019-10-01 00:37:20,264.0,264.0,,B00014
3,B00014,2019-10-01 00:56:29,2019-10-01 00:57:47,264.0,264.0,,B00014
4,B00014,2019-10-01 00:23:09,2019-10-01 00:28:27,264.0,264.0,,B00014


In [14]:
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1897493 entries, 0 to 1897492
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   dispatching_base_num    object 
 1   pickup_datetime         object 
 2   dropOff_datetime        object 
 3   PUlocationID            float64
 4   DOlocationID            float64
 5   SR_Flag                 float64
 6   Affiliated_base_number  object 
dtypes: float64(3), object(4)
memory usage: 101.3+ MB


In [15]:
df_pandas['pickup_datetime'] = pd.to_datetime(df_pandas['pickup_datetime'])
df_pandas['pickup_date'] = df_pandas['pickup_datetime'].dt.date.astype(str)

In [16]:
len(df_pandas[df_pandas['pickup_date'] == '2019-10-15'])

62610

----

In [17]:
df_pandas[['dispatching_base_num', 'pickup_date']].groupby(['pickup_date']).count()

Unnamed: 0_level_0,dispatching_base_num
pickup_date,Unnamed: 1_level_1
2019-10-01,59873
2019-10-02,68746
2019-10-03,71638
2019-10-04,68227
2019-10-05,52398
2019-10-06,45665
2019-10-07,66137
2019-10-08,64049
2019-10-09,60468
2019-10-10,68559


In [18]:
trip_counts_by_date = df \
                        .withColumn('pickup_date', F.to_date('pickup_datetime')) \
                        .groupBy("pickup_date").count().orderBy("pickup_date")

# Show the results
trip_counts_by_date.show(100)

+-----------+-----+
|pickup_date|count|
+-----------+-----+
| 2019-10-01|59167|
| 2019-10-02|68402|
| 2019-10-03|71304|
| 2019-10-04|67969|
| 2019-10-05|52365|
| 2019-10-06|45732|
| 2019-10-07|65826|
| 2019-10-08|63800|
| 2019-10-09|60191|
| 2019-10-10|68231|
| 2019-10-11|67348|
| 2019-10-12|51455|
| 2019-10-13|45960|
| 2019-10-14|52478|
| 2019-10-15|62295|
| 2019-10-16|67868|
| 2019-10-17|67267|
| 2019-10-18|68336|
| 2019-10-19|52611|
| 2019-10-20|48316|
| 2019-10-21|61102|
| 2019-10-22|62590|
| 2019-10-23|66164|
| 2019-10-24|67316|
| 2019-10-25|67349|
| 2019-10-26|52537|
| 2019-10-27|47942|
| 2019-10-28|63750|
| 2019-10-29|65068|
| 2019-10-30|66225|
| 2019-10-31|63869|
| 2019-11-01|  397|
| 2019-11-02|    1|
| 2019-11-03| 6247|
| 2019-11-04|    1|
| 2019-11-05|    1|
| 2019-11-07|    1|
| 2019-11-11|    1|
| 2019-11-24|    1|
| 2019-11-28|    1|
| 2019-12-08|    2|
| 2019-12-30|    1|
| 2020-10-18|    1|
| 2020-10-26|    1|
| 2027-10-01|    1|
| 2029-11-01|    1|
| 2091-10-11|    1|


---

In [19]:
schema = types.StructType([
    types.StructField('Affiliated_base_number', types.StringType(), True),
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True)
])

In [20]:
df_sp = spark.read \
    .option("header", "true") \
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") \
    .schema(schema) \
    .csv('../data/fhv_tripdata_2019-10.csv',)

In [21]:
df_sp.show(10)

+----------------------+--------------------+-------------------+----------------+------------+------------+---------------+
|Affiliated_base_number|dispatching_base_num|    pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|        SR_Flag|
+----------------------+--------------------+-------------------+----------------+------------+------------+---------------+
|                B00009| 2019-10-01 00:23:00|2019-10-01 00:35:00|            null|         264|        null|         B00009|
|                B00013| 2019-10-01 00:11:29|2019-10-01 00:13:22|            null|         264|        null|         B00013|
|                B00014| 2019-10-01 00:11:43|2019-10-01 00:37:20|            null|         264|        null|         B00014|
|                B00014| 2019-10-01 00:56:29|2019-10-01 00:57:47|            null|         264|        null|         B00014|
|                B00014| 2019-10-01 00:23:09|2019-10-01 00:28:27|            null|         264|        null|         B00014|


In [None]:
# from raw csv dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number

In [22]:
schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),  
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True),
    types.StructField('Affiliated_base_number', types.StringType(), True),  
])

# Read the CSV file
df_sp = spark.read \
    .option("header", "true") \
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") \
    .schema(schema) \
    .csv('../data/fhv_tripdata_2019-10.csv')

# Show the first 10 rows to verify the data
df_sp.show(10)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   null|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   null|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   null|                B00014|
|     B00021         |2019-10-01 00:00:4

In [23]:
df_sp \
    .withColumn("pickup_datetime", F.col("pickup_datetime").cast("string")) \
    .withColumn("pickup_date", F.col("pickup_datetime").substr(1, 10)) \
    .withColumn('pickup_date', F.to_date('pickup_datetime')) \
    .groupBy("pickup_date").count().orderBy("pickup_date") \
    .show(100)


+-----------+-----+
|pickup_date|count|
+-----------+-----+
| 2019-10-01|59873|
| 2019-10-02|68746|
| 2019-10-03|71638|
| 2019-10-04|68227|
| 2019-10-05|52398|
| 2019-10-06|45665|
| 2019-10-07|66137|
| 2019-10-08|64049|
| 2019-10-09|60468|
| 2019-10-10|68559|
| 2019-10-11|67715|
| 2019-10-12|51434|
| 2019-10-13|45900|
| 2019-10-14|52665|
| 2019-10-15|62610|
| 2019-10-16|68156|
| 2019-10-17|67656|
| 2019-10-18|68471|
| 2019-10-19|52530|
| 2019-10-20|48304|
| 2019-10-21|61381|
| 2019-10-22|62950|
| 2019-10-23|66429|
| 2019-10-24|67663|
| 2019-10-25|67648|
| 2019-10-26|52569|
| 2019-10-27|47708|
| 2019-10-28|64075|
| 2019-10-29|65393|
| 2019-10-30|66504|
| 2019-10-31|63972|
+-----------+-----+



In [24]:
df_sp \
    .withColumn('pickup_date', F.to_date('pickup_datetime')) \
    .filter(F.col("pickup_date") == "2019-10-15") \
    .count()

62610