In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Service-account key file used by the GCS connector for authentication.
# NOTE(review): absolute, user-specific path; consider reading it from an
# environment variable so the notebook runs on other machines.
credentials_location = '/home/datatalks_jan/.google/credentials/google_credentials.json'

# Local Spark on all cores, with the GCS connector jar on the classpath and
# service-account authentication switched on.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('test_again')
    .set("spark.jars", "./lib/gcs-connector-hadoop3-2.2.5.jar")
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

# getOrCreate() reuses a SparkContext that is already running in this kernel,
# so re-running the cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Register the gs:// filesystem implementations and credentials with Hadoop.
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

# Build the session on top of the context configured above.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

# Smoke test: check that parquet files can be read from the bucket.
df_green = spark.read.parquet('gs://nyc-tlc-backup/pq/green/*/*')

24/03/14 17:18:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

In [2]:
# Print the first rows of the green-taxi DataFrame to confirm the GCS read works.
df_green.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2020-01-23 13:10:15|  2020-01-23 13:38:16|                 N|         1|          74|         130|              1|        12.77|       36.0|  0.0|    0.

                                                                                

In [10]:
from pyspark.sql import types
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType

# Explicit schema for the trip CSV files; every column is declared nullable.
schema = StructType([
    StructField('hvfhs_license_num', StringType(), True),
    StructField('dispatching_base_num', StringType(), True),
    StructField('pickup_datetime', TimestampType(), True),
    StructField('dropoff_datetime', TimestampType(), True),
    StructField('PULocationID', IntegerType(), True),
    StructField('DOLocationID', IntegerType(), True),
    StructField('SR_Flag', StringType(), True),
])

In [12]:
# Load every head_*.csv file, treating the first line as a header and
# applying the explicit schema instead of inferring column types.
raw_data = (
    spark.read
    .option("header", True)
    .csv(
        "/home/datatalks_jan/Data_Eden/8_pySpark_pilot/head_*.csv",
        schema=schema,
    )
)

In [13]:
# Print the column names and types, then count the rows that were loaded.
raw_data.printSchema()
raw_data.count()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



                                                                                

1002

In [14]:
# Keep only rows whose licence number is not the literal marker "BAD",
# then count what is left.
data_without_bad = raw_data.filter(raw_data["hvfhs_license_num"] != "BAD")
data_without_bad.count()

1001

In [15]:
from pyspark.sql.functions import col, to_date

# NOTE(review): despite the variable name, this only removes rows whose
# pickup_datetime equals exactly 2021-01-01 00:21:08; no date-range filter
# is applied. Confirm this is the intended behaviour.
data_older_than_summer_2021 = data_without_bad.where(
    col("pickup_datetime") != "2021-01-01 00:21:08"
)
data_older_than_summer_2021.count()


1000

In [16]:
from pyspark.sql import functions as F

# Reduce both timestamp columns to calendar dates (column names are kept)
# and tag every row with a constant 'operator' value.
data_clean = (
    data_older_than_summer_2021
    .withColumn('pickup_datetime', F.to_date(F.col('pickup_datetime')))
    .withColumn('dropoff_datetime', F.to_date(F.col('dropoff_datetime')))
    .withColumn('operator', F.lit('ZHE'))
)

In [17]:
# Preview the cleaned rows, including the new 'operator' column.
data_clean.show()

+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|hvfhs_license_num|dispatching_base_num|pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|SR_Flag|operator|
+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         230|         166|   null|     ZHE|
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         152|         167|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         233|         142|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         142|         143|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         143|          78|   null|     ZHE|
|           HV0005|              B02510|     2021-01-01|      20

In [18]:
# Register the cleaned DataFrame as the temp view 'etl_1' so it can be queried with spark.sql.
data_clean.createOrReplaceTempView('etl_1')

In [19]:
# Select every column from the 'etl_1' view for licence number HV0003.
# The `1=1` predicate is always true; it only lets each real condition start with AND.
final_df = spark.sql("""
SELECT
    *
FROM
    etl_1
WHERE 1=1
AND hvfhs_license_num = 'HV0003'
""")

In [20]:
# Preview the rows returned by the SQL query.
final_df.show()

+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|hvfhs_license_num|dispatching_base_num|pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|SR_Flag|operator|
+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         230|         166|   null|     ZHE|
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         152|         167|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         233|         142|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         142|         143|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         143|          78|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      20

In [21]:
# Write the query result to GCS as parquet, replacing any previous output
# at that location.
(
    final_df.write
    .mode("overwrite")
    .parquet("gs://nyc-tlc-backup/test_output")
)

                                                                                

In [240]:
# Only one SparkSession/SparkContext can run locally at a time, so stop the current one before creating a new one.
#spark.stop()
#sc.stop()
#spark.sparkContext.stop()