In [217]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# Service-account key used to authenticate against Google Cloud Storage.
# GOOGLE_APPLICATION_CREDENTIALS takes precedence when it is set; the original
# hardcoded location is kept as the fallback so existing runs behave the same.
credentials_location = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    '/home/datatalks_jan/.google/credentials/google_credentials.json',
)

# Resolve the GCS connector jar against the current working directory and fail
# fast when it is missing. A jar that cannot be found does not stop the session
# from starting; the problem only surfaces later as
# "ClassNotFoundException: com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"
# on the first gs:// read or write.
gcs_connector_jar = os.path.abspath("./lib/gcs-connector-hadoop3-2.2.5.jar")
if not os.path.isfile(gcs_connector_jar):
    raise FileNotFoundError(
        "GCS connector jar not found at " + gcs_connector_jar
        + "; download it or start the notebook from the project directory."
    )

# NOTE(review): getOrCreate() returns an already-running session if one exists
# in this kernel, in which case "spark.jars" presumably is not re-applied --
# restart the kernel (or call spark.stop()) after changing the jar settings.
spark = (
    SparkSession.builder
    .appName('test')
    .master("local[*]")
    .config("spark.jars", gcs_connector_jar)
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
    .getOrCreate()
)

sc = spark.sparkContext

# Register the gs:// filesystem implementations and the service-account
# credentials on the live Hadoop configuration of this context.
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

In [218]:
# Explicit schema for the trip CSV files; every column is declared nullable.
schema = (
    StructType()
    .add('hvfhs_license_num', StringType(), True)
    .add('dispatching_base_num', StringType(), True)
    .add('pickup_datetime', TimestampType(), True)
    .add('dropoff_datetime', TimestampType(), True)
    .add('PULocationID', IntegerType(), True)
    .add('DOLocationID', IntegerType(), True)
    .add('SR_Flag', StringType(), True)
)

In [219]:
# Load every file matching head_*.csv with the explicit schema; the first line
# of each file is treated as a header.
raw_data = spark.read.csv(
    "/home/datatalks_jan/Data_Eden/8_pySpark_pilot/head_*.csv",
    schema=schema,
    header=True,
)

In [220]:
# Print the schema applied to the loaded data, then count the loaded rows.
raw_data.printSchema()
raw_data.count()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



1002

In [221]:
# Keep only rows whose licence number differs from the literal marker "BAD",
# then count what is left.
data_without_bad = raw_data.where(raw_data["hvfhs_license_num"] != "BAD")
data_without_bad.count()

1001

In [222]:
from pyspark.sql.functions import col, to_date

# Drop rows whose pickup_datetime equals one specific timestamp, then count the rest.
# NOTE(review): the variable name suggests a date-range filter, but the condition
# only excludes this single exact value -- confirm which behaviour is intended.
data_older_than_summer_2021 = data_without_bad.filter(col("pickup_datetime") != "2021-01-01 00:21:08")
data_older_than_summer_2021.count()


1000

In [223]:
from pyspark.sql import functions as F

# Convert both timestamp columns to dates and tag every row with a constant
# 'operator' value.
data_clean = (
    data_older_than_summer_2021
    .withColumn('pickup_datetime', F.to_date(F.col('pickup_datetime')))
    .withColumn('dropoff_datetime', F.to_date(F.col('dropoff_datetime')))
    .withColumn('operator', F.lit('ZHE'))
)

In [224]:
# Preview the cleaned rows, including the new 'operator' column.
data_clean.show()

+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|hvfhs_license_num|dispatching_base_num|pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|SR_Flag|operator|
+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         230|         166|   null|     ZHE|
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         152|         167|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         233|         142|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         142|         143|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         143|          78|   null|     ZHE|
|           HV0005|              B02510|     2021-01-01|      20

In [225]:
# Register the cleaned DataFrame as the temporary view 'etl_1' so it can be
# queried through spark.sql.
data_clean.createOrReplaceTempView('etl_1')

In [226]:
# Select every column of etl_1 for rows whose licence number is 'HV0003'.
# The leading "1=1" is always true and only anchors the following AND clauses.
final_df = spark.sql("""
SELECT
    *
FROM
    etl_1
WHERE 1=1
AND hvfhs_license_num = 'HV0003'
""")

In [227]:
# Preview the rows returned by the SQL filter.
final_df.show()

+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|hvfhs_license_num|dispatching_base_num|pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|SR_Flag|operator|
+-----------------+--------------------+---------------+----------------+------------+------------+-------+--------+
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         230|         166|   null|     ZHE|
|           HV0003|              B02682|     2021-01-01|      2021-01-01|         152|         167|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         233|         142|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         142|         143|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      2021-01-01|         143|          78|   null|     ZHE|
|           HV0003|              B02764|     2021-01-01|      20

In [228]:
# Write the filtered result to the GCS bucket as Parquet, replacing any output
# already at that path. Requires the GCS connector jar on the Spark classpath.
final_df.write.mode("overwrite").parquet("gs://nyc-tlc-backup/test_output")

Py4JJavaError: An error occurred while calling o1127.parquet.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2688)
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWritingFileFormat(DataSource.scala:454)
	at org.apache.spark.sql.execution.datasources.DataSource.planForWriting(DataSource.scala:530)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:789)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: Class com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem not found
	at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2592)
	at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2686)
	... 25 more


In [215]:
# spark.stop()  # uncomment to shut the session down, e.g. before re-creating it with different jar settings

In [216]:
# sc.stop()  # stops only the SparkContext; spark.stop() already stops the underlying context