# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType
from pyspark.sql import functions as sf

# Report the 100 highest-earning taxi drivers from a CSV export of trips.
#
# The script reads the trips file with an explicit schema, sums `cost` per
# `driver_name`, keeps the 100 largest totals, prints them as a table and then
# prints the same rows as a Python list of Row objects.

# Location of the trips CSV; the first line is treated as a header.
path = "/sparkdata/taxi.csv"

# Explicit schema, so column types do not depend on inference.
# NOTE(review): the trip fields are declared nullable=False; confirm whether
# the CSV reader actually enforces this for the input data.
schema = StructType([
    StructField('driver_name', StringType(), False),
    StructField('client_name', StringType(), False),
    StructField('start_point', StringType(), False),
    StructField('end_point', StringType(), False),
    StructField('start_time', TimestampType(), False),
    StructField('end_time', TimestampType(), False),
    StructField('cost', DoubleType(), False),
    StructField('driver_rating', IntegerType(), True),
    StructField('driver_feedback', StringType(), True),
    StructField('driver_comment', StringType(), True),
    StructField('client_rating', IntegerType(), True),
    StructField('client_feedback', StringType(), True),
])

spark = SparkSession.builder.appName("Lab5").getOrCreate()
spark.sparkContext.setLogLevel("OFF")

# Everything that can fail at runtime (missing file, malformed rows, executor
# errors) runs inside try/finally so the session is always shut down.
try:
    df = spark.read.option("header", True).schema(schema).csv(path)

    # Total income per driver: only the name and the trip cost are needed.
    driver_income = (
        df.select("driver_name", "cost")
        .groupBy("driver_name")
        .agg(sf.sum("cost").alias("income"))
    )

    # Two actions follow (show + collect). Caching the small 100-row result
    # lets the second action reuse it instead of re-reading the CSV and
    # re-running the aggregation and sort, and keeps both outputs consistent
    # when several drivers tie on income.
    top_drivers = (
        driver_income.orderBy(sf.col("income").desc()).limit(100).cache()
    )

    # show() prints only the first 20 rows by default; the full list of up to
    # 100 rows is printed by the collect() below.
    top_drivers.show()

    print(top_drivers.collect())
finally:
    spark.stop()