In [75]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *
import os

In [76]:
spark = (
    SparkSession.builder
    .appName("Exercise6-Optimized")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)
spark

In [77]:
csv_files_path = "csv_files"
# Check if the folder exists
if not os.path.exists(csv_files_path):
    raise FileNotFoundError(f"Folder not found: {csv_files_path}")
else:
    print(f"Found CSV folder: {csv_files_path}")

Found CSV folder: csv_files


In [78]:
df = spark.read.csv("csv_files/Divvy_Trips_2019_Q4.csv",header=True,inferSchema=True)
print("Data loaded successfully!")

[Stage 66:>                                                         (0 + 4) / 4]

Data loaded successfully!


                                                                                

In [79]:
df.count()

704054

In [80]:
df.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|       940.0|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|       258.0|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|
|25223642|2019-10-01 00:04:32|2019-10-01 00:18:43|  3003|       850.0|             84|Milwaukee Ave & G...|          199|Wabash Ave & Gran...|Sub

In [81]:
df.write.partitionBy("from_station_name").mode("overwrite").csv("partition_files")

                                                                                

In [82]:
df.rdd.getNumPartitions()

4

In [83]:
num_partitions = len(df.rdd.glom().collect())
print("Number of partitions:", num_partitions)

                                                                                

Number of partitions: 4


In [84]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [85]:
path = "partition_files"

fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
p = sc._jvm.org.apache.hadoop.fs.Path(path)

# Get all subdirectories (i.e. partitions)
statuses = fs.listStatus(p)

partition_dirs = [f.getPath().getName() for f in statuses if f.isDirectory()]

print(f"Total number of partitions: {len(partition_dirs)}")

Total number of partitions: 610


In [86]:
df1 = spark.read.csv("csv_files/Divvy_Trips_2020_Q1.csv",header=True,inferSchema=True)
print("Data loaded successfully!")

Data loaded successfully!


                                                                                

In [87]:
df.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|       940.0|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|       258.0|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|
|25223642|2019-10-01 00:04:32|2019-10-01 00:18:43|  3003|       850.0|             84|Milwaukee Ave & G...|          199|Wabash Ave & Gran...|Sub

In [88]:
df1.show()

+----------------+-------------+-------------------+-------------------+--------------------+----------------+--------------------+--------------+---------+---------+-------+--------+-------------+
|         ride_id|rideable_type|         started_at|           ended_at|  start_station_name|start_station_id|    end_station_name|end_station_id|start_lat|start_lng|end_lat| end_lng|member_casual|
+----------------+-------------+-------------------+-------------------+--------------------+----------------+--------------------+--------------+---------+---------+-------+--------+-------------+
|EACB19130B0CDA4A|  docked_bike|2020-01-21 20:06:59|2020-01-21 20:14:30|Western Ave & Lel...|             239|Clark St & Leland...|           326|  41.9665| -87.6884|41.9671|-87.6674|       member|
|8FED874C809DC021|  docked_bike|2020-01-30 14:22:39|2020-01-30 14:26:22|Clark St & Montro...|             234|Southport Ave & I...|           318|  41.9616|  -87.666|41.9542|-87.6644|       member|
|789F3C21E

In [89]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [90]:
df.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|       940.0|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|       258.0|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|
|25223642|2019-10-01 00:04:32|2019-10-01 00:18:43|  3003|       850.0|             84|Milwaukee Ave & G...|          199|Wabash Ave & Gran...|Sub

In [91]:
# def average_trip_duration_per_day(df):
#     result = df.groupBy("tripduration").agg(F.avg("tripduration").alias("avg_trip_duration"))
#     return result

# avg_trip = average_trip_duration_per_day(df)
# avg_trip.show(10)


In [92]:
df.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|       940.0|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|       258.0|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|
|25223642|2019-10-01 00:04:32|2019-10-01 00:18:43|  3003|       850.0|             84|Milwaukee Ave & G...|          199|Wabash Ave & Gran...|Sub

### Q1. What is the `average` trip duration per day?

In [93]:
df_cleaned = df.withColumn("tripduration", regexp_replace(df["tripduration"], "[,]", ""))

In [94]:
df_cleaned.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|       940.0|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|       258.0|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|
|25223642|2019-10-01 00:04:32|2019-10-01 00:18:43|  3003|       850.0|             84|Milwaukee Ave & G...|          199|Wabash Ave & Gran...|Sub

In [95]:
print(df_cleaned.dtypes)

[('trip_id', 'int'), ('start_time', 'timestamp'), ('end_time', 'timestamp'), ('bikeid', 'int'), ('tripduration', 'string'), ('from_station_id', 'int'), ('from_station_name', 'string'), ('to_station_id', 'int'), ('to_station_name', 'string'), ('usertype', 'string'), ('gender', 'string'), ('birthyear', 'int')]


In [96]:
df_cleaned = df_cleaned.withColumn("tripduration", F.col("tripduration").cast(DoubleType()).cast(IntegerType()))

In [97]:
df_cleaned.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|         940|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|         258|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|
|25223642|2019-10-01 00:04:32|2019-10-01 00:18:43|  3003|         850|             84|Milwaukee Ave & G...|          199|Wabash Ave & Gran...|Sub

In [98]:
df_update_date = df.withColumn("date", to_date(col("date")))

{"ts": "2025-11-04 19:09:28.351", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `date` cannot be resolved. Did you mean one of the following? [`bikeid`, `end_time`, `gender`, `usertype`, `birthyear`]. SQLSTATE: 42703", "context": {"file": "line 1 in cell [98]", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o2805.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `date` cannot be resolved. Did you mean one of the following? [`bikeid`, `end_time`, `gender`, `usertype`, `birthyear`]. SQLSTATE: 42703;\n'Project [trip_id#1475, start_time#1476, end_time#1477, bikeid#1478, tripduration#1479, from_station_id#1480, from_station_name#1481, to_station_id#1482, to_station_name#1483, us

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `date` cannot be resolved. Did you mean one of the following? [`bikeid`, `end_time`, `gender`, `usertype`, `birthyear`]. SQLSTATE: 42703;
'Project [trip_id#1475, start_time#1476, end_time#1477, bikeid#1478, tripduration#1479, from_station_id#1480, from_station_name#1481, to_station_id#1482, to_station_name#1483, usertype#1484, gender#1485, birthyear#1486, 'to_date('date) AS date#1909]
+- Relation [trip_id#1475,start_time#1476,end_time#1477,bikeid#1478,tripduration#1479,from_station_id#1480,from_station_name#1481,to_station_id#1482,to_station_name#1483,usertype#1484,gender#1485,birthyear#1486] csv
