In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *
import os


In [None]:
spark = (
    SparkSession.builder
    .appName("Exercise6-Optimized")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)
spark


In [None]:
csv_files_path = "csv_files"
# Check if the folder exists
if not os.path.exists(csv_files_path):
    raise FileNotFoundError(f"Folder not found: {csv_files_path}")
else:
    print(f"Found CSV folder: {csv_files_path}")


In [None]:
csv_files_path = "csv_files"
# Check if the folder exists
if not os.path.exists(csv_files_path):
    raise FileNotFoundError(f"Folder not found: {csv_files_path}")
else:
    print(f"Found CSV folder: {csv_files_path}")


In [None]:
df = spark.read.csv("csv_files/Divvy_Trips_2019_Q4.csv",header=True,inferSchema=True)
print("Data loaded successfully!")

In [None]:
df.count()

In [None]:
df.show()

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [None]:
df1 = spark.read.csv("csv_files/Divvy_Trips_2020_Q1.csv",header=True,inferSchema=True)
print("Data loaded successfully!")

In [None]:
df.show()

In [None]:
df1.show()

### Questions for Divvy_Trips_2019_Q4.csv file

### Q1. What are the `average` trip duration per day?

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import *

In [None]:
def average_trip_duration_per_day(df):
    df_cleaned = df.withColumn("tripduration",regexp_replace(col("tripduration"), ",", "").cast(DoubleType()).cast(IntegerType()))

    df_with_date = df_cleaned.withColumn("date",to_date(col("start_time")))

    df_avg_per_day = df_with_date.groupBy("date").agg(avg("tripduration").alias("avg_trip_duration_raw"))
    
    df_rounded = df_avg_per_day.withColumn("avg_trip_duration",round(col("avg_trip_duration_raw"), 2)).drop("avg_trip_duration_raw")

    return df_rounded


In [None]:
result = average_trip_duration_per_day(df)
result.show()

In [None]:
result.write.mode("overwrite").option("header", "True").csv("reports/average_trips_per_day")

### Q2. How many trips were taken per day?

In [None]:
df.show()

In [None]:
def total_trips_per_day(df):
    df_with_date = df.withColumn("date", to_date(col("start_time")))

    df_total_per_day = df_with_date.groupBy("date") \
                        .agg(count("trip_id").alias("total_trips_per_day")) \
                        .orderBy("date")
    df_total_per_day.show()
    return df_total_per_day


In [None]:
result = total_trips_per_day(df)
result.show()

In [None]:
result.write.mode("overwrite").option("header", "True").csv("reports/total_trips_per_day")

### Q3. What was the most popular starting trip station for each month?

In [None]:
df.show()

In [None]:
def most_popular_station_each_month(df):
    df_month = df.withColumn("month", month(col("start_time")))
    df_count = df_month.groupBy("month", "from_station_name").agg(count("*").alias("trip_count"))
    window = Window.partitionBy("month").orderBy(col("trip_count").desc())
    df_ranked = df_count.withColumn("rank", row_number().over(window))
    df_most_popular = df_ranked.filter(col("rank") == 1).orderBy("month")
    df_most_popular.show()
    return df_most_popular

In [None]:
result = most_popular_station_each_month(df)
result.show()

In [None]:
df.show()

### Q4. What were the top 3 trip stations each day for the last two weeks?

Here is the logic you should use in PySpark:

Step-by-step approach

1. Clean tripduration (if required)

2. Extract date

3. Filter only last 14 days

4. Group by date + start_station

5. rank them

6. take top 3 per day

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

def top_3_trip_count(df):
    df_cleaned = df.withColumn("tripduration",regexp_replace(col("tripduration"), ",", "").cast(DoubleType()).cast("int"))
    df_with_date = df_cleaned.withColumn("date", to_date(col("start_time")))
    max_date = df_with_date.agg(F.max("date")).first()[0]
    last_14 = df_with_date.filter(col("date") > F.date_sub(F.lit(max_date), 14))
    daily_counts = last_14.groupBy("date", "from_station_name").agg(F.count("*").alias("trip_count"))
    window_func = Window.partitionBy("date").orderBy(col("trip_count").desc())
    top3 = daily_counts.select('*', rank().over(window_func).alias('rank')).filter(col('rank') <= 3)
    return top3


In [102]:
result_top3 = top_3_trip_count(df)
result_top3.show()

[Stage 172:>                                                        (0 + 4) / 4]

+----------+--------------------+----------+----+
|      date|   from_station_name|trip_count|rank|
+----------+--------------------+----------+----+
|2019-12-18| Canal St & Adams St|       123|   1|
|2019-12-18|Clinton St & Madi...|       115|   2|
|2019-12-18|Clinton St & Wash...|        94|   3|
|2019-12-19| Canal St & Adams St|       133|   1|
|2019-12-19|Clinton St & Madi...|       123|   2|
|2019-12-19|Clinton St & Wash...|        95|   3|
|2019-12-20| Canal St & Adams St|       131|   1|
|2019-12-20|Clinton St & Wash...|       109|   2|
|2019-12-20|Clinton St & Madi...|        94|   3|
|2019-12-21|Streeter Dr & Gra...|        63|   1|
|2019-12-21|Kingsbury St & Ki...|        47|   2|
|2019-12-21|Wells St & Concor...|        46|   3|
|2019-12-22|      Shedd Aquarium|        87|   1|
|2019-12-22|Lake Shore Dr & M...|        79|   2|
|2019-12-22|Streeter Dr & Gra...|        70|   3|
|2019-12-23| Canal St & Adams St|       109|   1|
|2019-12-23|Clinton St & Madi...|        87|   2|


                                                                                

In [None]:
df.printSchema()

### Q5. Do `Male's` or `Female's` take longer trips on average?

In [None]:
df_result = df.filter((F.col("gender").isNotNull()))\
    .groupBy("gender") \
    .agg(F.count("*").alias("Total Counts"))

In [None]:
df_result.show()

### Q6. What is the top 10 ages of those that take the longest trips, and shortest?

In [None]:
df = df.withColumn("birth_date",F.to_date(F.concat(F.lit("01-01-"), F.col("birthyear")), "dd-MM-yyyy"))

In [None]:
df = df.withColumn("age",floor(datediff(current_date(), col("birth_date")) / 365.25))
df.show()

In [101]:
df.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+----------+----+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|birth_date| age|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+----------+----+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|       940.0|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|1987-01-01|  38|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|       258.0|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|1998-01-01|  27|
|25223642|2019-10-01 00:04:32|2019-10-01 00:18:43|  3003|       8

In [112]:
df = df.withColumn("tripduration", regexp_replace(col("tripduration"), ",", "").cast("double").cast("int"))
df = df.withColumn("date", to_date(col("start_time")))

In [115]:
df.show()

+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+----------+----+----------+
| trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|birth_date| age|      date|
+--------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+----------+----+----------+
|25223640|2019-10-01 00:01:39|2019-10-01 00:17:20|  2215|         940|             20|Sheffield Ave & K...|          309|Leavitt St & Armi...|Subscriber|  Male|     1987|1987-01-01|  38|2019-10-01|
|25223641|2019-10-01 00:02:16|2019-10-01 00:06:34|  6328|         258|             19|Throop (Loomis) S...|          241| Morgan St & Polk St|Subscriber|  Male|     1998|1998-01-01|  27|2019-10-01|
|25223642|

In [118]:
top10_longest = df.filter(F.col('age').isNotNull()) \
                .orderBy(F.col("tripduration").desc()) \
                .select('age', 'tripduration').limit(10)

In [119]:
top10_longest.show()

[Stage 196:>                                                        (0 + 4) / 4]

+---+------------+
|age|tripduration|
+---+------------+
| 38|     6165373|
| 26|     6039942|
| 39|     5169622|
| 31|     4809091|
| 34|     4123040|
| 31|     3512685|
| 43|     3246842|
| 43|     3047069|
| 35|     2910292|
| 55|     2708185|
+---+------------+



                                                                                

In [124]:
top10_shortest = df.filter(F.col('age').isNotNull()) \
                .orderBy(F.col("tripduration").desc()) \
                .select('age', 'tripduration').limit(10)

In [126]:
top10_shortest.show()



+---+------------+
|age|tripduration|
+---+------------+
| 38|     6165373|
| 26|     6039942|
| 39|     5169622|
| 31|     4809091|
| 34|     4123040|
| 31|     3512685|
| 43|     3246842|
| 43|     3047069|
| 35|     2910292|
| 55|     2708185|
+---+------------+



                                                                                