In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *
import os


In [None]:
spark = (
    SparkSession.builder
    .appName("Exercise6-Optimized")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)
spark


In [None]:
csv_files_path = "csv_files"
# Check if the folder exists
if not os.path.exists(csv_files_path):
    raise FileNotFoundError(f"Folder not found: {csv_files_path}")
else:
    print(f"Found CSV folder: {csv_files_path}")


In [None]:
csv_files_path = "csv_files"
# Check if the folder exists
if not os.path.exists(csv_files_path):
    raise FileNotFoundError(f"Folder not found: {csv_files_path}")
else:
    print(f"Found CSV folder: {csv_files_path}")


In [None]:
df = spark.read.csv("csv_files/Divvy_Trips_2019_Q4.csv",header=True,inferSchema=True)
print("Data loaded successfully!")

In [None]:
df.count()

In [None]:
df.show()

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [None]:
df1 = spark.read.csv("csv_files/Divvy_Trips_2020_Q1.csv",header=True,inferSchema=True)
print("Data loaded successfully!")

In [None]:
df.show()

In [None]:
df1.show()

### Questions for Divvy_Trips_2019_Q4.csv file

### Q1. What are the `average` trip duration per day?

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import *

In [None]:
def average_trip_duration_per_day(df):
    df_cleaned = df.withColumn("tripduration",regexp_replace(col("tripduration"), ",", "").cast(DoubleType()).cast(IntegerType()))

    df_with_date = df_cleaned.withColumn("date",to_date(col("start_time")))

    df_avg_per_day = df_with_date.groupBy("date").agg(avg("tripduration").alias("avg_trip_duration_raw"))
    
    df_rounded = df_avg_per_day.withColumn("avg_trip_duration",round(col("avg_trip_duration_raw"), 2)).drop("avg_trip_duration_raw")

    return df_rounded


In [None]:
result = average_trip_duration_per_day(df)
result.show()

In [None]:
result.write.mode("overwrite").option("header", "True").csv("reports/average_trips_per_day")

### Q2. How many trips were taken per day?

In [None]:
df.show()

In [None]:
def total_trips_per_day(df):
    df_with_date = df.withColumn("date", to_date(col("start_time")))

    df_total_per_day = df_with_date.groupBy("date") \
                        .agg(count("trip_id").alias("total_trips_per_day")) \
                        .orderBy("date")
    df_total_per_day.show()
    return df_total_per_day


In [None]:
result1 = total_trips_per_day(df)
result1.show()
result1.write.mode("overwrite").option("header", "True").csv("reports/trips_taken_per_day")

### Q3. What was the most popular starting trip station for each month?

In [None]:
df.show()

In [None]:
def most_popular_station_each_month(df):
    df_month = df.withColumn("month", month(col("start_time")))
    df_count = df_month.groupBy("month", "from_station_name").agg(count("*").alias("trip_count"))
    window = Window.partitionBy("month").orderBy(col("trip_count").desc())
    df_ranked = df_count.withColumn("rank", row_number().over(window))
    df_most_popular = df_ranked.filter(col("rank") == 1).orderBy("month")
    df_most_popular.show()
    return df_most_popular

In [None]:
result2 = most_popular_station_each_month(df)
result2.show()
result2.write.mode("overwrite").option("header", "True").csv("reports/most_popular_station_each_month")

In [None]:
df.show()

### Q4. What were the top 3 trip stations each day for the last two weeks?

Here is the logic you should use in PySpark:

Step-by-step approach

1. Clean tripduration (if required)

2. Extract date

3. Filter only last 14 days

4. Group by date + start_station

5. rank them

6. take top 3 per day

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

def top_3_trip_count(df):
    df_cleaned = df.withColumn("tripduration",regexp_replace(col("tripduration"), ",", "").cast(DoubleType()).cast("int"))
    df_with_date = df_cleaned.withColumn("date", to_date(col("start_time")))
    max_date = df_with_date.agg(F.max("date")).first()[0]
    last_14 = df_with_date.filter(col("date") > F.date_sub(F.lit(max_date), 14))
    daily_counts = last_14.groupBy("date", "from_station_name").agg(F.count("*").alias("trip_count"))
    window_func = Window.partitionBy("date").orderBy(col("trip_count").desc())
    top3 = daily_counts.select('*', rank().over(window_func).alias('rank')).filter(col('rank') <= 3)
    return top3


In [None]:
result_top3 = top_3_trip_count(df)
result_top3.show()
result_top3.write.mode("overwrite").option("header", "True").csv("reports/top_3_trip_count")

In [None]:
df.printSchema()

In [None]:
df = df.withColumn("birth_date",F.to_date(F.concat(F.lit("01-01-"), F.col("birthyear")), "dd-MM-yyyy"))
df = df.withColumn("age",floor(datediff(current_date(), col("birth_date")) / 365.25))
df = df.withColumn("tripduration", regexp_replace(col("tripduration"), ",", "").cast("double").cast("int"))
df = df.withColumn("date", to_date(col("start_time")))

### Q5. Do `Male's` or `Female's` take longer trips on average?

In [None]:
df_gender_avg = df.filter(col("gender").isNotNull()) \
    .groupBy("gender") \
    .agg(round(avg("tripduration"), 2).alias("Average Trip Duration")) \
    .orderBy("Average Trip Duration", ascending=False)

df_gender_avg.show()

In [None]:
df_gender_avg.write.mode("overwrite").option("header", "True").csv("reports/male_or_female")

### Q6. What is the top 10 ages of those that take the longest trips, and shortest?

In [66]:
def top_10_age_long_short(df):
    df = df.withColumn("birth_date", to_date(concat(lit("01-01-"), col("birthyear")), "dd-MM-yyyy"))
    df = df.withColumn("age", floor(datediff(current_date(), col("birth_date")) / 365.25))
    df = df.withColumn("tripduration", regexp_replace(col("tripduration"), ",", "").cast("double").cast("int"))
    df_age = df.filter(col("age").isNotNull()) \
               .groupBy("age") \
               .agg(
                    max("tripduration").alias("max_tripduration"),
                    min("tripduration").alias("min_tripduration")
               )
    top10_longest = df_age.orderBy(col("max_tripduration").desc()).limit(10)
    top10_shortest = df_age.orderBy(col("min_tripduration").asc()).limit(10)
    top10_longest = top10_longest.select("age", "max_tripduration")
    top10_shortest = top10_shortest.select("age", "min_tripduration")
    return top10_longest, top10_shortest


In [70]:
df.select("tripduration").agg(max("tripduration")).show()



+-----------------+
|max(tripduration)|
+-----------------+
|          8585902|
+-----------------+



                                                                                

In [68]:
df_age = df.filter(col("age").isNotNull()).groupBy("age").agg(max("tripduration").alias("max_tripduration"), min("tripduration").alias("min_tripduration")).show()



+---+----------------+----------------+
|age|max_tripduration|min_tripduration|
+---+----------------+----------------+
| 34|         4123040|              62|
| 31|         4809091|              61|
| 30|         1011368|              62|
| 48|          114121|              61|
| 33|         1673867|              61|
| 61|          169715|              65|
| 25|         2147948|              61|
| 29|          277483|              61|
| 28|          354731|              63|
| 26|         6039942|              61|
| 42|          665467|              61|
| 54|          162375|              67|
| 62|           82908|              62|
| 67|          367103|              96|
| 73|           78364|              65|
| 81|            5216|             161|
| 23|          287297|             107|
| 88|             419|             265|
| 32|         1882691|              61|
| 49|          458113|              61|
+---+----------------+----------------+
only showing top 20 rows


                                                                                

In [67]:
top10_longest, top10_shortest = top_10_age_long_short(df)
top10_longest.show()
top10_shortest.show()
top10_longest.write.mode("overwrite").option("header", "True").csv("reports/top_10_age_longest")
top10_shortest.write.mode("overwrite").option("header", "True").csv("reports/top_10_age_shortest")

                                                                                

+---+----------------+
|age|max_tripduration|
+---+----------------+
| 38|         6165373|
| 26|         6039942|
| 39|         5169622|
| 31|         4809091|
| 34|         4123040|
| 43|         3246842|
| 35|         2910292|
| 55|         2708185|
| 25|         2147948|
| 32|         1882691|
+---+----------------+



                                                                                

+---+----------------+
|age|min_tripduration|
+---+----------------+
| 42|              61|
| 32|              61|
| 59|              61|
| 49|              61|
| 48|              61|
| 40|              61|
| 25|              61|
| 36|              61|
| 26|              61|
| 37|              61|
+---+----------------+



                                                                                