In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, count, date_format

# Create SparkSession
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ex7-q1")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Set log level
spark.sparkContext.setLogLevel("ERROR")

# Weather observations file (fixed-width text, one observation per line).
WEATHER_PATH = "file:///home/ahmad/bd-analytics/data/725053-94728-2022"

# Raw temperature field value used for "missing" (usually reported as 9999
# in the ncdc noaa dataset).
MISSING_TEMPERATURE = "9999"


def parse_temperature_record(line):
    """Extract the day, sign and temperature magnitude from one observation line.

    Args:
        line: One raw text line of the weather file.

    Returns:
        A ``(day, sign, magnitude)`` tuple where ``day`` is the ``YYYYMMDD``
        string at ``line[15:23]``, ``sign`` is the character at ``line[87]``
        and ``magnitude`` is the four-digit field at ``line[88:92]`` divided
        by 10. Returns ``None`` when the line is shorter than 92 characters,
        the field holds the missing-value sentinel, or the field is not made
        of four ASCII digits.
    """
    if len(line) < 92:
        return None
    raw_temperature = line[88:92]
    # Compare the raw text against the sentinel instead of comparing floats
    # after the division.
    if raw_temperature == MISSING_TEMPERATURE:
        return None
    # A malformed field would make float() raise inside the executor and
    # abort the whole job; skip such records instead.
    if not (raw_temperature.isascii() and raw_temperature.isdigit()):
        return None
    return (line[15:23], line[87], float(raw_temperature) / 10)


# Load the RDD
rdd = spark.sparkContext.textFile(WEATHER_PATH)

# Parse every line and keep only the records that carry a usable temperature
temperature_rdd = rdd.map(parse_temperature_record) \
    .filter(lambda record: record is not None)

# Adjust the temperature based on the sign
adjusted_temperature_rdd = temperature_rdd.map(lambda x: (x[0], x[2] * (-1 if x[1] == '-' else 1)))

# Reduce by key to calculate the sum and count of temperatures for each day
# ex : (day1 , (temp1, 1)), (day1, (temp2, 1)) => (day1, (temp1 + temp2, 1 + 1))
sum_count_rdd = adjusted_temperature_rdd.mapValues(lambda temp: (temp, 1)) \
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

# Map again to calculate the average temperature for each day
average_temperature_rdd = sum_count_rdd.mapValues(lambda x: x[0] / x[1])

# Convert RDD to DataFrame
temperature_df = average_temperature_rdd.toDF(["Day", "AverageTemperature"])

# Convert the Day column from a YYYYMMDD string to a proper date type
temperature_df = temperature_df.withColumn("Day", to_date(col("Day"), "yyyyMMdd"))

# Show the DataFrame
temperature_df.show()



24/03/28 15:56:16 WARN Utils: Your hostname, ahmad-pc resolves to a loopback address: 127.0.1.1; using 192.168.65.9 instead (on interface enp0s1)
24/03/28 15:56:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/28 15:56:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+----------+-------------------+
|       Day| AverageTemperature|
+----------+-------------------+
|2022-01-03|               0.49|
|2022-01-04|-3.7500000000000013|
|2022-01-05|  3.311363636363637|
|2022-01-08|-3.8000000000000007|
|2022-01-09| 1.4249999999999996|
|2022-01-14| 4.1645161290322585|
|2022-01-16| -7.924999999999999|
|2022-01-18| 0.8083333333333335|
|2022-01-22| -6.391666666666666|
|2022-01-25| 2.9458333333333333|
|2022-01-28|-0.6297872340425534|
|2022-01-31| -3.895833333333332|
|2022-02-01|-1.8833333333333322|
|2022-02-02|  3.531111111111111|
|2022-02-04|  4.135714285714287|
|2022-02-07| 0.8871428571428575|
|2022-02-08| 2.8500000000000014|
|2022-02-09| 2.1333333333333333|
|2022-02-11|  8.591666666666667|
|2022-02-13| 0.3303030303030298|
+----------+-------------------+
only showing top 20 rows



In [2]:
# Read the taxi data and derive the pickup date from the pickup datetime.
# The "header" / "inferschema" reader options were dropped: they are CSV
# options and have no effect on a Parquet read, which takes its schema from
# the file itself.
taxi_data = (
    spark.read
    .parquet("file:///home/ahmad/bd-analytics/data/all-2022-cleaned.parquet")
    .withColumn("pickup_date", to_date(col("tpep_pickup_datetime")))
)

# Group by date and count the number of trips for each day
daily_trip_count = taxi_data.groupBy("pickup_date").agg(count("*").alias("trip_count"))

# Show the daily trip count
daily_trip_count.show()




+-----------+----------+
|pickup_date|trip_count|
+-----------+----------+
| 2022-10-19|    120603|
| 2022-10-11|    111627|
| 2022-11-01|    111385|
| 2022-10-06|    114403|
| 2022-10-23|     97313|
| 2022-10-18|    116275|
| 2022-10-07|    115382|
| 2022-10-20|    123044|
| 2022-10-15|    117597|
| 2022-10-24|    102383|
| 2022-10-14|    117868|
| 2022-10-05|    107012|
| 2022-10-08|    113332|
| 2022-10-26|    119781|
| 2022-10-21|    119621|
| 2022-10-10|     88671|
| 2022-10-04|    101071|
| 2022-10-17|    103878|
| 2022-10-27|    122828|
| 2022-10-30|     99003|
+-----------+----------+
only showing top 20 rows



                                                                                

In [3]:
# Match each day's average temperature with that day's trip count.
same_day = temperature_df["Day"] == daily_trip_count["pickup_date"]

# Inner join on the matching day, then discard the duplicate date column
# coming from the trip-count side.
joined_data = (
    temperature_df
    .join(daily_trip_count, on=same_day, how="inner")
    .drop("pickup_date")
)
joined_data.show()

                                                                                

+----------+------------------+----------+
|       Day|AverageTemperature|trip_count|
+----------+------------------+----------+
|2022-10-19| 8.779166666666663|    120603|
|2022-10-11|15.504166666666668|    111627|
|2022-11-01|16.265151515151512|    111385|
|2022-10-06|15.816666666666666|    114403|
|2022-10-23|14.072413793103447|     97313|
|2022-10-18|11.589285714285714|    116275|
|2022-10-07|           19.4875|    115382|
|2022-10-20|10.250000000000002|    123044|
|2022-10-15|14.937499999999998|    117597|
|2022-10-24|13.958823529411768|    102383|
|2022-10-14|13.920370370370366|    117868|
|2022-10-05|13.293877551020405|    107012|
|2022-10-08|12.937499999999998|    113332|
|2022-10-26|18.860377358490567|    119781|
|2022-10-21|12.124999999999998|    119621|
|2022-10-10|           14.8125|     88671|
|2022-10-04|  9.64901960784314|    101071|
|2022-10-17|15.890000000000002|    103878|
|2022-10-27|15.975000000000001|    122828|
|2022-10-30|           11.4125|     99003|
+----------

In [6]:
import altair as alt

# Format Day as "yyyy-MM-dd" text before handing the data to pandas / Altair.
joined_data = joined_data.withColumn("Day", date_format(col("Day"), "yyyy-MM-dd"))

# Collect the joined table (one row per day) into a pandas DataFrame.
plot_frame = joined_data.toPandas()

# Axis encodings: the temperature axis is not forced to start at zero.
temperature_axis = alt.X(
    'AverageTemperature',
    title='Average Temperature (°C)',
    scale=alt.Scale(zero=False),
)
trips_axis = alt.Y('trip_count', title='Trip Count')

scatter_plot = (
    alt.Chart(plot_frame)
    .mark_circle()
    .encode(
        x=temperature_axis,
        y=trips_axis,
        tooltip=['Day', 'AverageTemperature', 'trip_count'],
    )
    .interactive()
    .properties(width=800, height=600)  # chart size
)

# Save the plot to a file
scatter_plot.save('scatter_plot.html')


                                                                                

In [None]:
# Display the scatter plot built in the previous cell.
# NOTE(review): how and where the chart renders depends on the installed
# Altair version and active renderer — confirm in the target environment.
scatter_plot.show()