# Practice 2

In [21]:
# Sample sensor readings, one record per truck observation.
# Each record is [speed, engine temperature in Celsius, fuel efficiency (mpg)].
sensor_data = [
    [60, 75, 7.5],
    [70, 80, 8.2],
    [65, 77, 7.8],
    [80, 90, 6.9],
    [50, 68, 9.1],
    [75, 88, 7.0],
    [62, 74, 7.7],
    [78, 85, 6.8],
]

In [22]:
import pyspark
# Reuse the active SparkContext if this cell is re-run; constructing a second
# SparkContext while one is still alive raises an error. When a context already
# exists, getOrCreate returns it and the configuration below is not applied.
spark_conf = pyspark.SparkConf().setMaster('local[*]').setAppName('RDD practice')
sc = pyspark.SparkContext.getOrCreate(spark_conf)

# numSlices=3 asks Spark to split the data into 3 partitions, which can be
# processed in parallel by the Spark engine.
rdd = sc.parallelize(sensor_data, numSlices=3)

In [25]:
# Step 3: Apply Transformations

# Convert engine temperature from Celsius to Fahrenheit (the speed > 65 MPH filter is applied in a later cell)

def celsius_to_fahrenheit(temp):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Args:
        temp: Temperature in degrees Celsius (int or float).

    Returns:
        The equivalent temperature in degrees Fahrenheit, as a float.
    """
    scaled = temp * 9 / 5  # 9/5 Fahrenheit degrees per Celsius degree
    return scaled + 32     # shift by the Fahrenheit freezing-point offset

def _with_fahrenheit(record):
    """Return a new record whose second field is converted from Celsius to Fahrenheit."""
    return [record[0], celsius_to_fahrenheit(record[1]), record[2]]

rdd_converted = rdd.map(_with_fahrenheit)
# Request up to 40 rows; the sample data set is smaller, so every record is shown.
rdd_converted.take(40)

[[60, 167.0, 7.5],
 [70, 176.0, 8.2],
 [65, 170.6, 7.8],
 [80, 194.0, 6.9],
 [50, 154.4, 9.1],
 [75, 190.4, 7.0],
 [62, 165.2, 7.7],
 [78, 185.0, 6.8]]

In [24]:
# Keep only the records whose speed (first field) is strictly greater than 65.
filtered_rdd = rdd_converted.filter(lambda record: record[0] > 65)
# Request up to 40 rows to display the surviving records.
filtered_rdd.take(40)

[[70, 176.0, 8.2], [80, 194.0, 6.9], [75, 190.4, 7.0], [78, 185.0, 6.8]]

In [26]:
# Order the filtered records by engine temperature (second field), highest first.
filtered_rdd.sortBy(keyfunc=lambda record: record[1], ascending=False).collect()

[[80, 194.0, 6.9], [75, 190.4, 7.0], [78, 185.0, 6.8], [70, 176.0, 8.2]]

In [27]:
# Step 4: Trigger Actions
# Fetch the first three filtered records to spot-check the transformations
# and the order in which the data comes back.
sample_data = filtered_rdd.take(num=3)
sample_data

[[70, 176.0, 8.2], [80, 194.0, 6.9], [75, 190.4, 7.0]]

In [18]:
# Step 5: Aggregate Data
# Compute the average fuel efficiency of the filtered records.
#
# map() extracts the fuel efficiency (third field) from each record, and
# aggregate() accumulates a (running sum, record count) pair in a single pass.
# Unlike reduce(), aggregate() starts from a zero value, so an empty RDD yields
# (0.0, 0) instead of raising an error.
fuel_sum, record_count = filtered_rdd.map(lambda record: record[2]).aggregate(
    (0.0, 0),
    lambda acc, mpg: (acc[0] + mpg, acc[1] + 1),                      # fold one value into a partition's accumulator
    lambda left, right: (left[0] + right[0], left[1] + right[1]),     # merge accumulators from different partitions
)
total_fuel_efficiency = fuel_sum

if record_count:
    average_fuel_efficiency = total_fuel_efficiency / record_count
    print("Average Fuel Efficiency of Trucks Going Over 65 MPH:", average_fuel_efficiency)
else:
    # No record passed the speed filter, so the average is undefined.
    average_fuel_efficiency = None
    print("No trucks going over 65 MPH; average fuel efficiency is undefined.")

Average Fuel Efficiency of Trucks Going Over 65 MPH: 7.2250000000000005


In [28]:
# Step 6: Stop the Spark Context
# Shut down the SparkContext created earlier in this notebook; RDDs built from it
# cannot be used after this call.
sc.stop()