In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

In [None]:
# Créer une session Spark
spark = SparkSession.builder.appName("DataCleaningWithSpark").getOrCreate()

In [3]:
# 1. Lire le fichier CSV avec Spark
data_spark = spark.read.csv("Road Accident Data.csv", header=True, inferSchema=True)


                                                                                

In [4]:
# Afficher les premières lignes pour vérifier les données
data_spark.show(10)

+--------------------+-------------+-----------+--------------------+--------------------+-----------------+---------+--------------------+--------------------------+-------------------+---------+--------------------+------------------+-------------------+-----------------------+------------------+-----------+-------------------+-------------------+--------------------+------------+
|      Accident_Index|Accident Date|Day_of_Week|    Junction_Control|     Junction_Detail|Accident_Severity| Latitude|    Light_Conditions|Local_Authority_(District)|Carriageway_Hazards|Longitude|Number_of_Casualties|Number_of_Vehicles|       Police_Force|Road_Surface_Conditions|         Road_Type|Speed_limit|               Time|Urban_or_Rural_Area|  Weather_Conditions|Vehicle_Type|
+--------------------+-------------+-----------+--------------------+--------------------+-----------------+---------+--------------------+--------------------------+-------------------+---------+--------------------+-----------

In [5]:
#Supprimer les colonnes inutiles (Carriageway_Hazards, Weather_Conditions)
data_spark = data_spark.drop("Carriageway_Hazards", "Weather_Conditions")

In [6]:
# Supprimer les lignes avec des valeurs manquantes dans des colonnes spécifiques
data_cleaned = data_spark.dropna(subset=["Road_Surface_Conditions", "Road_Type", "Time"])

In [7]:
# Vérifier les NaN restants
print("Vérifier les NaN restants dans le DataFrame :")
data_cleaned.select([col(c).isNull().alias(c) for c in data_cleaned.columns]).show()

Vérifier les NaN restants dans le DataFrame :
+--------------+-------------+-----------+----------------+---------------+-----------------+--------+----------------+--------------------------+---------+--------------------+------------------+------------+-----------------------+---------+-----------+-----+-------------------+------------+
|Accident_Index|Accident Date|Day_of_Week|Junction_Control|Junction_Detail|Accident_Severity|Latitude|Light_Conditions|Local_Authority_(District)|Longitude|Number_of_Casualties|Number_of_Vehicles|Police_Force|Road_Surface_Conditions|Road_Type|Speed_limit| Time|Urban_or_Rural_Area|Vehicle_Type|
+--------------+-------------+-----------+----------------+---------------+-----------------+--------+----------------+--------------------------+---------+--------------------+------------------+------------+-----------------------+---------+-----------+-----+-------------------+------------+
|         false|        false|      false|           false|          

In [8]:
# Dictionnaire de mapping pour plusieurs colonnes
column_mappings = {
    "Day_of_Week": {
        "Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4,
        "Friday": 5, "Saturday": 6, "Sunday": 7
    },
    "Junction_Control": {
        "Authorised person": 1, "Auto traffic signal": 1, "Stop sign": 1,
        "Auto traffic sigl": 1,"Give way or uncontrolled": 2,
          "Not at junction or within 20 metres": 2,
        "Data missing or out of range": 3
    },
    "Junction_Detail": { 
       "Roundabout": 1, "Mini-roundabout": 1, "Crossroads": 2, 
       "More than 4 arms (not roundabout)": 2, "T or staggered junction": 3, "Slip road": 4,
       "Private drive or entrance": 4, "Other junction": 5, "Not at junction or within 20 metres": 6
    },
    "Accident_Severity": {
        "Slight": 1, "Serious": 2, "Fetal": 3, "Fatal": 3
    },
    "Light_Conditions": { 
        "Darkness - lighting unknown": 0, "Darkness - no lighting": 0, "Darkness - lights unlit": 0, 
        "Darkness - lights lit": 1, "Daylight": 2
    },
    "Road_Surface_Conditions": {
        "Dry": 1,  "Wet or damp": 2, "Snow": 3,
          "Frost or ice": 4,  "Flood over 3cm. deep": 5
    },
    "Road_Type": {
        "Slip road": 1, "Single carriageway": 2, 
        "One way street": 3, "Roundabout": 4, "Dual carriageway": 5

    },
    "Urban_or_Rural_Area": {
        "Urban": 1, "Rural": 0
    },
    "Vehicle_Type": {
          "Pedal cycle": 1,  "Ridden horse": 1,  "Motorcycle 50cc and under": 1,  
          "Motorcycle 125cc and under": 1,  "Motorcycle over 125cc and up to 500cc": 1,  
          "Motorcycle over 500cc": 1,  "Car": 2,  "Taxi/Private hire car": 2,  
          "Minibus (8 - 16 passenger seats)": 2,  "Van / Goods 3.5 tonnes mgw or under": 3,  
          "Agricultural vehicle": 3,  "Goods over 3.5t. and under 7.5t": 4,  
          "Bus or coach (17 or more pass seats)": 4,  "Goods 7.5 tonnes mgw and over": 4,  
          "Other vehicle": 5  
    }
}

In [9]:
from pyspark.sql.functions import create_map, lit, col

In [10]:
# Appliquer les mappings directement sur les colonnes existantes
for column, mapping in column_mappings.items():
    # Créez une expression "when" pour effectuer la transformation
    expression = None
    for key, value in mapping.items():
        if expression is None:
            expression = when(col(column) == key, value)
        else:
            expression = expression.when(col(column) == key, value)
    
    # Ajoutez une gestion des valeurs non mappées (optionnelle)
    expression = expression.otherwise(None)  # Ou -1 si vous voulez une valeur par défaut
    
    # Remplacez la colonne par la version transformée
    data_cleaned = data_cleaned.withColumn(column, expression)

In [11]:
# Afficher quelques exemples
data_cleaned.select(list(column_mappings.keys())).show()


+-----------+----------------+---------------+-----------------+----------------+-----------------------+---------+-------------------+------------+
|Day_of_Week|Junction_Control|Junction_Detail|Accident_Severity|Light_Conditions|Road_Surface_Conditions|Road_Type|Urban_or_Rural_Area|Vehicle_Type|
+-----------+----------------+---------------+-----------------+----------------+-----------------------+---------+-------------------+------------+
|          1|               3|              6|                1|               2|                      1|        5|                  1|           2|
|          3|               2|              1|                1|               1|                      2|        4|                  1|           2|
|          3|               3|              6|                3|               2|                      1|        2|                  1|           2|
|          3|               3|              6|                1|               0|                      1| 

In [12]:
# Renommer la colonne Accident_Index en AccidentIndex
data_cleaned = data_cleaned.withColumnRenamed("Accident_Index", "AccidentIndex")


In [13]:
# (Optionnel) Afficher les premières lignes pour vérifier le changement
data_cleaned.show()

+--------------------+-------------+-----------+----------------+---------------+-----------------+---------+----------------+--------------------------+---------+--------------------+------------------+-------------------+-----------------------+---------+-----------+-------------------+-------------------+------------+
|       AccidentIndex|Accident Date|Day_of_Week|Junction_Control|Junction_Detail|Accident_Severity| Latitude|Light_Conditions|Local_Authority_(District)|Longitude|Number_of_Casualties|Number_of_Vehicles|       Police_Force|Road_Surface_Conditions|Road_Type|Speed_limit|               Time|Urban_or_Rural_Area|Vehicle_Type|
+--------------------+-------------+-----------+----------------+---------------+-----------------+---------+----------------+--------------------------+---------+--------------------+------------------+-------------------+-----------------------+---------+-----------+-------------------+-------------------+------------+
|132b02e7-ef77-438...|   2021-0

In [None]:
from uuid import uuid4

for idx, row in data_cleaned.iterrows():
    if 'E+' in str(row['Accident_Index']):  # Vérifier si 'E+' est une sous-chaîne
        data_cleaned.at[idx, 'Accident_Index'] = str(uuid4())  # Convertir UUID en chaîne

# Enregistrer le DataFrame modifié dans un fichier CSV
data_cleaned.to_csv('Road Accident Data.csv', index=False)

In [14]:
# Chemins pour sauvegarder les fichiers
output_csv_path = 'road clean.csv'
output_parquet_path ='road clean.parquet'

# Enregistrer au format CSV
data_cleaned.coalesce(1).write.mode("overwrite").option("header", "true").csv(output_csv_path)

# Enregistrer au format Parquet
# Écriture des données en un seul fichier Parquet
data_cleaned.coalesce(1).write.mode("overwrite").parquet(output_parquet_path)


print("Les fichiers ont été sauvegardés en CSV et Parquet avec succès.")


                                                                                

24/12/10 20:15:04 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1045158 ms exceeds timeout 120000 ms
24/12/10 20:15:04 WARN SparkContext: Killing executors is not supported by current scheduler.
24/12/10 20:32:09 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$