In [1]:
import findspark

In [2]:
# findspark.init() has to run before the pyspark imports below: it is the call
# that makes the local Spark installation importable from this kernel.
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# `functions` is used as a namespace later (functions.min in the Naive Bayes section).
from pyspark.sql import functions
# NOTE(review): OneHotEncoder is imported but never used in the cells visible here.
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
# Create (or reuse) the Spark session for this notebook, requesting 8 GB for
# both the executors and the driver.
spark = (
    SparkSession.builder
    .appName("Severity Models")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

24/11/27 20:30:22 WARN Utils: Your hostname, DESKTOP-FM4F1HL resolves to a loopback address: 127.0.1.1; using 172.31.204.238 instead (on interface eth0)
24/11/27 20:30:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/27 20:30:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the header-less CSV data and let Spark infer each column's type.
# Column names are assigned in the next cell.
path_to_data = "results/cleaned_accidents"
csv_reader = spark.read.option("header", False).option("inferSchema", True)
df = csv_reader.csv(path_to_data)

                                                                                

In [5]:
# Positional names for the header-less CSV columns, in file order.
accident_columns = [
    "ID",
    "Severity",
    "Year",
    "Month",
    "Hour",
    "City",
    "State",
    "Temperature",
    "Humidity",
    "Pressure",
    "Visibility",
    "Wind_Speed",
    "Precipitation",
    "Weather_Category",
    "Traffic_Signal",
]
df = df.toDF(*accident_columns)

In [6]:
# ID is not part of any feature list built below, so remove it up front.
identifier_columns = ("ID",)
df = df.drop(*identifier_columns)

### Logistic Regression Model 

In [7]:
# Categorical text columns; each gets a StringIndexer that writes "<name>Index".
string_cols = ["City", "State", "Weather_Category"]

# handleInvalid="skip" makes a fitted indexer drop rows whose label it did not
# see during fit, so such rows disappear from transformed (e.g. test) data.
stages = [
    StringIndexer(inputCol=name, outputCol=f"{name}Index", handleInvalid="skip")
    for name in string_cols
]

In [8]:
# Columns that are fed to the models as plain numbers.
numerical_cols = [
    "Month",
    "Hour",
    "Temperature",
    "Humidity",
    "Pressure",
    "Visibility",
    "Wind_Speed",
    "Precipitation",
    "Traffic_Signal",
]

# Full feature list: indexed categoricals first, then the numeric columns.
indexed_cols = [name + "Index" for name in string_cols]
feature_cols = indexed_cols + numerical_cols

In [9]:
# Snapshot the indexer-only stage list before the logistic-regression specific
# stages are added; the later models start from this copy.
base_stages = list(stages)

# NOTE(review): the logistic regression is assembled from numerical_cols only,
# so the "...Index" columns produced by the indexers are not consumed by this
# model (the indexers still run and still skip unseen labels) - confirm intended.
assembler = VectorAssembler(outputCol="features", inputCols=numerical_cols)
stages += [assembler]

In [10]:
# Multinomial (softmax) logistic regression predicting Severity from "features".
log_reg = LogisticRegression(
    labelCol="Severity",
    featuresCol="features",
    family="multinomial",
)
stages += [log_reg]

In [11]:
# Logistic-regression pipeline: indexers -> assembler -> classifier.
pipeline = Pipeline().setStages(stages)

In [12]:
# Fixed seed so the 80/20 split - and with it every metric reported below -
# is repeatable from run to run instead of changing on each execution.
SPLIT_SEED = 42
train_data, test_data = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# DataFrames are immutable, so the per-model handles can simply share the one
# test split; no select("*") copy is needed to keep them independent.
test_data_dt = test_data_nb = test_data_rf = test_data

# Fit the logistic-regression pipeline and score the held-out rows.
lr_model = pipeline.fit(train_data)
predictions = lr_model.transform(test_data)

24/11/27 20:30:50 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [13]:
# All logistic-regression metrics compare the same label/prediction columns.
lr_eval_columns = {"labelCol": "Severity", "predictionCol": "prediction"}

lr_accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy", **lr_eval_columns)
lr_precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision", **lr_eval_columns)
lr_recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall", **lr_eval_columns)

# Class-frequency-weighted recall is mathematically equal to accuracy, which is
# why those two numbers coincide in the output.
lr_accuracy = lr_accuracy_evaluator.evaluate(predictions)
lr_precision = lr_precision_evaluator.evaluate(predictions)
lr_recall = lr_recall_evaluator.evaluate(predictions)

print(f"lr_Accuracy: {lr_accuracy}\nlr_Precision: {lr_precision}\nlr_Recall: {lr_recall}")



lr_Accuracy: 0.8411467780755443
lr_Precision: 0.7477137470393347
lr_Recall: 0.8411467780755443


                                                                                

### Decision Tree Model (max depth 5)

In [14]:
# The tree is trained on every feature except CityIndex.
# NOTE(review): presumably City is excluded because its number of distinct
# values exceeds maxBins - confirm. base_stages still contains the City
# indexer, so it is still fitted and still skips rows with unseen cities.
feature_columns_dt = [name for name in feature_cols if name != "CityIndex"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns_dt)
dec_tree = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="Severity",
    predictionCol="dt_prediction",
    maxDepth=5,
    maxBins=100,
)

# Indexers first, then the tree-specific assembler and classifier.
decision_tree_stages = [*base_stages, assembler, dec_tree]

In [15]:
# Decision-tree pipeline: indexers -> assembler -> classifier.
dec_tree_pipeline = Pipeline().setStages(decision_tree_stages)

In [16]:
# Train on the training split, then score the decision tree's test handle.
dec_tree_model = dec_tree_pipeline.fit(dataset=train_data)
dec_tree_predictions = dec_tree_model.transform(dataset=test_data_dt)

                                                                                

In [17]:
# One evaluator per metric, all reading the tree's "dt_prediction" column.
(
    dt_tree_accuracy_evaluator,
    dt_tree_precision_evaluator,
    dt_tree_recall_evaluator,
) = (
    MulticlassClassificationEvaluator(
        labelCol="Severity", predictionCol="dt_prediction", metricName=metric_name
    )
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall")
)

dt_tree_accuracy = dt_tree_accuracy_evaluator.evaluate(dec_tree_predictions)
dt_tree_precision = dt_tree_precision_evaluator.evaluate(dec_tree_predictions)
dt_tree_recall = dt_tree_recall_evaluator.evaluate(dec_tree_predictions)

print(f"dt_tree_Accuracy: {dt_tree_accuracy}, \ndt_tree_Precision: {dt_tree_precision}, \ndt_tree_Recall: {dt_tree_recall}")



dt_tree_Accuracy: 0.8412394237633264, dt_tree_Precision: 0.7708548205361282, dt_tree_Recall: 0.8412394237633263


                                                                                

### Random Forest

In [18]:
# Start from the shared indexer stages, then add the forest-specific assembler
# and classifier. CityIndex is left out of the forest's features.
# NOTE(review): base_stages still contains the City indexer, so it is still
# fitted and still skips rows with unseen cities - confirm this is intended.
random_forest_stages = base_stages.copy()
feature_columns_rf = [f for f in feature_cols if f not in ["CityIndex"]]
assembler = VectorAssembler(inputCols=feature_columns_rf, outputCol="features")

# A random forest samples rows and features while training. Pin the seed so the
# reported metrics and feature importances can be reproduced after a kernel
# restart instead of depending on the estimator's default seed.
rf = RandomForestClassifier(
    featuresCol="features",
    predictionCol="rf_prediction",
    labelCol="Severity",
    numTrees=20,
    maxDepth=8,
    maxBins=100,
    seed=42,
)
random_forest_stages.append(assembler)
random_forest_stages.append(rf)

In [19]:
# Random-forest pipeline: fit on the training split, score the forest's test handle.
rf_pipeline = Pipeline().setStages(random_forest_stages)
rf_model = rf_pipeline.fit(dataset=train_data)
rf_predictions = rf_model.transform(dataset=test_data_rf)

24/11/27 20:32:43 WARN DAGScheduler: Broadcasting large task binary with size 1125.5 KiB
24/11/27 20:32:46 WARN DAGScheduler: Broadcasting large task binary with size 1681.7 KiB
                                                                                

In [20]:
# Build the three evaluators in one pass; they differ only in metricName.
rf_evaluators = {
    metric: MulticlassClassificationEvaluator(
        labelCol="Severity", predictionCol="rf_prediction", metricName=metric
    )
    for metric in ("accuracy", "weightedPrecision", "weightedRecall")
}
rf_accuracy_evaluator = rf_evaluators["accuracy"]
rf_precision_evaluator = rf_evaluators["weightedPrecision"]
rf_recall_evaluator = rf_evaluators["weightedRecall"]

rf_accuracy = rf_accuracy_evaluator.evaluate(rf_predictions)
rf_precision = rf_precision_evaluator.evaluate(rf_predictions)
rf_recall = rf_recall_evaluator.evaluate(rf_predictions)

print(f"rf_Accuracy: {rf_accuracy}\nrf_Precision: {rf_precision}\nrf_Recall: {rf_recall}")

24/11/27 20:32:50 WARN DAGScheduler: Broadcasting large task binary with size 1035.2 KiB
24/11/27 20:32:56 WARN DAGScheduler: Broadcasting large task binary with size 1035.2 KiB
24/11/27 20:33:01 WARN DAGScheduler: Broadcasting large task binary with size 1035.2 KiB

rf_Accuracy: 0.8412755068206731
rf_Precision: 0.7902760657220946
rf_Recall: 0.841275506820673


                                                                                

In [21]:
# The fitted classifier is the last pipeline stage; its importance vector is
# positionally aligned with the assembler inputs in feature_columns_rf.
rf_classifier_model = rf_model.stages[-1]
feature_importances_rf = rf_classifier_model.featureImportances
for position, importance in enumerate(feature_importances_rf):
    feature_name = feature_columns_rf[position]
    print(f"Feature {feature_name} has importance: {importance}")

Feature StateIndex has importance: 0.6037848106498739
Feature Weather_CategoryIndex has importance: 0.06833850694167891
Feature Month has importance: 0.12970612363723588
Feature Hour has importance: 0.02085366264488215
Feature Temperature has importance: 0.008489094352004855
Feature Humidity has importance: 0.004754051844682844
Feature Pressure has importance: 0.03475353256868155
Feature Visibility has importance: 0.0059660196050164245
Feature Wind_Speed has importance: 0.005004625531001442
Feature Precipitation has importance: 0.04154513892238491
Feature Traffic_Signal has importance: 0.07680443330255714


### Naive Bayes

In [24]:
from pyspark.sql.functions import col

In [25]:
# Separate handle for the Naive Bayes training data; its Temperature column is
# rewritten in the next cell without touching train_data.
train_data_nb = train_data.select(col("*"))

In [26]:
# Multinomial Naive Bayes rejects negative feature values, so Temperature is
# shifted up by the amount needed to bring the *training* minimum to zero.
min_temp = train_data_nb.agg(functions.min("Temperature")).collect()[0][0]
if min_temp is None:
    # min() yields NULL when the column has no non-null values; fail with a
    # clear message instead of the opaque float(None) TypeError.
    raise ValueError("Temperature has no non-null values in the training split")
min_temp = float(min_temp)

# Only shift when the minimum is actually negative (same shift as abs(min_temp)
# in that case); a non-negative minimum needs no shift at all.
temperature_shift = max(0.0, -min_temp)


def _shift_temperature(frame):
    """Return `frame` with Temperature shifted by the training offset, floored at 0.

    The offset is learned from the training split only, so a test row colder
    than every training row would still be negative after the shift and make
    the Naive Bayes model fail at prediction time; such values are clamped to 0.
    NULL temperatures stay NULL (the `when` condition is NULL for them).
    """
    shifted = col("Temperature") + temperature_shift
    floored = functions.when(shifted < 0, functions.lit(0.0)).otherwise(shifted)
    return frame.withColumn("Temperature", floored)


train_data_nb = _shift_temperature(train_data_nb)
test_data_nb = _shift_temperature(test_data_nb)

                                                                                

In [27]:
# Naive Bayes uses the full feature list, including CityIndex.
# NOTE(review): the multinomial variant is designed for count-like features; the
# continuous weather columns and index-encoded categories used here may be why
# accuracy is so low below - consider evaluating modelType="gaussian".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
nb = NaiveBayes(
    featuresCol="features",
    labelCol="Severity",
    predictionCol="nb_prediction",
    modelType="multinomial",
)

# Indexers first, then the Naive Bayes assembler and classifier.
naive_bayes_stages = [*base_stages, assembler, nb]

In [28]:
# Fit on the temperature-shifted training data and score the equally shifted test data.
nb_pipeline = Pipeline().setStages(naive_bayes_stages)
nb_model = nb_pipeline.fit(dataset=train_data_nb)
nb_predictions = nb_model.transform(dataset=test_data_nb)

                                                                                

In [30]:
# Accuracy, weighted precision and weighted recall for the "nb_prediction" column.
nb_evaluator = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="nb_prediction"
).setMetricName("accuracy")
nb_precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="nb_prediction"
).setMetricName("weightedPrecision")
nb_recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="nb_prediction"
).setMetricName("weightedRecall")

nb_accuracy = nb_evaluator.evaluate(nb_predictions)
nb_precision = nb_precision_evaluator.evaluate(nb_predictions)
nb_recall = nb_recall_evaluator.evaluate(nb_predictions)

print(f"nb_Accuracy: {nb_accuracy}\nnb_Precision: {nb_precision}\nnb_Recall: {nb_recall}")



nb_Accuracy: 0.060084141788861456
nb_Precision: 0.6899526430602134
nb_Recall: 0.060084141788861456


                                                                                