In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, sum
from pyspark.ml.feature import StringIndexer
from pyspark.sql import DataFrame
from pyspark.sql.types import IntegerType

In [2]:

spark = SparkSession.builder \
    .appName("KafkaSparkStreaming") \
    .master("spark://spark-master:7077") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()


In [3]:
df = spark.read.parquet("hdfs://namenode:9000/raw_data")

In [4]:
# df = spark.read.parquet("hdfs://namenode:9000/raw_data")

In [5]:
df.show()

+--------------------+----------+---------+--------+---------+--------------------+------------------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+--------------------+--------------------+------------------+------------------+------------------+
|          crash_date|crash_time|  borough|latitude|longitude|      on_street_name|   off_street_name|number_of_persons_injured|number_of_persons_killed|number_of_pedestrians_injured|number_of_pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|  vehicle_type_code1|  vehicle_type_code2|vehicle_type_code3|vehicle_type_code4|vehicle_type_code5|
+--------------------+----------+---------+-----

In [6]:
df.printSchema()

root
 |-- crash_date: string (nullable = true)
 |-- crash_time: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- on_street_name: string (nullable = true)
 |-- off_street_name: string (nullable = true)
 |-- number_of_persons_injured: integer (nullable = true)
 |-- number_of_persons_killed: integer (nullable = true)
 |-- number_of_pedestrians_injured: integer (nullable = true)
 |-- number_of_pedestrians_killed: integer (nullable = true)
 |-- number_of_cyclist_injured: integer (nullable = true)
 |-- number_of_cyclist_killed: integer (nullable = true)
 |-- number_of_motorist_injured: integer (nullable = true)
 |-- number_of_motorist_killed: integer (nullable = true)
 |-- contributing_factor_vehicle_1: string (nullable = true)
 |-- contributing_factor_vehicle_2: string (nullable = true)
 |-- vehicle_type_code1: string (nullable = true)
 |-- vehicle_type_code2: string (nullable = true)
 |-- veh

In [7]:
df = df.withColumn('is_injured',
                   when((col('number_of_persons_injured').cast(IntegerType()) +
                         col('number_of_persons_killed').cast(IntegerType())) > 0, 1).otherwise(0))

In [8]:
# Xoa nhung cot khong can thiet
cols_to_drop = ['latitude', 'longitude', 'on_street_name', 'off_street_name', 'number_of_persons_injured', 'number_of_pedestrians_killed',
                'number_of_cyclist_injured', 'number_of_motorist_injured', 'number_of_motorist_killed',
                'vehicle_type_code_3', 'vehicle_type_code_4', 'vehicle_type_code_5',
                'crash_date', 'crash_time', 'number_of_persons_killed', 'number_of_pedestrians_injured',
                'number_of_cyclist_killed']
df_dropped_cols = df.drop(*cols_to_drop)

In [9]:
#Kiem tra du lieu thieu
missing_counts = df_dropped_cols.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_dropped_cols.columns])
missing_counts.show()

+-------+-----------------------------+-----------------------------+------------------+------------------+------------------+------------------+------------------+----------+
|borough|contributing_factor_vehicle_1|contributing_factor_vehicle_2|vehicle_type_code1|vehicle_type_code2|vehicle_type_code3|vehicle_type_code4|vehicle_type_code5|is_injured|
+-------+-----------------------------+-----------------------------+------------------+------------------+------------------+------------------+------------------+----------+
|     34|                            0|                           56|                 0|                67|               198|               198|               198|         0|
+-------+-----------------------------+-----------------------------+------------------+------------------+------------------+------------------+------------------+----------+



In [17]:
#Fill du lieu thieu bang gia tri mode
categorical_cols = ['contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
                    'vehicle_type_code1', 'vehicle_type_code2', 'borough']

# for col_name in categorical_cols:
#     # Get the mode value and bring it to the driver
#     mode_value = df_dropped_cols.groupBy(col_name).count().orderBy('count', ascending=False).first()[0]
#     # Fill missing values with the mode
#     df_dropped_cols = df_dropped_cols.fillna({col_name: mode_value})
for col_name in categorical_cols:
    # Lấy mode
    mode_row = df_dropped_cols.groupBy(col_name).count().orderBy('count', ascending=False).first()
    
    # Kiểm tra nếu mode_row và mode_value không phải là None
    if mode_row is not None and mode_row[0] is not None:
        mode_value = mode_row[0]
    else:
        # Nếu không có mode, sử dụng giá trị 0
        mode_value = 0
    
    # Điền giá trị thiếu bằng mode hoặc 0
    df_dropped_cols = df_dropped_cols.fillna({col_name: mode_value})


In [18]:
#Ma hoa du lieu bien phan loai
def encode_categorical_cols(df: DataFrame, categorical_cols: list) -> DataFrame:
    for col_name in categorical_cols:
        indexer = StringIndexer(inputCol=col_name, outputCol=col_name + "_encoded", handleInvalid='keep')
        df = indexer.fit(df).transform(df)
    return df

df_encoded = encode_categorical_cols(df_dropped_cols, categorical_cols)

In [19]:
df_final = df_encoded.drop(*categorical_cols)

In [20]:
# feature
x = df_final.drop("is_injured")
# label
y = df_final.select("is_injured")