In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, sum
from pyspark.ml.feature import StringIndexer
from pyspark.sql import DataFrame
from pyspark.sql.types import IntegerType

In [19]:

# Create (or reuse) the notebook's Spark session: it targets the master at
# spark://spark-master:7077 and sets Kryo as the serializer.
spark = (
    SparkSession.builder
    .appName("KafkaSparkStreaming")
    .master("spark://spark-master:7077")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)


In [31]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType,TimestampType,DoubleType

# Explicit schema for the raw collision records.  crash_date is parsed as a
# timestamp; every other top-level attribute is read as a nullable string, and
# `location` is a nested struct of three nullable strings.
schema = StructType(
    [StructField("crash_date", TimestampType(), True)]
    + [
        StructField(field_name, StringType(), True)
        for field_name in (
            "crash_time",
            "on_street_name",
            "off_street_name",
            "cross_street_name",
            "number_of_persons_injured",
            "number_of_persons_killed",
            "number_of_pedestrians_injured",
            "number_of_pedestrians_killed",
            "number_of_cyclist_injured",
            "number_of_cyclist_killed",
            "number_of_motorist_injured",
            "number_of_motorist_killed",
            "contributing_factor_vehicle_1",
            "contributing_factor_vehicle_2",
            "contributing_factor_vehicle_3",
            "contributing_factor_vehicle_4",
            "contributing_factor_vehicle_5",
            "collision_id",
            "vehicle_type_code1",
            "vehicle_type_code2",
            "borough",
            "zip_code",
            "latitude",
            "longitude",
        )
    ]
    + [
        StructField(
            "location",
            StructType([
                StructField(nested_name, StringType(), True)
                for nested_name in ("latitude", "longitude", "human_address")
            ]),
            True,
        )
    ]
)

# Read the multi-line JSON file from HDFS using the explicit `schema`
# rather than letting Spark infer one.
df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json("hdfs://namenode:9000/raw_data/h9gi-nx95.json")
)

# Preview the first rows.
df.show()


+-------------------+----------+--------------------+--------------------+--------------------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+--------------------+--------------------+---------+--------+---------+----------+--------------------+
|         crash_date|crash_time|      on_street_name|     off_street_name|   cross_street_name|number_of_persons_injured|number_of_persons_killed|number_of_pedestrians_injured|number_of_pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|contributing_factor_vehicle_3|contributing_factor_vehic

In [37]:
# Total number of rows in the dataset.
total_rows = df.count()

# Group on every column and keep only the groups that occur more than once.
# The result is the number of distinct rows that have at least one exact
# duplicate (not the number of surplus copies).
duplicate_rows = (
    df.groupBy(*df.columns)
    .count()
    .where(col("count") > 1)
    .count()
)

# Report both figures.
print(f"Tổng số dòng: {total_rows}")
print(f"Số dòng trùng lặp: {duplicate_rows}")


Tổng số dòng: 1000
Số dòng trùng lặp: 0


In [38]:
# Check for missing data: one output column per input column, holding the
# number of nulls found in it.
# NOTE: `sum` is pyspark.sql.functions.sum (imported in the first cell), not
# the Python builtin.
missing_counts = df.select(
    *(
        sum(col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in df.columns
    )
)
missing_counts.show()

+----------+----------+--------------+---------------+-----------------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-----------------------------+------------+------------------+------------------+-------+--------+--------+---------+--------+
|crash_date|crash_time|on_street_name|off_street_name|cross_street_name|number_of_persons_injured|number_of_persons_killed|number_of_pedestrians_injured|number_of_pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|contributing_factor_vehicle_3|contributing_factor_vehicle_4|contributing_factor_vehicle_5|collision_id|vehicle_type_code1

In [39]:
# The injury count is loaded as a string, so cast it to an integer before
# summing it over the whole dataset.
df = df.withColumn(
    "number_of_persons_injured",
    col("number_of_persons_injured").cast(IntegerType()),
)
total_injured = df.agg({"number_of_persons_injured": "sum"}).first()[0]
total_injured

454

In [36]:
# The fatality count is loaded as a string, so cast it to an integer before
# summing it over the whole dataset.
df = df.withColumn(
    "number_of_persons_killed",
    col("number_of_persons_killed").cast(IntegerType()),
)
total_killed = df.agg({"number_of_persons_killed": "sum"}).first()[0]
total_killed

4

In [None]:
# Preview the frame after the integer casts (show()'s defaults spelled out).
df.show(n=20, truncate=True)

In [None]:
# Print the column names and types of `df` as a tree.
df.printSchema()

In [None]:
# Binary label: 1 when the crash has at least one person injured or killed,
# otherwise 0.
#
# The two counts are tested separately instead of being added together:
# `injured + killed` is NULL as soon as either operand is NULL, which made the
# old condition fall through to 0 even when the other count was positive.
# With OR, `TRUE OR NULL` is TRUE, so one known positive count is enough; when
# neither comparison is TRUE the condition is FALSE/NULL and the label is 0.
# For non-negative counts this matches the previous sum-based test whenever
# both values are present.
df = df.withColumn(
    'is_injured',
    when(
        (col('number_of_persons_injured') > 0) | (col('number_of_persons_killed') > 0),
        1,
    ).otherwise(0),
)

In [None]:
# Drop the columns that are not needed as model inputs.  This includes every
# raw injured/killed count, which the `is_injured` label is derived from.
# NOTE(review): vehicle_type_code3/4/5 are not declared in the load schema;
# drop() ignores names that are absent, so they are harmless here.
# NOTE(review): the nested `location` struct (which also carries latitude and
# longitude) is not dropped — confirm whether that is intended.
cols_to_drop = [
    'latitude',
    'longitude',
    'on_street_name',
    'off_street_name',
    'number_of_persons_injured',
    'number_of_pedestrians_killed',
    'number_of_cyclist_injured',
    'number_of_motorist_injured',
    'number_of_motorist_killed',
    'vehicle_type_code3',
    'vehicle_type_code4',
    'vehicle_type_code5',
    'crash_date',
    'crash_time',
    'number_of_persons_killed',
    'number_of_pedestrians_injured',
    'number_of_cyclist_killed',
]
df_dropped_cols = df.drop(*cols_to_drop)

In [None]:
# Re-check missing data on the reduced frame: null count per remaining column.
# NOTE: `sum` is pyspark.sql.functions.sum (imported in the first cell), not
# the Python builtin.
missing_counts = df_dropped_cols.select(
    [
        sum(col(column_name).isNull().cast(IntegerType())).alias(column_name)
        for column_name in df_dropped_cols.columns
    ]
)
missing_counts.show()

In [None]:
# Fill missing values in the categorical columns with each column's mode
# (its most frequent non-null value).
categorical_cols = ['contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
                    'vehicle_type_code1', 'vehicle_type_code2', 'borough']

for col_name in categorical_cols:
    # NULL forms its own group in groupBy().count(); if it were the largest
    # group the "mode" would be NULL and the fallback below would be used
    # instead of a real category.  Exclude nulls before counting.
    # The secondary sort on the value makes ties deterministic.
    mode_row = (
        df_dropped_cols
        .filter(col(col_name).isNotNull())
        .groupBy(col_name)
        .count()
        .orderBy(col('count').desc(), col(col_name).asc())
        .first()
    )

    # first() returns None only when the column holds no non-null value at
    # all; in that case fall back to 0, as the original cell did.
    mode_value = mode_row[0] if mode_row is not None else 0

    # Replace the nulls of this column with the chosen value.
    df_dropped_cols = df_dropped_cols.fillna({col_name: mode_value})


In [None]:
# Encode the categorical variables.
def encode_categorical_cols(df: DataFrame, categorical_cols: list) -> DataFrame:
    """Append a StringIndexer-encoded column for each categorical column.

    For every name in ``categorical_cols`` a ``StringIndexer`` is fitted on the
    current frame and its output is written to ``<name>_encoded``.  The indexer
    is created with ``handleInvalid='keep'``.  The source columns are left in
    place.

    Args:
        df: Input DataFrame containing all of ``categorical_cols``.
        categorical_cols: Names of the columns to index.

    Returns:
        The DataFrame with one extra ``*_encoded`` column per input name.
    """
    encoded = df
    for name in categorical_cols:
        fitted_indexer = StringIndexer(
            inputCol=name,
            outputCol=f"{name}_encoded",
            handleInvalid='keep',
        ).fit(encoded)
        encoded = fitted_indexer.transform(encoded)
    return encoded

# Index every categorical column of the imputed frame.
df_encoded = encode_categorical_cols(
    df=df_dropped_cols,
    categorical_cols=categorical_cols,
)

In [None]:
# Keep everything except the original categorical columns; their `_encoded`
# counterparts stay in the frame.
df_final = df_encoded.select(
    [kept for kept in df_encoded.columns if kept not in categorical_cols]
)

In [None]:
# Check once more whether any null values remain in the final frame.
# NOTE: `sum` is pyspark.sql.functions.sum (imported in the first cell), not
# the Python builtin.
null_flags = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df_final.columns
]
missing_counts = df_final.select(null_flags)
missing_counts.show()

In [None]:
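# # If null values still remain, drop the rows that contain them.
# # (Row-wise null removal is `DataFrame.na.drop` / `dropna`; `DataFrame.drop` removes columns.)
# df_final = df_final.na.drop(how='any', thresh=None, subset=None)
# df_final.toPandas().info()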

In [None]:
# Preview the final frame (show()'s defaults spelled out).
df_final.show(n=20, truncate=True)

In [None]:
# Basic summary statistics, collected to pandas and transposed so each
# column of df_final becomes a row.
(
    df_final
    .describe()
    .toPandas()
    .T
)

In [None]:
# Features: every column except the label.
x = df_final.select(
    *[feature for feature in df_final.columns if feature != "is_injured"]
)
# Label: the binary is_injured flag.
y = df_final.select(col("is_injured"))