In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, sum
from pyspark.ml.feature import StringIndexer
from pyspark.sql import DataFrame
from pyspark.sql.types import IntegerType

In [None]:
# Create (or reuse) the Spark session for this notebook.
# Executor and driver memory are both requested as 4g, and Kryo serialization
# is enabled with a 2000m maximum buffer.
spark = (
    SparkSession.builder
    .appName("MapReduce k-NN")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000m")
    .getOrCreate()
)


In [None]:
# Call the API to fetch the crash records as JSON
import json

import requests

api_url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"  # Replace with the actual API URL
# A timeout keeps the cell from hanging forever if the endpoint stops responding.
response = requests.get(api_url, timeout=30)

# Check the response status
if response.status_code == 200:
    # Parse the JSON payload (a dictionary or a list of records)
    data = response.json()

    # Re-serialise every record to JSON text before handing it to Spark.
    # spark.read.json expects JSON strings; passing the parsed Python dicts
    # leaves Spark to parse their Python repr, which is not strict JSON and is
    # the likely source of the `_corrupt_record` column in the inferred schema.
    json_records = [json.dumps(record) for record in data]
    df = spark.read.json(spark.sparkContext.parallelize(json_records))

    # Show the inferred schema and a sample of rows
    df.printSchema()
    df.show()
else:
    print(f"Lỗi khi gọi API: {response.status_code}")

root
 |-- _corrupt_record: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- collision_id: string (nullable = true)
 |-- contributing_factor_vehicle_1: string (nullable = true)
 |-- contributing_factor_vehicle_2: string (nullable = true)
 |-- contributing_factor_vehicle_3: string (nullable = true)
 |-- contributing_factor_vehicle_4: string (nullable = true)
 |-- contributing_factor_vehicle_5: string (nullable = true)
 |-- crash_date: string (nullable = true)
 |-- crash_time: string (nullable = true)
 |-- cross_street_name: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- human_address: string (nullable = true)
 |    |-- latitude: string (nullable = true)
 |    |-- longitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- number_of_cyclist_injured: string (nullable = true)
 |-- number_of_cyclist_killed: string (nullable = true)
 |-- number_of_motorist_injured: string (nullable = 

In [None]:
# Columns kept for the analysis, grouped by theme. The concatenation order
# below is the column order of df_selected.
time_and_place_columns = [
    "crash_date", "crash_time", "borough",
    "latitude", "longitude",
    "on_street_name", "off_street_name",
]
casualty_columns = [
    "number_of_persons_injured", "number_of_persons_killed",
    "number_of_pedestrians_injured", "number_of_pedestrians_killed",
    "number_of_cyclist_injured", "number_of_cyclist_killed",
    "number_of_motorist_injured", "number_of_motorist_killed",
]
cause_and_vehicle_columns = [
    "contributing_factor_vehicle_1", "contributing_factor_vehicle_2",
    "vehicle_type_code1", "vehicle_type_code2",
]
selected_columns = time_and_place_columns + casualty_columns + cause_and_vehicle_columns

df_selected = df.select(*selected_columns)

In [None]:
# Count the total number of rows
total_rows = df_selected.count()

# Count the duplicate rows: the rows that would disappear if exact duplicates
# were removed. (Grouping and counting the groups with count > 1 gives the
# number of duplicated value combinations, not the number of rows.)
duplicate_rows = total_rows - df_selected.distinct().count()

# Report the results
print(f"Tổng số dòng: {total_rows}")
print(f"Số dòng trùng lặp: {duplicate_rows}")


Tổng số dòng: 1000
Số dòng trùng lặp: 0


In [None]:
# Check for missing data: one null count per column of df_selected.
# NOTE: `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
null_count_exprs = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df_selected.columns
]
missing_counts = df_selected.select(*null_count_exprs)
missing_counts.show()

+----------+----------+-------+--------+---------+--------------+---------------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+------------------+------------------+
|crash_date|crash_time|borough|latitude|longitude|on_street_name|off_street_name|number_of_persons_injured|number_of_persons_killed|number_of_pedestrians_injured|number_of_pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|vehicle_type_code1|vehicle_type_code2|
+----------+----------+-------+--------+---------+--------------+---------------+-------------------------+------------------------+-----------------------------+----------------------------+---------------------

In [None]:
# Total number of persons injured across all crashes.
# The aggregate must read number_of_persons_injured: summing
# number_of_cyclist_injured reports cyclists only, which contradicts the label.
total_injuries = df_selected.selectExpr(
    "sum(number_of_persons_injured) as total_injuries"
).collect()[0]["total_injuries"]
print("Total number of persons injured:", total_injuries)

Total number of persons injured: 38.0


In [None]:
# Total number of persons killed across all crashes.
# The aggregate yields exactly one row, so take it and read the aliased field.
killed_summary = df_selected.selectExpr("sum(number_of_persons_killed) as total_kill")
total_kill = killed_summary.first()["total_kill"]
print("Total number of persons killed:", total_kill)

Total number of persons killed: 4.0


In [None]:
# Preview the selected columns (show() prints only the first rows, with long values truncated).
df_selected.show()

+--------------------+----------+---------+---------+----------+--------------------+--------------------+-------------------------+------------------------+-----------------------------+----------------------------+-------------------------+------------------------+--------------------------+-------------------------+-----------------------------+-----------------------------+--------------------+--------------------+
|          crash_date|crash_time|  borough| latitude| longitude|      on_street_name|     off_street_name|number_of_persons_injured|number_of_persons_killed|number_of_pedestrians_injured|number_of_pedestrians_killed|number_of_cyclist_injured|number_of_cyclist_killed|number_of_motorist_injured|number_of_motorist_killed|contributing_factor_vehicle_1|contributing_factor_vehicle_2|  vehicle_type_code1|  vehicle_type_code2|
+--------------------+----------+---------+---------+----------+--------------------+--------------------+-------------------------+----------------------

In [None]:
# Inspect the column types of the selected subset before deriving features from them.
df_selected.printSchema()

root
 |-- crash_date: string (nullable = true)
 |-- crash_time: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- on_street_name: string (nullable = true)
 |-- off_street_name: string (nullable = true)
 |-- number_of_persons_injured: string (nullable = true)
 |-- number_of_persons_killed: string (nullable = true)
 |-- number_of_pedestrians_injured: string (nullable = true)
 |-- number_of_pedestrians_killed: string (nullable = true)
 |-- number_of_cyclist_injured: string (nullable = true)
 |-- number_of_cyclist_killed: string (nullable = true)
 |-- number_of_motorist_injured: string (nullable = true)
 |-- number_of_motorist_killed: string (nullable = true)
 |-- contributing_factor_vehicle_1: string (nullable = true)
 |-- contributing_factor_vehicle_2: string (nullable = true)
 |-- vehicle_type_code1: string (nullable = true)
 |-- vehicle_type_code2: string (nullable = true)



In [None]:
# Binary label: 1 when the crash injured or killed at least one person, else 0.
# Each count is tested on its own rather than summed first: in Spark SQL
# `null + n` is null, so a missing value in one column would otherwise hide a
# positive count in the other and the row would fall through to 0.
# The source columns are strings, hence the explicit numeric cast.
has_injured = col('number_of_persons_injured').cast('double') > 0
has_killed = col('number_of_persons_killed').cast('double') > 0
df_selected = df_selected.withColumn(
    'is_injured',
    when(has_injured | has_killed, 1).otherwise(0),
)

In [None]:
# Drop the columns that are no longer needed: location, date/time and the raw
# casualty counts. What remains is borough, the contributing factors, the
# vehicle types and the is_injured label.
cols_to_drop = [
    'latitude',
    'longitude',
    'on_street_name',
    'off_street_name',
    'number_of_persons_injured',
    'number_of_pedestrians_killed',
    'number_of_cyclist_injured',
    'number_of_motorist_injured',
    'number_of_motorist_killed',
    'crash_date',
    'crash_time',
    'number_of_persons_killed',
    'number_of_pedestrians_injured',
    'number_of_cyclist_killed',
]
df_dropped_cols = df_selected.drop(*cols_to_drop)

In [None]:
# Check for missing data again, now on the reduced column set.
# NOTE: `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
null_count_exprs = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df_dropped_cols.columns
]
missing_counts = df_dropped_cols.select(*null_count_exprs)
missing_counts.show()

+-------+-----------------------------+-----------------------------+------------------+------------------+----------+
|borough|contributing_factor_vehicle_1|contributing_factor_vehicle_2|vehicle_type_code1|vehicle_type_code2|is_injured|
+-------+-----------------------------+-----------------------------+------------------+------------------+----------+
|    367|                            4|                          231|                15|               346|         0|
+-------+-----------------------------+-----------------------------+------------------+------------------+----------+



In [None]:
# Fill missing values in the categorical columns with each column's mode
categorical_cols = ['contributing_factor_vehicle_1', 'contributing_factor_vehicle_2',
                    'vehicle_type_code1', 'vehicle_type_code2', 'borough']

for col_name in categorical_cols:
    # Most frequent *non-null* value. Nulls are excluded before counting:
    # otherwise, when null is the largest group (plausible for columns with many
    # missing values), the "mode" is null and the fill falls back to 0 instead
    # of a real category. Ties are broken by value so reruns pick the same mode.
    mode_row = (
        df_dropped_cols
        .filter(col(col_name).isNotNull())
        .groupBy(col_name)
        .count()
        .orderBy(col('count').desc(), col(col_name).asc())
        .first()
    )

    # No row at all means the column has no non-null value; keep the original
    # fallback of 0 in that case.
    mode_value = mode_row[0] if mode_row is not None else 0

    # Fill the missing values with the mode (or the fallback)
    df_dropped_cols = df_dropped_cols.fillna({col_name: mode_value})


In [None]:
# Encode the categorical variables as numeric indices
def encode_categorical_cols(df: DataFrame, categorical_cols: list) -> DataFrame:
    """Append a ``<name>_encoded`` index column for every categorical column.

    A separate StringIndexer is fitted on, and applied to, the frame for each
    column in turn. The original string columns are left in place.

    Args:
        df: DataFrame that contains every column named in ``categorical_cols``.
        categorical_cols: Names of the columns to index.

    Returns:
        The input frame with one additional ``<name>_encoded`` column per entry
        of ``categorical_cols``.
    """
    encoded = df
    for source_col in categorical_cols:
        # handleInvalid='keep' asks the indexer to assign invalid labels an
        # index of their own rather than raise.
        fitted_indexer = StringIndexer(
            inputCol=source_col,
            outputCol=source_col + "_encoded",
            handleInvalid='keep',
        ).fit(encoded)
        encoded = fitted_indexer.transform(encoded)
    return encoded

df_encoded = encode_categorical_cols(df_dropped_cols, categorical_cols)

In [None]:
# Drop the original string categoricals; only is_injured and the *_encoded columns remain.
df_final = df_encoded.drop(*categorical_cols)

In [None]:
# Check once more whether any null values remain in the final frame.
# NOTE: `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
null_count_exprs = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df_final.columns
]
missing_counts = df_final.select(*null_count_exprs)
missing_counts.show()

+----------+-------------------------------------+-------------------------------------+--------------------------+--------------------------+---------------+
|is_injured|contributing_factor_vehicle_1_encoded|contributing_factor_vehicle_2_encoded|vehicle_type_code1_encoded|vehicle_type_code2_encoded|borough_encoded|
+----------+-------------------------------------+-------------------------------------+--------------------------+--------------------------+---------------+
|         0|                                    0|                                    0|                         0|                         0|              0|
+----------+-------------------------------------+-------------------------------------+--------------------------+--------------------------+---------------+



In [None]:
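# If any null values remain, drop those rows (currently disabled).
# Note: the PySpark method for this is dropna(); DataFrame.drop() removes columns.
# df_final = df_final.dropna(how='any', thresh=None, subset=None)
# df_final.toPandas().info()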

In [None]:
# Preview the final encoded dataset (label plus the indexed categorical columns).
df_final.show()

+----------+-------------------------------------+-------------------------------------+--------------------------+--------------------------+---------------+
|is_injured|contributing_factor_vehicle_1_encoded|contributing_factor_vehicle_2_encoded|vehicle_type_code1_encoded|vehicle_type_code2_encoded|borough_encoded|
+----------+-------------------------------------+-------------------------------------+--------------------------+--------------------------+---------------+
|         1|                                 16.0|                                  0.0|                       0.0|                       1.0|            0.0|
|         1|                                 17.0|                                  0.0|                       0.0|                       0.0|            0.0|
|         0|                                  2.0|                                  0.0|                       0.0|                       5.0|            0.0|
|         0|                                  

In [None]:
# Number of distinct rows in the final frame (exact duplicates collapsed).
df_final.distinct().count()

659

In [None]:
# Basic descriptive statistics (count, mean, stddev, min, max), transposed so
# that each column of df_final becomes one row of the displayed table.
summary_stats = df_final.describe().toPandas()
summary_stats.T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
is_injured,1000,0.344,0.4752787496721159,0,1
contributing_factor_vehicle_1_encoded,1000,4.308,5.616108677427517,0.0,34.0
contributing_factor_vehicle_2_encoded,1000,0.399,1.6919402062902997,0.0,16.0
vehicle_type_code1_encoded,1000,1.517,3.878240806313057,0.0,33.0
vehicle_type_code2_encoded,1000,2.092,4.167361623725842,0.0,34.0
borough_encoded,1000,1.403,1.4500667893906447,0.0,5.0


In [None]:
# Group identical rows and keep only the value combinations that occur more than once
duplicate_rows = df_final.groupBy(df_final.columns).count().filter("count > 1")

# Show the duplicated combinations together with how often each occurs
duplicate_rows.show()

# Summarise in a single pass:
#   n_groups       - how many distinct combinations are duplicated
#   n_surplus_rows - how many rows are redundant copies (occurrences beyond the
#                    first in each group); this is the number of duplicate rows.
# Counting the groups alone under-reports the duplicate rows whenever a
# combination occurs more than twice. coalesce() covers the no-duplicates case,
# where sum() over zero rows is null.
dup_stats = duplicate_rows.selectExpr(
    "count(*) AS n_groups",
    "coalesce(sum(`count` - 1), 0) AS n_surplus_rows",
).first()
duplicate_count = dup_stats["n_surplus_rows"]
print(f"Number of duplicated value combinations: {dup_stats['n_groups']}")
print(f"Number of duplicate rows: {duplicate_count}")


+----------+-------------------------------------+-------------------------------------+--------------------------+--------------------------+---------------+-----+
|is_injured|contributing_factor_vehicle_1_encoded|contributing_factor_vehicle_2_encoded|vehicle_type_code1_encoded|vehicle_type_code2_encoded|borough_encoded|count|
+----------+-------------------------------------+-------------------------------------+--------------------------+--------------------------+---------------+-----+
|         0|                                  1.0|                                  0.0|                       0.0|                       0.0|            3.0|    4|
|         1|                                  1.0|                                  0.0|                       0.0|                       0.0|            0.0|    6|
|         0|                                  0.0|                                  0.0|                       1.0|                       0.0|            1.0|    6|
|         

In [None]:
# Destination of the exported dataset inside the Colab filesystem
csv_path = "/content/df_final.csv"

# Step 1: Collect the Spark DataFrame to the driver as a pandas DataFrame
df_pandas = df_final.toPandas()

# Step 2: Write it out as CSV, without the pandas index column
df_pandas.to_csv(csv_path, index=False)

# Step 3: Trigger a browser download of the file (Google Colab only)
from google.colab import files
files.download(csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>