In [0]:
# Path of the Delta-format table that the rest of the notebook analyses.
path = '/mnt/gold/gold/'

# Read it into a Spark DataFrame.
df = (
    spark.read
    .format('delta')
    .load(path)
)

In [0]:
# Convert the whole Spark DataFrame to a pandas DataFrame so it can be used
# with seaborn / matplotlib below.
df_pd = df.toPandas()

# Preview the dataset. A bare last expression is rendered by the notebook as a
# formatted table, whereas print() only emits a plain-text dump.
df_pd.head()

Use a count plot (a bar chart of class frequencies) to understand the distribution of the `is_injured` target variable.

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart showing how many rows fall into each is_injured value.
target_ax = sns.countplot(data=df_pd, x='is_injured')
target_ax.set_title('Distribution of Target Variable (is_injured)')
plt.show()


Correlation Heatmap:
Check correlations between features to see how they relate to is_injured.


In [0]:
import numpy as np

# Compute the correlation matrix on numeric/boolean columns only.
# Calling .corr() on a frame that still holds string/object columns raises in
# pandas >= 2.0; selecting dtypes explicitly works on old and new versions.
corr_matrix = df_pd.select_dtypes(include=['number', 'bool']).corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric. Building it from ones instead of from the correlation
# values guarantees that cells whose correlation is exactly 0 are hidden too.
upper_triangle_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", mask=upper_triangle_mask)
plt.title('Correlation Heatmap')
plt.show()


Biểu đồ heatmap thể hiện hệ số tương quan giữa các cặp biến, với giá trị dao động từ -1 đến 1:
+1: Tương quan dương hoàn hảo (khi một biến tăng thì biến kia cũng tăng theo tỷ lệ tuyến tính).
-1: Tương quan âm hoàn hảo (khi một biến tăng thì biến kia giảm theo tỷ lệ tuyến tính).
0: Không có tương quan tuyến tính giữa hai biến.

Tương quan giữa is_injured và hầu hết các biến khác khá yếu (gần bằng 0), cho thấy không có mối quan hệ tuyến tính mạnh mẽ.
Mối tương quan cao nhất là với biến contributing_factor_vehicle_2_encoded (khoảng 0.11), điều này gợi ý rằng biến này có thể có ảnh hưởng nhẹ đến việc dự đoán is_injured.
Có tương quan mạnh (0.46) giữa hai biến contributing_factor_vehicle_4_encoded và contributing_factor_vehicle_3_encoded. Điều này cho thấy khả năng đa cộng tuyến (multicollinearity), có thể gây ảnh hưởng tiêu cực đến hiệu suất của mô hình.

 Box Plots for Key Features

In [0]:
# Encoded columns to compare against the target, one panel each.
features = [
    'contributing_factor_vehicle_1_encoded',
    'contributing_factor_vehicle_2_encoded',
    'vehicle_type_code1_encoded',
    'borough_encoded',
]

# 2x2 grid: each panel shows the column's distribution split by is_injured.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for panel, col in zip(axes.flat, features):
    sns.boxplot(data=df_pd, x='is_injured', y=col, ax=panel)
    panel.set_title(f'Box Plot of {col}')
plt.tight_layout()
plt.show()


In [0]:
from sklearn.model_selection import train_test_split
# Columns used for modelling: the target (is_injured) plus the encoded features.
required_columns = [
    'is_injured', 'contributing_factor_vehicle_1_encoded', 'contributing_factor_vehicle_2_encoded',
    'contributing_factor_vehicle_3_encoded', 'contributing_factor_vehicle_4_encoded',
    'contributing_factor_vehicle_5_encoded', 'vehicle_type_code1_encoded',
    'vehicle_type_code2_encoded', 'borough_encoded'
]
# Select only those columns in Spark before converting, so less data is
# collected to pandas. Note that this rebinds df_pd to the reduced frame.
df_pd = df.select(required_columns).toPandas()

# Define features and target
X = df_pd.drop(columns=['is_injured'])
y = df_pd['is_injured']

# Split the data 80/20 with a fixed seed. Stratifying on the target keeps the
# class proportions the same in the train and test sets, so the evaluation
# metrics are not skewed by an unlucky split of an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Random Forest Model

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit a Random Forest with default hyper-parameters; the fixed seed keeps the
# result repeatable. fit() returns the estimator itself, so it can be chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out set and print the per-class precision/recall/F1 report.
rf_predictions = rf_model.predict(X_test)
print(
    "Random Forest Model Performance:",
    classification_report(y_test, rf_predictions),
    sep="\n",
)


In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Actual-vs-predicted counts for the Random Forest on the test set.
cm = confusion_matrix(y_test, rf_predictions)

# Draw the counts as an annotated heatmap (rows = actual, columns = predicted).
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=["No Injury", "Injury"],
    yticklabels=["No Injury", "Injury"],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("Random Forest Confusion Matrix")
plt.show()


Visualize how much each feature contributes to the Random Forest model's predictions.

In [0]:
# Importance score per feature from the fitted Random Forest, paired with the
# feature names taken from the training columns.
importances = rf_model.feature_importances_
features = X.columns

# Horizontal bar chart: one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features, importances, color='skyblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
plt.show()


XGBoost Model


In [0]:
# Install the xgboost package from a shell command so the next cell can import it.
# NOTE(review): the version is not pinned, so results may change between runs.
# NOTE(review): `%pip install xgboost==<version>` in the first cell is the usual
# recommendation; on Databricks `%pip` may reset Python state, so confirm
# before changing or moving this line.
!pip install xgboost


In [0]:
from xgboost import XGBClassifier

# Fit an XGBoost classifier with default hyper-parameters and a fixed seed.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Score the held-out set and print the per-class precision/recall/F1 report.
xgb_predictions = xgb_model.predict(X_test)
print(
    "XGBoost Model Performance:",
    classification_report(y_test, xgb_predictions),
    sep="\n",
)


In [0]:
# Actual-vs-predicted counts for XGBoost on the test set.
xgb_cm = confusion_matrix(y_test, xgb_predictions)

# Draw the counts as an annotated heatmap (rows = actual, columns = predicted).
plt.figure(figsize=(6, 4))
sns.heatmap(
    xgb_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=["No Injury", "Injury"],
    yticklabels=["No Injury", "Injury"],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("XGBoost Confusion Matrix")
plt.show()

 Feature Importance

In [0]:
# Importance score per feature from the fitted XGBoost model, paired with the
# feature names taken from the training columns.
xgb_importances = xgb_model.feature_importances_
features = X.columns

# Horizontal bar chart: one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features, xgb_importances, color='skyblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance (XGBoost)')
plt.show()


Evaluate and Compare Results

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Metrics to report, keyed by the column name used in the comparison table.
# The original cell called these functions without importing them and read
# xgb_* variables that were never assigned, so it failed on a fresh kernel.
metric_functions = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# Test-set predictions of each model, keyed by display name.
model_predictions = {
    'Random Forest': rf_predictions,
    'XGBoost': xgb_predictions,
}

# Store performance metrics for both models, computed the same way for each
# so the comparison is like-for-like.
model_comparison_metrics = {'Model': list(model_predictions)}
for metric_name, metric_fn in metric_functions.items():
    model_comparison_metrics[metric_name] = [
        metric_fn(y_test, predictions) for predictions in model_predictions.values()
    ]

# Convert to DataFrame for easier plotting
comparison_df = pd.DataFrame(model_comparison_metrics)

# Plot model comparison: one group of bars per model, one bar per metric
ax = comparison_df.set_index('Model').plot(kind='bar', figsize=(10, 6), colormap='viridis')

# Add metric values on top of the bars
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height:.2f}',
                xy=(p.get_x() + p.get_width() / 2, height),
                xytext=(0, 8),  # 8 points vertical offset
                textcoords='offset points',
                ha='center', va='bottom', fontsize=10, color='black')

plt.title('Random Forest vs XGBoost Performance Comparison')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


ROC Curve Comparison


In [0]:
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(model, X_test, y_test, label):
    """Add one model's ROC curve to the current matplotlib plot.

    Scores are taken from column 1 of ``model.predict_proba(X_test)`` and
    compared against ``y_test``. The legend entry is ``label`` followed by the
    area under the curve, rounded to two decimals.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    # The threshold array returned by roc_curve is not needed for plotting.
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    plt.plot(fpr, tpr, label=f'{label} (AUC = {auc(fpr, tpr):.2f})')

# One shared figure holding both models' ROC curves.
plt.figure(figsize=(8, 6))
for estimator, display_name in ((rf_model, 'Random Forest'), (xgb_model, 'XGBoost')):
    plot_roc_curve(estimator, X_test, y_test, display_name)

# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', color='navy')

plt.title('ROC Curve Comparison: Random Forest vs XGBoost')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()


An AUC between 0.5 and 1 means the model separates the positive and negative classes better than random guessing (AUC = 0.5). The closer the AUC is to 1, the better the model.

Save the Models

In [0]:
import joblib

# Persist both fitted models with joblib; the file names are relative to the
# current working directory.
for fitted_model, filename in (
    (rf_model, 'random_forest_model.pkl'),
    (xgb_model, 'xgboost_model.pkl'),
):
    joblib.dump(fitted_model, filename)

print("Models saved successfully!")

