<a href="https://colab.research.google.com/github/brotheramin/MachineLearning/blob/main/IndustrialAccidentsProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Read the industrial safety CSV into a DataFrame.
# data_path is an absolute path; adjust it to wherever the CSV actually lives.
data_path = "/content/IHMStefanini_industrial_safety_and_health_database.csv"
merged_data = pd.read_csv(filepath_or_buffer=data_path)

# Show the column index so the names used further down can be checked against the file.
print("Columns in the dataset:", merged_data.columns, sep="\n")

# Preprocessing: integer-encode the categorical feature columns.
# One LabelEncoder is kept per column (keyed by column name) so that each
# column's codes can be mapped back to its own labels with inverse_transform.
categorical_cols = ['Countries', 'Local', 'Industry Sector', 'Genre', 'Employee ou Terceiro', 'Risco Critico']
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col in categorical_cols:
    if col in merged_data.columns:
        merged_data[col] = label_encoders[col].fit_transform(merged_data[col])
    else:
        # Report the missing column and carry on rather than abort.
        print(f"Column {col} not found in the dataset.")

# Encode the "Potential Accident Level" column with its OWN encoder.
# Re-fitting label_encoders['Risco Critico'] here would overwrite the classes it
# learned from 'Risco Critico', leaving that entry unable to decode its own column.
# The integer codes produced are the same either way; only the stored encoder differs.
if 'Potential Accident Level' in merged_data.columns:
    label_encoders['Potential Accident Level'] = LabelEncoder()
    merged_data['Potential Accident Level'] = label_encoders['Potential Accident Level'].fit_transform(
        merged_data['Potential Accident Level']
    )
else:
    print("Column 'Potential Accident Level' not found in the dataset.")

# Target: integer-encode the 'Accident Level' labels. The fitted encoder is kept
# in label_encoder_y so the original class names stay available via .classes_.
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(merged_data['Accident Level'])

# Features: every column except the target and 'Data'. Names that are absent
# from the frame are filtered out first so that drop() does not raise on them.
columns_to_drop = [
    name
    for name in ['Accident Level', 'Data']
    if name in merged_data.columns
]
X = merged_data.drop(columns=columns_to_drop)

# Split the data into training (80%) and testing (20%) sets, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train a Random Forest classifier with class weights to handle imbalance
rf_model_weighted = RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced')
rf_model_weighted.fit(X_train, y_train)

# Predictions and evaluation
y_pred_weighted = rf_model_weighted.predict(X_test)

# Pass the complete list of encoded class ids explicitly. Without `labels`,
# classification_report infers the classes from y_test/y_pred; if a rare class is
# absent from both, the inferred count no longer matches target_names and the call
# fails, and the confusion matrix would silently lose a row/column.
class_labels = list(range(len(label_encoder_y.classes_)))
report_weighted = classification_report(
    y_test,
    y_pred_weighted,
    labels=class_labels,
    target_names=label_encoder_y.classes_,
)
conf_matrix_weighted = confusion_matrix(y_test, y_pred_weighted, labels=class_labels)

# Print the results
print("Classification Report:")
print(report_weighted)
print("\nConfusion Matrix:")
print(conf_matrix_weighted)


Columns in the dataset:
Index(['Data', 'Countries', 'Local', 'Industry Sector', 'Accident Level',
       'Potential Accident Level', 'Genre', 'Employee ou Terceiro',
       'Risco Critico'],
      dtype='object')
Classification Report:
              precision    recall  f1-score   support

           I       0.79      0.79      0.79        66
          II       0.14      0.12      0.13         8
         III       0.20      0.33      0.25         6
          IV       0.25      0.17      0.20         6
           V       0.00      0.00      0.00         2

    accuracy                           0.64        88
   macro avg       0.28      0.28      0.27        88
weighted avg       0.63      0.64      0.63        88


Confusion Matrix:
[[52  3  7  3  1]
 [ 7  1  0  0  0]
 [ 3  1  2  0  0]
 [ 3  1  1  1  0]
 [ 1  1  0  0  0]]
