<a href="https://colab.research.google.com/github/brotheramin/MachineLearning/blob/main/SafetyProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install xgboost if not already installed
!pip install xgboost

# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Load the dataset
df = pd.read_csv('IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv')

# Data cleaning: Drop unnecessary columns and extract date-related features
df_clean = df.drop(columns=['Unnamed: 0'])  # Dropping the index column if it exists

# Convert the date to datetime and extract Year, Month, and Day
df_clean['Data'] = pd.to_datetime(df_clean['Data'])
df_clean['Year'] = df_clean['Data'].dt.year
df_clean['Month'] = df_clean['Data'].dt.month
df_clean['Day'] = df_clean['Data'].dt.day

# Drop the original 'Data' column as it's no longer needed
df_clean = df_clean.drop(columns=['Data'])

# Label Encoding for categorical features
columns_to_encode = ['Countries', 'Local', 'Industry Sector', 'Accident Level',
                     'Potential Accident Level', 'Genre', 'Employee or Third Party',
                     'Critical Risk']

# Apply Label Encoding
label_encoders = {}
for col in columns_to_encode:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Vectorize the 'Description' text field using TF-IDF
tfidf = TfidfVectorizer(max_features=100)
X_text = tfidf.fit_transform(df_clean['Description']).toarray()

# Combine the text features with the rest of the dataset (excluding 'Description' column)
X = pd.concat([df_clean.drop(columns=['Description', 'Accident Level']),
               pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])], axis=1)
y = df_clean['Accident Level']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display class distribution before resampling
print("Original class distribution:", Counter(y_train))

# Apply SMOTE with a custom sampling strategy and k_neighbors=1
smote = SMOTE(random_state=42, sampling_strategy={1: 50, 2: 50, 3: 50, 4: 50}, k_neighbors=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display class distribution after SMOTE
print("Resampled class distribution:", Counter(y_train_resampled))

# Train an XGBoost classifier with tuned scale_pos_weight
xgb_model = XGBClassifier(random_state=42, scale_pos_weight=1, n_estimators=200, learning_rate=0.05, max_depth=5)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
y_pred = xgb_model.predict(X_test)
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

Original class distribution: Counter({0: 216, 1: 28, 3: 24, 2: 24, 4: 5})
Resampled class distribution: Counter({0: 216, 3: 50, 2: 50, 1: 50, 4: 50})


Parameters: { "scale_pos_weight" } are not used.



              precision    recall  f1-score   support

           0       0.85      0.91      0.88       100
           1       0.43      0.25      0.32        12
           2       0.33      0.14      0.20         7
           3       0.11      0.17      0.13         6
           4       0.50      0.33      0.40         3

    accuracy                           0.76       128
   macro avg       0.44      0.36      0.39       128
weighted avg       0.74      0.76      0.74       128

