<a href="https://colab.research.google.com/github/brotheramin/MachineLearning/blob/main/Safety%20Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv')

# Data cleaning: Drop unnecessary columns and extract date-related features
df_clean = df.drop(columns=['Unnamed: 0'])  # Dropping the index column if it exists

# Convert the date to datetime and extract Year, Month, and Day
df_clean['Data'] = pd.to_datetime(df_clean['Data'])
df_clean['Year'] = df_clean['Data'].dt.year
df_clean['Month'] = df_clean['Data'].dt.month
df_clean['Day'] = df_clean['Data'].dt.day

# Drop the original 'Data' column as it's no longer needed
df_clean = df_clean.drop(columns=['Data'])

# Label Encoding for categorical features
columns_to_encode = ['Countries', 'Local', 'Industry Sector', 'Accident Level',
                     'Potential Accident Level', 'Genre', 'Employee or Third Party',
                     'Critical Risk']

# Apply Label Encoding
label_encoders = {}
for col in columns_to_encode:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Vectorize the 'Description' text field using TF-IDF
tfidf = TfidfVectorizer(max_features=100)
X_text = tfidf.fit_transform(df_clean['Description']).toarray()

# Combine the text features with the rest of the dataset (excluding 'Description' column)
X = pd.concat([df_clean.drop(columns=['Description', 'Accident Level']),
               pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])], axis=1)
y = df_clean['Accident Level']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = rf_model.predict(X_test)
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.78      1.00      0.88       100
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         3

    accuracy                           0.78       128
   macro avg       0.16      0.20      0.18       128
weighted avg       0.61      0.78      0.69       128



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv')

# Data cleaning: Drop unnecessary columns and extract date-related features
df_clean = df.drop(columns=['Unnamed: 0'])  # Dropping the index column if it exists

# Convert the date to datetime and extract Year, Month, and Day
df_clean['Data'] = pd.to_datetime(df_clean['Data'])
df_clean['Year'] = df_clean['Data'].dt.year
df_clean['Month'] = df_clean['Data'].dt.month
df_clean['Day'] = df_clean['Data'].dt.day

# Drop the original 'Data' column as it's no longer needed
df_clean = df_clean.drop(columns=['Data'])

# Label Encoding for categorical features
columns_to_encode = ['Countries', 'Local', 'Industry Sector', 'Accident Level',
                     'Potential Accident Level', 'Genre', 'Employee or Third Party',
                     'Critical Risk']

# Apply Label Encoding
label_encoders = {}
for col in columns_to_encode:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Vectorize the 'Description' text field using TF-IDF
tfidf = TfidfVectorizer(max_features=100)
X_text = tfidf.fit_transform(df_clean['Description']).toarray()

# Combine the text features with the rest of the dataset (excluding 'Description' column)
X = pd.concat([df_clean.drop(columns=['Description', 'Accident Level']),
               pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])], axis=1)
y = df_clean['Accident Level']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle class imbalance using RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Train a Random Forest classifier with balanced classes
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
y_pred = rf_model.predict(X_test)
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.79      0.99      0.88       100
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         3

    accuracy                           0.77       128
   macro avg       0.16      0.20      0.18       128
weighted avg       0.61      0.77      0.68       128



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Install xgboost if not already installed
!pip install xgboost

# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv')

# Data cleaning: Drop unnecessary columns and extract date-related features
df_clean = df.drop(columns=['Unnamed: 0'])  # Dropping the index column if it exists

# Convert the date to datetime and extract Year, Month, and Day
df_clean['Data'] = pd.to_datetime(df_clean['Data'])
df_clean['Year'] = df_clean['Data'].dt.year
df_clean['Month'] = df_clean['Data'].dt.month
df_clean['Day'] = df_clean['Data'].dt.day

# Drop the original 'Data' column as it's no longer needed
df_clean = df_clean.drop(columns=['Data'])

# Label Encoding for categorical features
columns_to_encode = ['Countries', 'Local', 'Industry Sector', 'Accident Level',
                     'Potential Accident Level', 'Genre', 'Employee or Third Party',
                     'Critical Risk']

# Apply Label Encoding
label_encoders = {}
for col in columns_to_encode:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Vectorize the 'Description' text field using TF-IDF
tfidf = TfidfVectorizer(max_features=100)
X_text = tfidf.fit_transform(df_clean['Description']).toarray()

# Combine the text features with the rest of the dataset (excluding 'Description' column)
X = pd.concat([df_clean.drop(columns=['Description', 'Accident Level']),
               pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])], axis=1)
y = df_clean['Accident Level']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the training set with k_neighbors set to 3
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train an XGBoost classifier with class weighting
xgb_model = XGBClassifier(random_state=42, scale_pos_weight=1)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
y_pred = xgb_model.predict(X_test)
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)



Parameters: { "scale_pos_weight" } are not used.



              precision    recall  f1-score   support

           0       0.84      0.87      0.86       100
           1       0.00      0.00      0.00        12
           2       0.20      0.14      0.17         7
           3       0.18      0.33      0.24         6
           4       0.50      0.33      0.40         3

    accuracy                           0.71       128
   macro avg       0.35      0.34      0.33       128
weighted avg       0.69      0.71      0.70       128



In [9]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv')

# Data cleaning: Drop unnecessary columns and extract date-related features
df_clean = df.drop(columns=['Unnamed: 0'])  # Dropping the index column if it exists

# Convert the date to datetime and extract Year, Month, and Day
df_clean['Data'] = pd.to_datetime(df_clean['Data'])
df_clean['Year'] = df_clean['Data'].dt.year
df_clean['Month'] = df_clean['Data'].dt.month
df_clean['Day'] = df_clean['Data'].dt.day

# Drop the original 'Data' column as it's no longer needed
df_clean = df_clean.drop(columns=['Data'])

# Label Encoding for categorical features
columns_to_encode = ['Countries', 'Local', 'Industry Sector', 'Accident Level',
                     'Potential Accident Level', 'Genre', 'Employee or Third Party',
                     'Critical Risk']

# Apply Label Encoding
label_encoders = {}
for col in columns_to_encode:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Vectorize the 'Description' text field using TF-IDF
tfidf = TfidfVectorizer(max_features=100)
X_text = tfidf.fit_transform(df_clean['Description']).toarray()

# Combine the text features with the rest of the dataset (excluding 'Description' column)
X = pd.concat([df_clean.drop(columns=['Description', 'Accident Level']),
               pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])], axis=1)
y = df_clean['Accident Level']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the training set with k_neighbors set to 3
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
y_pred = rf_model.predict(X_test)
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.77      0.95      0.85       100
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         3

    accuracy                           0.74       128
   macro avg       0.15      0.19      0.17       128
weighted avg       0.60      0.74      0.67       128



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
print("Original class distribution:", Counter(y_train))

Original class distribution: Counter({0: 216, 1: 28, 3: 24, 2: 24, 4: 5})


In [13]:
smote = SMOTE(random_state=42, sampling_strategy={1: 50, 2: 50, 3: 50, 4: 50}, k_neighbors=2)

In [14]:
# Get the current class distribution
class_counts = Counter(y_train)

# Set the target number for each class to be the minimum of:
# a) 50 (your original target)
# b) The current count for that class
# c) The minimum count among all classes
min_count = min(class_counts.values())
sampling_strategy = {cls: min(count, 50, min_count) for cls, count in class_counts.items()}

smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors=2)

In [16]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Display class distribution before resampling
print("Original class distribution:", Counter(y_train))

# Get the current class distribution
class_counts = Counter(y_train)

# Set the target number for each class
max_count = max(class_counts.values())
sampling_strategy = {cls: max(count, min(max_count, 50)) for cls, count in class_counts.items()}

# Use RandomOverSampler
ros = RandomOverSampler(random_state=42, sampling_strategy=sampling_strategy)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Display class distribution after resampling
print("Resampled class distribution:", Counter(y_train_resampled))

Original class distribution: Counter({0: 216, 1: 28, 3: 24, 2: 24, 4: 5})
Resampled class distribution: Counter({0: 216, 3: 50, 2: 50, 1: 50, 4: 50})


In [17]:
xgb_model = XGBClassifier(random_state=42, scale_pos_weight=1, n_estimators=200, learning_rate=0.05, max_depth=5)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
y_pred = xgb_model.predict(X_test)
report = classification_report(y_test, y_pred)

# Print the classification report
print(report)

Parameters: { "scale_pos_weight" } are not used.



              precision    recall  f1-score   support

           0       0.84      0.96      0.90       100
           1       0.25      0.08      0.12        12
           2       0.00      0.00      0.00         7
           3       0.17      0.17      0.17         6
           4       1.00      0.67      0.80         3

    accuracy                           0.78       128
   macro avg       0.45      0.38      0.40       128
weighted avg       0.71      0.78      0.74       128

