# In [None]:
#RANDOM FOREST MODEL CREATION

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
# Train and evaluate a random forest classifier that predicts the
# 'ASTPhenotype' column of the summary CSV from all remaining columns.

# Name of the column holding the label to predict.
TARGET_COL = 'ASTPhenotype'

# Load dataset
data = pd.read_csv('/content/bin_converted_summary.csv')

# Rows without a label cannot be used for supervised learning; imputing the
# target with a column mean would invent a class value that does not exist.
data = data.dropna(subset=[TARGET_COL])

# Fill missing feature values with the column mean. numeric_only=True keeps
# this working when the frame contains non-numeric columns (pandas >= 2.0
# raises TypeError otherwise); non-numeric columns are left untouched.
# NOTE(review): the means are computed over all rows before the train/test
# split below, so test rows influence the imputed values.
data.fillna(data.mean(numeric_only=True), inplace=True)

# See the data types and unique values in columns
print(data.dtypes)
for col in data.columns:
    print(f'{col}: {data[col].unique()}')

# Integer-typed columns are treated as categorical and one-hot encoded.
# The target is excluded from encoding; errors='ignore' avoids a KeyError
# when the target column is not integer-typed (e.g. string labels).
categorical_cols = data.select_dtypes(include=['integer']).columns
data = pd.get_dummies(
    data,
    columns=categorical_cols.drop(TARGET_COL, errors='ignore'),
    drop_first=True,
)

# Define features and target variable
X = data.drop(columns=TARGET_COL)  # Features
y = data[TARGET_COL]  # Target variable

# Split the data into training and testing sets (50/50, stratified on the
# target so both halves keep the same class proportions).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

# Initialize RF model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train model
rf_model.fit(X_train, y_train)

# Make predictions on test data
y_pred = rf_model.predict(X_test)

# Evaluate model
# NOTE(review): precision_score/recall_score are called with their default
# averaging; presumably the target is binary with positive label 1 --
# confirm against the dataset.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Output important features, most important first
importance = rf_model.feature_importances_
feature_importance = pd.Series(importance, index=X.columns).sort_values(ascending=False)

print('Feature Importances:')
print(feature_importance)