In [72]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [73]:
data = pd.read_csv("medical_dataset.csv")

data.head()


Unnamed: 0,Input,Output,Source,Disease,Opinion,Image
0,Image: Metacarpal_Fracture_img_362.png | Opini...,Correct,Doctor,Metacarpal Fracture,Spiculated lung mass [Metacarpal Fracture],Metacarpal_Fracture_img_362.png
1,Image: Osteoporosis_img_074.png | Opinion: Gli...,Correct,Specialist,Osteoporosis,Glioblastoma necrosis [Osteoporosis],Osteoporosis_img_074.png
2,Image: Uterine_Fibroids_img_375.png | Opinion:...,Correct,Radiologist,Uterine Fibroids,Diverticulitis fat stranding [Uterine Fibroids],Uterine_Fibroids_img_375.png
3,Image: Disc_Herniation_img_156.png | Opinion: ...,Incorrect,Patient,Disc Herniation,Nothing wrong [Disc Herniation],Disc_Herniation_img_156.png
4,Image: Lung_Cancer_img_105.png | Opinion: Susp...,Correct,Radiologist,Lung Cancer,Suspicious thyroid nodule [Lung Cancer],Lung_Cancer_img_105.png


In [74]:
print(data.columns)
print(data.info())


Index(['Input', 'Output', 'Source', 'Disease', 'Opinion', 'Image'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Input    500 non-null    object
 1   Output   500 non-null    object
 2   Source   500 non-null    object
 3   Disease  500 non-null    object
 4   Opinion  500 non-null    object
 5   Image    500 non-null    object
dtypes: object(6)
memory usage: 23.6+ KB
None


In [75]:
X = data["Opinion"]   # feature
y = data["Output"]    # target


In [76]:
opinion_encoder = LabelEncoder()
X_encoded = opinion_encoder.fit_transform(X)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [77]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded.reshape(-1, 1),
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)


In [78]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)


In [79]:
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [80]:
y_pred = rf_model.predict(X_test)


In [81]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:\n")
print(classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_
))


Accuracy: 0.73

Classification Report:

              precision    recall  f1-score   support

     Correct       0.81      0.85      0.83        78
   Incorrect       0.37      0.32      0.34        22

    accuracy                           0.73       100
   macro avg       0.59      0.58      0.59       100
weighted avg       0.72      0.73      0.72       100



In [82]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Confusion Matrix:
 [[66 12]
 [15  7]]


In [84]:
feature_names = list(tfidf.get_feature_names_out()) + ["Source_Doctor"]
importances = rf_model.feature_importances_

importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

importance_df.head(15)


ValueError: All arrays must be of the same length