In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
df = pd.read_csv("machine learning data.csv")
df.head()


Unnamed: 0,ID,Opinion,Source,DiseaseLabel,Label
0,1,"Fragility fractures and low bone mass, typical...",Radiologist,Osteoporosis,Correct
1,2,"Bones appear dense, trauma more likely than Os...",Doctor,Osteoporosis,Incorrect
2,3,"Definite Submandibular Stone, moderate grade, ...",Radiologist,Submandibular Stone,Correct
3,4,"Diffuse soft tissue edema, not specific for Su...",Patient,Submandibular Stone,Incorrect
4,5,"Disc protrusion compressing nerve root, consis...",Radiologist,Disc Herniation,Correct


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            500 non-null    int64 
 1   Opinion       500 non-null    object
 2   Source        500 non-null    object
 3   DiseaseLabel  500 non-null    object
 4   Label         500 non-null    object
dtypes: int64(1), object(4)
memory usage: 19.7+ KB


In [5]:
X_text = df["Opinion"]          # Text feature
X_cat1 = df["Source"]           # Categorical feature
X_cat2 = df["DiseaseLabel"]     # Categorical feature

y = df["Label"]                 # Target variable


In [6]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Correct -> 0, Incorrect -> 1 (or vice versa)
label_encoder.classes_


array(['Correct', 'Incorrect'], dtype=object)

In [7]:
source_encoder = LabelEncoder()
disease_encoder = LabelEncoder()

X_source_encoded = source_encoder.fit_transform(X_cat1)
X_disease_encoded = disease_encoder.fit_transform(X_cat2)


In [8]:
tfidf = TfidfVectorizer(
    max_features=3000,
    stop_words='english'
)

X_text_tfidf = tfidf.fit_transform(X_text)


In [9]:
from scipy.sparse import hstack

X_final = hstack([
    X_text_tfidf,
    X_source_encoded.reshape(-1, 1),
    X_disease_encoded.reshape(-1, 1)
])


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)


In [11]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)


In [12]:
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
y_pred = rf_model.predict(X_test)


In [14]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.78


In [15]:
print(classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_
))


              precision    recall  f1-score   support

     Correct       0.77      0.77      0.77        48
   Incorrect       0.79      0.79      0.79        52

    accuracy                           0.78       100
   macro avg       0.78      0.78      0.78       100
weighted avg       0.78      0.78      0.78       100



In [16]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix


array([[37, 11],
       [11, 41]])