In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Load the dataset. Every column except 'Recurred' is used as a feature and
# 'Recurred' itself is the label to predict.
data = pd.read_csv('Thyroid_Diff.csv')
X = data.drop(columns=['Recurred'])
y = data['Recurred']

# Preview the first rows. A notebook only renders the value of a cell's final
# expression, so the preview has to come last; placed mid-cell its result was
# computed and then discarded.
data.head()

In [None]:
# One-hot encode the object-dtype feature columns; only the columns listed in
# `columns=` are expanded into indicator columns.
categorical_cols = X.select_dtypes(include='object').columns
X_encoded = pd.get_dummies(data=X, columns=categorical_cols)

# Turn the target labels into integer codes. The fitted encoder is kept so
# predicted codes can be mapped back to the original labels later on.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the partition
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate classifiers to compare on the same train/test split.
logreg = LogisticRegression(max_iter=1000)  # allow up to 1000 solver iterations
svc = SVC()
# A random forest draws random bootstrap samples and random feature subsets,
# so without a fixed seed its accuracy (and therefore which model is reported
# as "best") can change between runs. Reuse the seed used for the data split.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Fit every candidate on the scaled training split and record its accuracy on
# the scaled test split, keyed by the model's display name.
models = {
    'Logistic Regression': logreg,
    'SVM': svc,
    'Random Forest': rf,
}
results = {}

for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained onto it.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    results[name] = accuracy_score(y_test, y_pred)

print("Model Accuracies:", results)

Model Accuracies: {'Logistic Regression': 0.961038961038961, 'SVM': 0.987012987012987, 'Random Forest': 0.987012987012987}


In [None]:
# Select the name with the highest recorded accuracy. max() keeps the first
# maximal entry it meets, so on a tie the model listed first in `results` wins.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]
print(f"Best-performing model: {best_model_name} with accuracy {results[best_model_name]}")

def get_user_input():
    """Interactively collect one patient's feature values from standard input.

    Age is parsed as a float. Every other answer is kept as text with leading
    and trailing whitespace removed: the answers are later one-hot encoded and
    matched against the training columns by exact name, so an accidental space
    would otherwise produce a column that matches nothing.

    Returns:
        dict: feature name -> entered value, in the order the questions are
        asked ('Age' first, 'Response' last).

    Raises:
        ValueError: if the Age answer cannot be parsed as a number.
    """
    # (feature name, prompt) pairs for the text-valued features, in asking order.
    text_prompts = (
        ('Gender', "Enter Gender (M/F): "),
        ('Smoking', "Smoking (Yes/No): "),
        ('Hx Smoking', "Hx Smoking (Yes/No): "),
        ('Hx Radiotherapy', "Hx Radiotherapy (Yes/No): "),
        ('Thyroid Function', "Enter Thyroid Function (e.g., Euthyroid, Hyperthyroid): "),
        ('Physical Examination', "Enter Physical Examination findings: "),
        ('Adenopathy', "Adenopathy (Yes/No): "),
        ('Pathology', "Enter Pathology (e.g., Micropapillary): "),
        ('Focality', "Enter Focality (e.g., Uni-Focal, Multi-Focal): "),
        ('Risk', "Enter Risk (Low/High): "),
        ('T', "Enter T staging (e.g., T1a): "),
        ('N', "Enter N staging (e.g., N0): "),
        ('M', "Enter M staging (e.g., M0): "),
        ('Stage', "Enter Stage (e.g., I, II): "),
        ('Response', "Enter Response (e.g., Excellent, Indeterminate): "),
    )

    # float() already tolerates surrounding whitespace in the Age answer.
    input_data = {'Age': float(input("Enter Age: "))}

    for feature, prompt in text_prompts:
        input_data[feature] = input(prompt).strip()

    return input_data

def prepare_input_for_prediction(input_data, feature_columns=None, fitted_scaler=None):
    """Encode and scale one record so it matches the layout used for training.

    The record is one-hot encoded, aligned to the training feature columns
    (indicator columns absent from the record are filled with 0, and columns
    appear in the training order), and then passed through the scaler.

    Args:
        input_data: dict mapping feature name to the entered value.
        feature_columns: expected encoded column names, in order. Defaults to
            the notebook-level ``X_encoded.columns``.
        fitted_scaler: object with a ``transform`` method. Defaults to the
            notebook-level ``scaler``.

    Returns:
        Whatever ``fitted_scaler.transform`` returns for the single aligned row.

    Warns:
        UserWarning: if the record yields an encoded column that is not among
            ``feature_columns`` (for example a misspelled category). Such a
            column is dropped, which leaves every indicator for that feature
            at 0, so the prediction would otherwise be silently distorted.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in `warnings`.
    import warnings

    # Resolve the defaults at call time so the function keeps working with the
    # notebook globals when called with a single argument.
    if feature_columns is None:
        feature_columns = X_encoded.columns
    if fitted_scaler is None:
        fitted_scaler = scaler

    input_df = pd.DataFrame([input_data])
    input_encoded = pd.get_dummies(input_df)

    expected = set(feature_columns)
    unknown_cols = [col for col in input_encoded.columns if col not in expected]
    if unknown_cols:
        warnings.warn(
            "Ignoring input values not seen during training: "
            + ", ".join(str(col) for col in unknown_cols),
            UserWarning,
            stacklevel=2,
        )

    # A single reindex adds the missing columns (as 0), drops the unknown ones
    # and enforces the training column order.
    input_encoded = input_encoded.reindex(columns=feature_columns, fill_value=0)

    return fitted_scaler.transform(input_encoded)

# Ask for one patient's features, then encode and scale them the same way as
# the training data.
new_input = get_user_input()
prepared_input = prepare_input_for_prediction(new_input)

# Classify with the selected model and map the encoded class back to the
# original label text before printing it.
predicted_outcome = best_model.predict(prepared_input)
predicted_label = label_encoder.inverse_transform(predicted_outcome)
print(f"Predicted outcome for new input: {predicted_label[0]}")

Best-performing model: SVM with accuracy 0.987012987012987


Predicted outcome for new input: No
