In [9]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [11]:
# Load the dataset; the relative path resolves against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="thyroid_dataset_women.csv")

In [13]:
# Swap the abbreviated column headers for descriptive ones (modifies df in place).
# The key 'Hx Radiothreapy' is kept exactly as written; presumably it mirrors the
# spelling of the CSV header -- confirm before "correcting" it.
df.rename(
    mapper={
        'Hx Smoking': 'Smoking History',
        'Hx Radiothreapy': 'Radiotherapy History',
        'T': 'Tumor',
        'N': 'Lymph Nodes',
        'M': 'Cancer Metastasis',
        'Response': 'Treatment Response',
    },
    axis='columns',
    inplace=True,
)

In [15]:
# Create one independent LabelEncoder per categorical column (including the
# 'Recurred' target). Each name receives its own instance, so every encoder
# keeps its own fitted classes.
(
    label_Gender,
    label_Smoking,
    label_Smoking_History,
    label_Radiotherapy_History,
    label_Thyroid_Function,
    label_Physical_Examination,
    label_Adenopathy,
    label_Pathology,
    label_Focality,
    label_Cancer_Metastasis,
    label_Lymph_Nodes,
    label_Stage,
    label_Tumor,
    label_Treatment_Response,
    label_Recurred,
) = (LabelEncoder() for _ in range(15))

In [17]:
# Fit each column's dedicated encoder and overwrite the column with its integer codes.
# NOTE(review): the columns are replaced in place, so running this cell a second
# time would re-fit every encoder on the already-encoded integers. Run it once
# per fresh load of df.
for column_name, column_encoder in (
    ('Gender', label_Gender),
    ('Smoking', label_Smoking),
    ('Smoking History', label_Smoking_History),
    ('Radiotherapy History', label_Radiotherapy_History),
    ('Thyroid Function', label_Thyroid_Function),
    ('Physical Examination', label_Physical_Examination),
    ('Adenopathy', label_Adenopathy),
    ('Pathology', label_Pathology),
    ('Focality', label_Focality),
    ('Cancer Metastasis', label_Cancer_Metastasis),
    ('Lymph Nodes', label_Lymph_Nodes),
    ('Tumor', label_Tumor),
    ('Stage', label_Stage),
    ('Treatment Response', label_Treatment_Response),
    ('Recurred', label_Recurred),
):
    df[column_name] = column_encoder.fit_transform(df[column_name])

In [19]:
# 'Risk' has a natural order, so it is encoded with an explicit ranking
# (Low -> 0, Intermediate -> 1, High -> 2) instead of a LabelEncoder.
categories = [['Low', 'Intermediate', 'High']]
oe = OrdinalEncoder(categories=categories)
oe.fit(df[['Risk']])
df['Risk'] = oe.transform(df[['Risk']])

In [21]:
# Separate the label column from the predictor columns.
y = df['Recurred']               # target
x = df.drop(columns='Recurred')  # features: every remaining column

In [23]:
# Fit a StandardScaler on the feature frame and produce the scaled feature matrix.
# NOTE(review): the scaler is fitted on every row of x, including rows that the
# later train/test split assigns to the test set.
pre_process = StandardScaler()
pre_process.fit(x)
x_transform = pre_process.transform(x)

In [25]:
# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_transform,
    y,
    test_size=0.10,
    random_state=101,
)

In [27]:
# Initialize all models.
# The ensemble estimators draw random bootstrap samples / feature subsets, so they
# are seeded with the same value as the train/test split (101). Without a seed the
# fitted models -- and any model file saved from them -- differ on every run.
rfc = RandomForestClassifier(random_state=101)                     # Random Forest Classifier
svc = SVC()                                                        # Support Vector Classifier
lr = LogisticRegression()                                          # Logistic Regression (linear classifier)
bagging_rfc = BaggingClassifier(estimator=rfc, random_state=101)   # Bagging with Random Forest as base estimator
xgb = XGBClassifier(random_state=101)                              # Boosting with XGBoost

In [29]:
# Fit every classifier on the training split.
for estimator in (rfc, svc, lr, bagging_rfc):
    estimator.fit(x_train, y_train)
# xgb is fitted last as a bare trailing expression so the cell still echoes the fitted model.
xgb.fit(x_train, y_train)

In [None]:
def _select_model(model_choice):
    """Return the module-level classifier that ``model_choice`` names.

    Raises:
        ValueError: if ``model_choice`` is not one of the supported keys.
    """
    if model_choice == 'random_forest':
        return rfc
    if model_choice == 'svm':
        return svc
    if model_choice == 'logistic_regression':
        return lr
    if model_choice == 'bagging_rfc':
        return bagging_rfc
    if model_choice == 'xgboost':
        return xgb
    raise ValueError(
        f"Unknown model_choice {model_choice!r}; expected one of: "
        "'random_forest', 'svm', 'logistic_regression', 'bagging_rfc', 'xgboost'."
    )


def _match_known_class(answer, known_classes, field):
    """Return the entry of ``known_classes`` that the user's ``answer`` refers to.

    An exact match wins; otherwise the comparison ignores letter case, so an
    answer only has to agree with a fitted class up to capitalisation.

    Raises:
        ValueError: if no class (or more than one) matches; the message lists
            every accepted value for ``field``.
    """
    for known in known_classes:
        if answer == known:
            return known
    folded = answer.casefold()
    candidates = [known for known in known_classes if str(known).casefold() == folded]
    if len(candidates) == 1:
        return candidates[0]
    options = ', '.join(str(known) for known in known_classes)
    raise ValueError(f"Unrecognised value {answer!r} for {field}; expected one of: {options}")


def predict_thyroid_recurrence(model_choice, pre_process):
    """Prompt for one patient's details and print the predicted recurrence outcome.

    The answers are encoded with the module-level encoders (``label_*`` and
    ``oe``), scaled with ``pre_process`` and passed to the chosen classifier.

    Args:
        model_choice: one of 'random_forest', 'svm', 'logistic_regression',
            'bagging_rfc' or 'xgboost'.
        pre_process: fitted scaler exposing ``transform``; applied to the
            encoded single-row frame.

    Returns:
        None. The outcome is printed.

    Raises:
        ValueError: if ``model_choice`` is unknown (raised before any prompt),
            if the age is not an integer, or if an answer does not correspond
            to a class known to the relevant encoder.
    """
    # Resolve the model first so that a bad model_choice fails immediately
    # instead of after the user has answered every prompt.
    model = _select_model(model_choice)

    # Diagnostic: show the classes the Focality encoder was trained on
    print("Classes for Focality:", label_Focality.classes_)

    # Collect user inputs with a first pass of case normalisation
    age = int(input("Enter age: "))
    gender = input("Enter gender (M/F): ").strip().upper()
    smoking = input("Do you smoke? (Yes/No): ").strip().capitalize()
    hx_smoking = input("Do you have a history of smoking? (Yes/No): ").strip().capitalize()
    hx_radiotherapy = input("Have you had radiotherapy? (Yes/No): ").strip().capitalize()
    thyroid_function = input("Thyroid function (Euthyroid/Hypothyroid/Hyperthyroid): ").strip().capitalize()
    physical_exam = input("Physical Examination (Single/Multinodular): ").strip().capitalize()
    adenopathy = input("Adenopathy present? (Yes/No): ").strip().capitalize()
    pathology = input("Pathology (Micropapillary/Other): ").strip().capitalize()

    # Title case turns e.g. 'uni-focal' into 'Uni-Focal'
    focality = input("Focality (Uni-Focal/Multi-Focal): ").strip().title()

    risk = input("Risk (Low/Intermediate/High): ").strip().capitalize()
    t = input("Tumor (T1a/T1b/Other): ").strip()
    n = input("Lymph nodes (N0/N1/Other): ").strip()
    m = input("Metastasis (M0/M1/Other): ").strip()
    stage = input("Stage (I/II/III/IV): ").strip().upper()
    response = input("Treatment Response (Excellent/Indeterminate): ").strip().capitalize()

    # Diagnostic: show the focality answer before encoding
    print(f"Focality input after cleaning: '{focality}'")

    # Single-row frame whose column names match the renamed training features
    input_data = pd.DataFrame({
        'Age': [age],
        'Gender': [gender],
        'Smoking': [smoking],
        'Smoking History': [hx_smoking],
        'Radiotherapy History': [hx_radiotherapy],
        'Thyroid Function': [thyroid_function],
        'Physical Examination': [physical_exam],
        'Adenopathy': [adenopathy],
        'Pathology': [pathology],
        'Focality': [focality],
        'Risk': [risk],
        'Tumor': [t],
        'Lymph Nodes': [n],
        'Cancer Metastasis': [m],
        'Stage': [stage],
        'Treatment Response': [response],
    })

    # Each categorical column is encoded with the encoder fitted on that column.
    label_encoders = {
        'Gender': label_Gender,
        'Smoking': label_Smoking,
        'Smoking History': label_Smoking_History,
        'Radiotherapy History': label_Radiotherapy_History,
        'Thyroid Function': label_Thyroid_Function,
        'Physical Examination': label_Physical_Examination,
        'Adenopathy': label_Adenopathy,
        'Pathology': label_Pathology,
        'Focality': label_Focality,
        'Tumor': label_Tumor,
        'Lymph Nodes': label_Lymph_Nodes,
        'Cancer Metastasis': label_Cancer_Metastasis,
        'Stage': label_Stage,
        'Treatment Response': label_Treatment_Response,
    }
    for column, encoder in label_encoders.items():
        # Snap the answer onto the encoder's own spelling so a capitalisation
        # difference does not surface as an opaque "unseen labels" error.
        answer = _match_known_class(input_data.at[0, column], encoder.classes_, column)
        input_data[column] = encoder.transform([answer])

    # 'Risk' uses the ordinal encoder, which expects a 2-D frame and returns a 2-D array.
    risk_level = _match_known_class(input_data.at[0, 'Risk'], oe.categories_[0], 'Risk')
    input_data['Risk'] = [risk_level]
    input_data['Risk'] = oe.transform(input_data[['Risk']]).ravel()

    # Standardize the input data
    input_data_scaled = pre_process.transform(input_data)

    prediction = model.predict(input_data_scaled)

    # The models are trained on the encoded 'Recurred' column, so the output is a
    # recurrence prediction. Assumes the target encoder assigned code 1 to the
    # "recurred" class -- TODO confirm against label_Recurred.classes_.
    result = "Recurrence predicted." if prediction[0] == 1 else "No recurrence predicted."
    print(f"Prediction: {result}")


# Run one interactive prediction with the random forest and the fitted scaler.
predict_thyroid_recurrence(model_choice='random_forest', pre_process=pre_process)

Classes for Focality: ['Multi-Focal' 'Uni-Focal']


Enter age:  21
Enter gender (M/F):  F
Do you smoke? (Yes/No):  Yes
Do you have a history of smoking? (Yes/No):  Yes
Have you had radiotherapy? (Yes/No):  Yes
Thyroid function (Euthyroid/Hypothyroid/Hyperthyroid):  Euthyroid
Physical Examination (Single/Multinodular):  Single nodular goiter-left
Adenopathy present? (Yes/No):  No
Pathology (Micropapillary/Other):  Micropapillary
Focality (Uni-Focal/Multi-Focal):  Uni-Focal
Risk (Low/Intermediate/High):  Low
Tumor (T1a/T1b/Other):  T1a
Lymph nodes (N0/N1/Other):  N0
Metastasis (M0/M1/Other):  M0


In [35]:
import joblib
from sklearn.ensemble import RandomForestClassifier  

# Serialize the random forest to a file in the current working directory.
joblib.dump(value=rfc, filename='thyroid_model_core.pkl')

['thyroid_model_core.pkl']

In [37]:
import os
# Report the kernel's current working directory, which is where the relative
# paths used in this notebook (e.g. 'thyroid_model_core.pkl') resolve.
os.getcwd()

'C:\\Users\\Anusha'

In [30]:
import os

import joblib
from sklearn.preprocessing import StandardScaler

# Persist the fitted scaler (`pre_process`) so it can be reloaded outside this notebook.
# The target directory defaults to the original absolute location; set the
# THYROID_BACKEND_DIR environment variable to write elsewhere on a machine that
# does not have this drive/layout.
backend_dir = os.environ.get('THYROID_BACKEND_DIR', 'D:/thyroidweb/thyroid/backend')
scaler_path = f'{backend_dir}/thyroid_scaler.pkl'
joblib.dump(pre_process, scaler_path)


['D:/thyroidweb/thyroid/backend/thyroid_scaler.pkl']