In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [15]:
#data_path = r"E:\2nd year\Winter project\Thyroid Performance Analysis\Data"

In [16]:
data_path = r"C:\Users\dwija\OneDrive\Thyroid-Performance-Analysis\Data"

In [17]:
ann_test = pd.read_csv(data_path+r'\anntest.CSV')
ann_train = pd.read_csv(data_path+r'\anntrain.CSV')
hyper_test = pd.read_csv(data_path+r'\hyperTest.CSV')
hyper_train = pd.read_csv(data_path+r'\hyperTrain.CSV')
hypo_test = pd.read_csv(data_path+r'\hypoTest.CSV')
hypo_train = pd.read_csv(data_path+r'\hypoTrain.CSV')
all_hypo_train = pd.read_csv(data_path+r'\hypothyroid.csv')
euthyroid = pd.read_csv(data_path+r'\sick-euthyroid.CSV')
thyroid0387 = pd.read_csv(data_path+r'\thyroid0387EDIT.CSV')

In [18]:
data = {
    'ann test': ann_test,
    'ann train': ann_train,
    'hyper test': hyper_test,
    'hyper train': hyper_train,
    'hypo test': hypo_test,
    'hypo train': hypo_train,
}


In [19]:
hypo_hyper_data ={    'hyper test': hyper_test,
    'hyper train': hyper_train,
    'hypo test': hypo_test,
    'hypo train': hypo_train,
}

In [20]:
ann_train = ann_train.drop_duplicates()
ann_test = ann_test.drop_duplicates()

In [21]:
for df_name, df in hypo_hyper_data.items():
        df.drop("ID", axis=1, inplace=True)
        df.drop("referral_source", axis=1, inplace=True)

In [22]:
hyper_test['Target'] = hyper_test['Target'].replace(["hyperthyroid", "T3_toxic", "goitre", "secondary_toxic"], "hyperthyroid")
hyper_train['Target'] = hyper_train['Target'].replace(["hyperthyroid", "T3_toxic", "goitre", "secondary_toxic"], "hyperthyroid")
hypo_test['Target'] = hypo_test['Target'].replace(["hypothyroid", "primary_hypothyroid", "compensated_hypothyroid", "secondary_hypothyroid"],"hypothyroid")
hypo_train['Target'] = hypo_train['Target'].replace(["hypothyroid", "primary_hypothyroid", "compensated_hypothyroid", "secondary_hypothyroid"],"hypothyroid")

In [23]:
Dataset = pd.concat([hyper_test,hyper_train,hypo_test,hypo_train], ignore_index = True)

In [24]:
Dataset['sex'] = Dataset['sex'].replace({'M': 0, 'F': 1})

In [25]:
Dataset.replace('?', np.nan, inplace=True)

In [26]:
Dataset = Dataset.drop(['TBG', 'TBG_measured','sex'], axis=1)

In [35]:
Dataset.dropna(axis = 0, thresh = 22, inplace = True)
Dataset.isna().sum()

age                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
I131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH_measured                 0
TSH                          0
T3_measured                  0
T3                           0
TT4_measured                 0
TT4                          0
T4U_measured                 0
T4U                          0
FTI_measured                 0
FTI                          0
Target                       0
dtype: int64

In [28]:
Dataset = Dataset.replace({"t":1,"f":0, "y":1, "n":0, "hypothyroid":1, "negative":0,"hyperthyroid":2, "F":1, "M":0})
display(Dataset.dtypes)

age                          object
on_thyroxine                  int64
query_on_thyroxine            int64
on_antithyroid_medication     int64
sick                          int64
pregnant                      int64
thyroid_surgery               int64
I131_treatment                int64
query_hypothyroid             int64
query_hyperthyroid            int64
lithium                       int64
goitre                        int64
tumor                         int64
hypopituitary                 int64
psych                         int64
TSH_measured                  int64
TSH                          object
T3_measured                   int64
T3                           object
TT4_measured                  int64
TT4                          object
T4U_measured                  int64
T4U                          object
FTI_measured                  int64
FTI                          object
Target                        int64
dtype: object

In [29]:
cols = Dataset.columns[Dataset.dtypes.eq('object')]
Dataset[cols] = Dataset[cols].apply(pd.to_numeric, errors='coerce')
display(Dataset.dtypes)

age                          float64
on_thyroxine                   int64
query_on_thyroxine             int64
on_antithyroid_medication      int64
sick                           int64
pregnant                       int64
thyroid_surgery                int64
I131_treatment                 int64
query_hypothyroid              int64
query_hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH_measured                   int64
TSH                          float64
T3_measured                    int64
T3                           float64
TT4_measured                   int64
TT4                          float64
T4U_measured                   int64
T4U                          float64
FTI_measured                   int64
FTI                          float64
Target                         int64
dtype: object

In [30]:
Dataset = Dataset.interpolate(method = 'spline', order = 3)

In [36]:
corr_values = abs(Dataset[Dataset.columns[0:]].corr()['Target'][:])
corr_values = corr_values.drop('Target')
corr_values = corr_values[corr_values > 0.04]
display(corr_values)

on_thyroxine          0.064826
query_hyperthyroid    0.093372
tumor                 0.089330
TSH                   0.159848
T3                    0.122522
TT4                   0.057923
FTI                   0.073765
Name: Target, dtype: float64

In [38]:
def holdout(dataframe):
    # Convert Index object to a list of column names
    corr_column_names = list(corr_values.index)

    # Select relevant columns using the list of names
    x = dataframe[corr_column_names]
    y = dataframe['Target']

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)
    return X_train, X_test, y_train, y_test

# Pass the DataFrame object itself, not the dictionary
X_train, X_test, y_train, y_test = holdout(Dataset)  # Use Dataset, not data


In [39]:
classifiers = {
    "Support Vector Machine (SVM)": SVC(),  # Add SVM
    "K-Nearest Neighbor (KNN)": KNeighborsClassifier(4),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=1),
    "Gradient Boosting": GradientBoostingClassifier()  # Add Gradient Boosting
}


In [41]:
def classification(classifiers, X_train, X_test, y_train, y_test):
    # Create an empty list to store results
    results = []

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        pr, rc, fs, sup = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')

        # Create a dictionary of results for each classifier
        result_dict = {
            "Classifier": name,
            "Accuracy": round(metrics.accuracy_score(y_test, y_pred), 4),
            "Precision": round(pr, 4),
            "Recall": round(rc, 4),
            "FScore": round(fs, 4)
        }

        # Append the dictionary to the results list
        results.append(result_dict)

        print("Confusion matrix for: ", name)
        display(confusion_matrix(y_test, y_pred))

    # Create the DataFrame from the list of results
    res = pd.DataFrame(results)

    # Set index and sort
    res.set_index("FScore", inplace=True)
    res.sort_values(by="FScore", ascending=False, inplace=True)

    return res


display(classification(classifiers, X_train, X_test, y_train, y_test))

Confusion matrix for:  Support Vector Machine (SVM)


array([[2034,    3,    2],
       [  89,    0,    0],
       [  28,    0,    0]], dtype=int64)

Confusion matrix for:  K-Nearest Neighbor (KNN)


array([[2022,   15,    2],
       [  85,    4,    0],
       [  28,    0,    0]], dtype=int64)

Confusion matrix for:  Random Forest


array([[1928,   83,   28],
       [  71,   18,    0],
       [  23,    0,    5]], dtype=int64)

Confusion matrix for:  Gradient Boosting


array([[1961,   58,   20],
       [  74,   15,    0],
       [  27,    0,    1]], dtype=int64)

Unnamed: 0_level_0,Classifier,Accuracy,Precision,Recall
FScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.4343,Random Forest,0.9049,0.4277,0.4421
0.3941,Gradient Boosting,0.917,0.4014,0.3887
0.3476,K-Nearest Neighbor (KNN),0.9397,0.3859,0.3455
0.3236,Support Vector Machine (SVM),0.9434,0.3152,0.3325
