# **Voting Classifier**

## **Importing Libararies**

In [113]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

## **Data Overview**

In [114]:
df = pd.read_csv('Thyroid_Diff.csv')
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [115]:
df.columns

Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')

In [116]:
df.isnull().sum()

Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64

In [117]:
df.describe

<bound method NDFrame.describe of      Age Gender Smoking Hx Smoking Hx Radiothreapy          Thyroid Function  \
0     27      F      No         No              No                 Euthyroid   
1     34      F      No        Yes              No                 Euthyroid   
2     30      F      No         No              No                 Euthyroid   
3     62      F      No         No              No                 Euthyroid   
4     62      F      No         No              No                 Euthyroid   
..   ...    ...     ...        ...             ...                       ...   
378   72      M     Yes        Yes             Yes                 Euthyroid   
379   81      M     Yes         No             Yes                 Euthyroid   
380   72      M     Yes        Yes              No                 Euthyroid   
381   61      M     Yes        Yes             Yes  Clinical Hyperthyroidism   
382   67      M     Yes         No              No                 Euthyroid   

     

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   383 non-null    int64 
 1   Gender                383 non-null    object
 2   Smoking               383 non-null    object
 3   Hx Smoking            383 non-null    object
 4   Hx Radiothreapy       383 non-null    object
 5   Thyroid Function      383 non-null    object
 6   Physical Examination  383 non-null    object
 7   Adenopathy            383 non-null    object
 8   Pathology             383 non-null    object
 9   Focality              383 non-null    object
 10  Risk                  383 non-null    object
 11  T                     383 non-null    object
 12  N                     383 non-null    object
 13  M                     383 non-null    object
 14  Stage                 383 non-null    object
 15  Response              383 non-null    ob

In [118]:
df['Recurred'].value_counts()

Recurred
No     275
Yes    108
Name: count, dtype: int64

In [119]:
numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
categorical_columns = df.select_dtypes(exclude=['number']).columns.tolist()

In [120]:
numerical_columns

['Age']

In [121]:
categorical_columns

['Gender',
 'Smoking',
 'Hx Smoking',
 'Hx Radiothreapy',
 'Thyroid Function',
 'Physical Examination',
 'Adenopathy',
 'Pathology',
 'Focality',
 'Risk',
 'T',
 'N',
 'M',
 'Stage',
 'Response',
 'Recurred']

In [122]:
categorical_columns = np.delete(categorical_columns, len(categorical_columns)-1)

## **Train Test Split**

In [123]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Recurred']), df['Recurred'], 
                                                    test_size=0.2, 
                                                    random_state=42)

## **Preprocessing Data**

In [124]:
handle_numerical = Pipeline(steps=[
    ('impute_numerical', SimpleImputer(strategy='mean'))
])

In [125]:
handle_categorical = Pipeline(steps=[
    ('impute_numerical', SimpleImputer(strategy='most_frequent')),
    ('encode_categorical', OrdinalEncoder())
])

In [126]:
preprocessing = ColumnTransformer(transformers=[
    ('numerical', handle_numerical, numerical_columns),
    ('categorical', handle_categorical, categorical_columns)
], remainder='passthrough')

## **Separate Models**

In [127]:
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier()
model_svc = SVC(probability=True)

In [128]:
pipe_dt = make_pipeline(preprocessing, model_dt)
pipe_rf = make_pipeline(preprocessing, model_rf)
pipe_svc = make_pipeline(preprocessing, model_svc)

In [129]:
pipe_dt.fit(X_train, y_train)

In [130]:
pipe_rf.fit(X_train, y_train)

In [131]:
pipe_svc.fit(X_train, y_train)

In [132]:
y_pred_dt = pipe_dt.predict(X_test)
y_pred_rf = pipe_rf.predict(X_test)
y_pred_svc = pipe_svc.predict(X_test)

In [133]:
print(classification_report(y_pred_dt, y_test))

              precision    recall  f1-score   support

          No       0.93      0.96      0.95        56
         Yes       0.89      0.81      0.85        21

    accuracy                           0.92        77
   macro avg       0.91      0.89      0.90        77
weighted avg       0.92      0.92      0.92        77



In [134]:
print(classification_report(y_pred_rf, y_test))

              precision    recall  f1-score   support

          No       1.00      0.98      0.99        59
         Yes       0.95      1.00      0.97        18

    accuracy                           0.99        77
   macro avg       0.97      0.99      0.98        77
weighted avg       0.99      0.99      0.99        77



In [135]:
print(classification_report(y_pred_svc, y_test))

              precision    recall  f1-score   support

          No       1.00      0.82      0.90        71
         Yes       0.32      1.00      0.48         6

    accuracy                           0.83        77
   macro avg       0.66      0.91      0.69        77
weighted avg       0.95      0.83      0.87        77



## **Voting Classifier**

In [136]:
voting = VotingClassifier(estimators=[
    ('dt', pipe_dt),
    ('rf', pipe_rf),
    ('svc', pipe_svc)
], voting='soft')

In [137]:
voting.fit(X_train, y_train)

In [138]:
y_pred_voting = voting.predict(X_test)

In [139]:
print(classification_report(y_pred_voting, y_test))

              precision    recall  f1-score   support

          No       0.98      0.98      0.98        58
         Yes       0.95      0.95      0.95        19

    accuracy                           0.97        77
   macro avg       0.97      0.97      0.97        77
weighted avg       0.97      0.97      0.97        77

