# Python Class for Function Management

In [76]:
import math
import statistics as stt
from itertools import combinations
from os import system, getcwd
from os.path import join
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Debug switches, all disabled by default:
#   print_flag        - dump the DataFrame while it is loaded / renamed / encoded
#   print_result_flag - announce a perfect prediction in evaluate_the_result
#   plot_flag         - not referenced by the code visible in this cell
print_flag = print_result_flag = plot_flag = False


class algorithmOperation():
# =========================================================================
# core function

    def get_data(self):
        path = join(getcwd().rstrip('src'), 'data/wdbc.data').replace('\\', '/')
        self.data = pd.read_csv(path, header=None)
        if print_flag:
            print(self.data)
            print(self.data.shape)
            print(self.data.columns)
            print(self.data.head())

    def set_column_names(self, drop_col):
        # drop_col(list): columns to drop
        self.column_names = ['id', 'malignant',
                        'nucleus_mean', 'nucleus_se', 'nucleus_worst',
                        'texture_mean', 'texture_se', 'texture_worst',
                        'perimeter_mean', 'perimeter_se', 'perimeter_worst',
                        'area_mean', 'area_se', 'area_worst',
                        'smoothness_mean', 'smoothness_se', 'smoothness_worst',
                        'compactness_mean', 'compactness_se', 'compactness_worst',
                        'concavity_mean', 'concavity_se', 'concavity_worst',
                        'concave_pts_mean', 'concave_pts_se', 'concave_pts_worst',
                        'symmetry_mean', 'symmetry_se', 'symmetry_worst',
                        'fractal_dim_mean', 'fractal_dim_se', 'fractal_dim_worst'
                        ]
        self.data.columns = self.column_names
        if print_flag:
            print(self.data.shape)
            print(self.data.columns)
            print(self.data.head())
            self.data.tail(10)
        if len(drop_col) == 0:
#             print('No column is dropped.')
            pass
        else:
            print('Dropped column: ' + str(drop_col) + '                     ', end='\r')
            self.data.drop(columns=drop_col, inplace=True)

    def make_data_all_numerical(self):
        self.data['malignant'] = self.data['malignant'].map(
            lambda x: 0 if x == 'B' else 1)
        if print_flag:
            self.data.tail(10)

    def split_data_into_train_test(self, sds, r):
        """Optionally thin out malignant rows, optionally scale, then split.

        Args:
            sds: truthy to standardise the features with ``StandardScaler``.
            r: 0 disables the dataset adjustment. Values up to 4 are rejected
                with a message and the data is used as is. Larger values
                remove the malignant rows found among the first
                ``round(benign_count / r)`` rows.

        Sets ``self.X``, ``self.y`` and the ``X_train`` / ``X_test`` /
        ``y_train`` / ``y_test`` attributes (25 % test, fixed random state).
        """
        # dataset adjustment
        if r == 0:
            pass
        elif r <= 4:
            print('r too small. Please choose greater ratio (r>4) for ratio to take effect. Run without ratio.')
        else:
            labels = self.data['malignant']
            malignant_cnt = labels.sum()
            benign_cnt = len(labels) - malignant_cnt
            r_m_cnt = int(round(benign_cnt / r))
            if r_m_cnt > malignant_cnt and r_m_cnt < 40:
                print('Ratio too large. Please choose smaller ratio (r>4) for ratio to take effect. Run without ratio.')
            else:
                # Drop the rows directly instead of overwriting their label
                # with NaN and calling dropna(): that upcast the integer
                # labels to float and would also have removed any row that
                # merely had a missing feature value.
                head = labels.iloc[:r_m_cnt]
                doomed = head.index[(head == 1).to_numpy()]
                self.data.drop(index=doomed, inplace=True)

        self.X = self.data.drop(columns=['malignant']).values

        # for scaled data
        # NOTE(review): the scaler is fitted on all rows before the split, so
        # test rows influence the scaling statistics -- confirm this is wanted.
        if sds:
            ss = StandardScaler()
            self.X = ss.fit_transform(self.X)

        # data splitting
        self.y = self.data['malignant'].values
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.25, random_state=2018)

    def build_mldl_model(self, n):
        if n == 1:
            self.model = DecisionTreeClassifier()
        elif n == 2:
            self.model = RandomForestClassifier()
        elif n == 3:
            self.model = KNeighborsClassifier()
        elif n == 4:
            self.model = SVC()
        else:
            print('input error')

    def training_mldl_model(self):
        self.model.fit(self.X_train, self.y_train)

    def test_mldl_model(self):
        self.y_pred = self.model.predict(self.X_test)

    def evaluate_the_result(self, n):
        if self.y_pred.all == self.y_test.all:
            print('Prediction successful, all values are same') if print_result_flag else None
        else:
            self.y_diff = abs(self.y_pred - self.y_test)
            self.y_diff_n = (len(self.y_test)-sum(self.y_diff))/len(self.y_test)*100

# =========================================================================
# core extension function            
            
    def prepare_data(self, drop_col, sds, r):
        self.get_data()
        self.set_column_names(drop_col)
        self.make_data_all_numerical()
        self.split_data_into_train_test(sds, r)

    def training_and_testing(self, n):
        self.build_mldl_model(n)
        self.training_mldl_model()
        self.test_mldl_model()
        self.evaluate_the_result(n)
        
    def single_run(self, n, drop_col, sds, r):
        # n: choosing algorithm
        # sds: turn on or off data scalling
        # r: ratio of the samples >4 (default of this dataset: ~1.68, 0 for disable)
        self.prepare_data(drop_col, sds, r)
        self.training_and_testing(n)
        return self.y_diff_n
    
# =========================================================================
# question function
    
    def q2(self, sds):
        # info
        alg = ['DT', 'RF', 'KNN', 'SVC']
        col = ['id',
                'nucleus_mean', 'nucleus_se', 'nucleus_worst',
                'texture_mean', 'texture_se', 'texture_worst',
                'perimeter_mean', 'perimeter_se', 'perimeter_worst',
                'area_mean', 'area_se', 'area_worst',
                'smoothness_mean', 'smoothness_se', 'smoothness_worst',
                'compactness_mean', 'compactness_se', 'compactness_worst',
                'concavity_mean', 'concavity_se', 'concavity_worst',
                'concave_pts_mean', 'concave_pts_se', 'concave_pts_worst',
                'symmetry_mean', 'symmetry_se', 'symmetry_worst',
                'fractal_dim_mean', 'fractal_dim_se', 'fractal_dim_worst'
                ]
        
        # mode 0: default run
        acc_md0 = []
        for i in range(1, 5):
            acc_md0.append(self.single_run(i, [], sds, 0))
            
        # mode 1: auto drop 1 element at a time
        acc_md1 = []
        acc_buf = []
        for i in range(1, 5):
            for x in col:
                acc_buf.append(self.single_run(i, [x], sds, 0))
            acc_md1.append(max(acc_buf))
            
        # mode 2: auto drop 2 element at a time
        acc_md2 = []
        acc_buf = []
        for i in range(1, 5):
            for j in range(len(col)):
                for k in range(len(col)):
                    if j != k:
                        acc_buf.append(self.single_run(i, [col[j], col[k]], sds, 0))
            acc_md2.append(max(acc_buf))
            
        # show result
        result = list(zip(alg, acc_md0, acc_md1, np.subtract(acc_md1, acc_md0), acc_md2, np.subtract(acc_md2, acc_md0)))
        result = pd.DataFrame(data=result, columns=['algorithm', 'default', 'drop 1', 'drop 1 diff', 'drop 2', 'drop 2 diff'])
        print(result)
        

## Q2
Investigate whether removing some features can improve the results.

In [77]:
# Run the Q2 feature-removal study with data scaling switched off.
ao = algorithmOperation()
ao.q2(sds=False)

  algorithm    default     drop 1  drop 1 diff     drop 2  drop 2 diff         
0        DT  90.909091  94.405594     3.496503  95.804196     4.895105
1        RF  93.006993  97.202797     4.195804  95.804196     2.797203
2       KNN  74.125874  97.202797    23.076923  95.804196    21.678322
3       SVC  62.237762  97.202797    34.965035  95.804196    33.566434
