# Python Class for Function Management

In [3]:
from os import system, getcwd
from os.path import join
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statistics as stt
import math

print_flag = False
print_result_flag = False
plot_flag = False


class algorithmOperation():
# =========================================================================
# core function

    def get_data(self):
        path = join(getcwd().rstrip('src'), 'data/wdbc.data').replace('\\', '/')
        self.data = pd.read_csv(path, header=None)
        if print_flag:
            print(self.data)
            print(self.data.shape)
            print(self.data.columns)
            print(self.data.head())

    def set_column_names(self):
        column_names = ['id', 'malignant',
                        'nucleus_mean', 'nucleus_se', 'nucleus_worst',
                        'texture_mean', 'texture_se', 'texture_worst',
                        'perimeter_mean', 'perimeter_se', 'perimeter_worst',
                        'area_mean', 'area_se', 'area_worst',
                        'smoothness_mean', 'smoothness_se', 'smoothness_worst',
                        'compactness_mean', 'compactness_se', 'compactness_worst',
                        'concavity_mean', 'concavity_se', 'concavity_worst',
                        'concave_pts_mean', 'concave_pts_se', 'concave_pts_worst',
                        'symmetry_mean', 'symmetry_se', 'symmetry_worst',
                        'fractal_dim_mean', 'fractal_dim_se', 'fractal_dim_worst'
                        ]

        self.data.columns = column_names
        if print_flag:
            print(self.data.shape)
            print(self.data.columns)
            print(self.data.head())
            self.data.tail(10)

    def make_data_all_numerical(self):
        self.data['malignant'] = self.data['malignant'].map(
            lambda x: 0 if x == 'B' else 1)
        if print_flag:
            self.data.tail(10)

    def split_data_into_train_test(self, sds, r):
        # dataset adjustment
        if r == 0:
            pass
        elif r <= 4:
            print('r too small. Please choose greater ratio (r>4) for ratio to take effect. Run without ratio.')
        else:
            malignant_cnt = sum(self.data['malignant'].values)
            malignant_tol = len(self.data['malignant'].values)
            n_malignant_cnt = malignant_tol - malignant_cnt
            ratio = n_malignant_cnt/malignant_cnt
            r_m_cnt = round(n_malignant_cnt/r)
            if r_m_cnt > malignant_cnt and r_m_cnt < 40:
                print('Ratio too large. Please choose smaller ratio (r>4) for ratio to take effect. Run without ratio.')
            else:
                for i in range(r_m_cnt):
                    if self.data.loc[i, 'malignant'] == 1:
                        self.data.loc[i, 'malignant'] = np.nan
                self.data.dropna(inplace=True)
            
        self.X = self.data.drop(columns=['malignant']).values
        
        # for scaled data
        if sds:
            ss = StandardScaler()
            self.X = ss.fit_transform(self.X)
        
        # data splitting
        self.y = self.data['malignant'].values
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.25, random_state=2018)

    def build_mldl_model(self, n):
        if n == '1.0':
            self.model = DecisionTreeClassifier()
        if n == '2.0':
            self.model = RandomForestClassifier()
        if n == '3.0':
            self.model = KNeighborsClassifier()
        if n == '4.0':
            self.model = SVC()

    def training_mldl_model(self):
        self.model.fit(self.X_train, self.y_train)

    def test_mldl_model(self):
        self.y_pred = self.model.predict(self.X_test)

    def evaluate_the_result(self, n):
        if self.y_pred.all == self.y_test.all:
            print('Prediction successful, all values are same') if print_result_flag else None
        else:
            self.y_diff = abs(self.y_pred - self.y_test)
            self.y_diff_n = (len(self.y_test)-sum(self.y_diff))/len(self.y_test)*100

# =========================================================================
# core extension function  
            
    def prepare_data(self, sds, r):
        self.get_data()
        self.set_column_names()
        self.make_data_all_numerical()
        self.split_data_into_train_test(sds, r)

    def training_and_testing(self, n):
        self.build_mldl_model(n)
        self.training_mldl_model()
        self.test_mldl_model()
        self.evaluate_the_result(n)
        
    def single_run(self, n, sds, r):
        # n: choosing algorithm
        # sds: turn on or off data scalling
        # r: ratio of the samples >4 (default of this dataset: ~1.68, 0 for disable)
        self.prepare_data(sds, r)
        if n == 1:
            self.training_and_testing('1.0')
        elif n == 2:
            self.training_and_testing('2.0')
        elif n == 3:
            self.training_and_testing('3.0')
        elif n == 4:
            self.training_and_testing('4.0')
        else:
            print('input errer')
        return self.y_diff_n
    
# =========================================================================
# question function
    
    def q1(self, sds, r_min, r_max, r_interval):
        # sds(bool): turn on or off data scalling
        # r_max(float): the maximum of ratio
        # r_interval(float): the ration interval between each run
        ratio = np.arange(r_min+r_interval, r_max, r_interval)
        ratio_l = len(ratio)
        acc = [[]]
        tuned_ratio_max = []
        tuned_ratio_min = []
        tuned_acc_max = []
        tuned_acc_min = []
        for i in range(1, 5):
            # execute for all four algorithms
            j = 0
            for r in ratio:
                j += 1
                print('Running algorithm: ' + str(i) + '/4 - ' + str(j) + '/' + str(ratio_l) + ' - {0:.2f}%'.format(j/ratio_l*100), end='\r')
                # test all ratios
                acc_temp = self.single_run(i, sds, r)
                acc[i-1].append(acc_temp)
            index_max = np.argmax(acc[i-1])
            index_min = np.argmin(acc[i-1])
            tuned_ratio_max.append(ratio[index_max])
            tuned_ratio_min.append(ratio[index_min])
            tuned_acc_max.append(acc[i-1][index_max])
            tuned_acc_min.append(acc[i-1][index_min])
            acc.append([])
            print()
        alg = ['DT', 'RF', 'KNN', 'SVC']
        result = list(zip(alg, tuned_ratio_max, tuned_acc_max, tuned_ratio_min, tuned_acc_min))
        result = pd.DataFrame(data=result, columns=['algorithm', 'ratio_max', 'accuracy_max', 'ratio_min', 'accuracy_min'])
        print()
        print(result)

# Q1
Try to tune to get good results when the training set has imbalanced class. The ratio of the samples in two classes should be greater than 4 and has at least 40 samples minimum in a class.

In [4]:
ao = algorithmOperation()
ao.q1(True, 4, 1000, 0.1)

Running algorithm: 1/4 - 9959/9959 - 100.00%
Running algorithm: 2/4 - 9959/9959 - 100.00%
Running algorithm: 3/4 - 9959/9959 - 100.00%
Running algorithm: 4/4 - 9959/9959 - 100.00%

  algorithm  ratio_max  accuracy_max  ratio_min  accuracy_min
0        DT       14.1     99.270073       45.6     85.106383
1        RF       35.5    100.000000       42.9     90.780142
2       KNN        9.1    100.000000       14.6     93.430657
3       SVC       37.6    100.000000        4.9     95.384615
