# BCW dataset has 32 columns, including 1 label.   Which features are most sensitive to cancer?   Check correlation between features and label with dataframe's method "corr" or anything you think is helpful to determine.  Also find out whethe standard scaler would change the correlation or not.  Make sure to submit your codes and your description of the findings.

## Python Class for Function Management

In [24]:
from os import system, getcwd
from os.path import join
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import statistics as stt

print_flag = False
print_result_flag = False
plot_flag = False


class algorithmOperation():
    def get_data(self):
        path = join(getcwd(), 'wdbc.data').replace('\\', '/')
        self.data = pd.read_csv(path, header=None)
        if print_flag:
            print(self.data)
            print(self.data.shape)
            print(self.data.columns)
            print(self.data.head())

    def set_column_names(self):
        column_names = ['id', 'malignant',
                        'nucleus_mean', 'nucleus_se', 'nucleus_worst',
                        'texture_mean', 'texture_se', 'texture_worst',
                        'perimeter_mean', 'perimeter_se', 'perimeter_worst',
                        'area_mean', 'area_se', 'area_worst',
                        'smoothness_mean', 'smoothness_se', 'smoothness_worst',
                        'compactness_mean', 'compactness_se', 'compactness_worst',
                        'concavity_mean', 'concavity_se', 'concavity_worst',
                        'concave_pts_mean', 'concave_pts_se', 'concave_pts_worst',
                        'symmetry_mean', 'symmetry_se', 'symmetry_worst',
                        'fractal_dim_mean', 'fractal_dim_se', 'fractal_dim_worst'
                        ]

        self.data.columns = column_names
        if print_flag:
            print(self.data.shape)
            print(self.data.columns)
            print(self.data.head())
            self.data.tail(10)

    def make_data_all_numerical(self):
        self.data['malignant'] = self.data['malignant'].map(
            lambda x: 0 if x == 'B' else 1)
        if print_flag:
            self.data.tail(10)

    def split_data_into_train_test(self, sds):
        self.X = self.data.drop(columns=['malignant']).values
        
        if sds:
            # for scaled data
            ss = StandardScaler()
            self.X = ss.fit_transform(self.X)
        
        self.y = self.data['malignant'].values
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.25, random_state=2018)

    def build_mldl_model(self, n):
        if n == '1.0':
            # Default DT
            self.model = DecisionTreeClassifier()
        if n == '2.0':
            self.model = RandomForestClassifier()
        if n == '3.0':
            self.model = KNeighborsClassifier()
        if n == '4.0':
            self.model = SVC()

    def training_mldl_model(self):
        self.model.fit(self.X_train, self.y_train)

    def test_mldl_model(self):
        self.y_pred = self.model.predict(self.X_test)

    def evaluate_the_result(self, n):
        if self.y_pred.all == self.y_test.all:
            print('Prediction successful, all values are same') if print_result_flag else None
        else:
            self.y_diff = abs(self.y_pred - self.y_test)
            self.y_diff_n = (len(self.y_test)-sum(self.y_diff))/len(self.y_test)*100

    def prepare_data(self, sds):
        self.get_data()
        self.set_column_names()
        self.make_data_all_numerical()
        self.split_data_into_train_test(sds)

    def training_and_testing(self, n):
        self.build_mldl_model(n)
        self.training_mldl_model()
        self.test_mldl_model()
        self.evaluate_the_result(n)

    def corr(self):
        self.get_data()
        self.set_column_names()
        self.make_data_all_numerical()
        pearson_result = self.data.corr(method ='pearson')
        kendall_result = self.data.corr(method='kendall')
        return pearson_result, kendall_result

#     def single_run(self, n):
#         self.prepare_data(sds)
#         self.training_and_testing(n)
        
    def single_run(self, n, sds):
        # n: choosing algorithm
        # sds: turn on or off data scalling
        self.prepare_data(sds)
        if n == 1:
            self.training_and_testing('1.0')
        elif n == 2:
            self.training_and_testing('2.0')
        elif n == 3:
            self.training_and_testing('3.0')
        elif n == 4:
            self.training_and_testing('4.0')
        else:
            print('input errer')
        return self.y_diff_n

## Correlation Check (corr)

In [25]:
ao = algorithmOperation()
p, k = ao.corr()
p.to_csv("q2data/result_p.csv")
k.to_csv("q2data/result_k.csv")

## Standard Scaler

### Function to execute algorithm (Scaler)

In [26]:
def sr(n, sds, times):
    result = []
    for i in range(0, times):
        ao = algorithmOperation()
        result.append(ao.single_run(n, sds))
        del ao
    mean = stt.mean(result)
#     stdev = stt.stdev(result)
#     var = stt.variance(result)
    return mean

### Result

In [33]:
times = 10
unscaled = [sr(1, False, times), sr(2, False, times), sr(3, False, times), sr(4, False, times)]
scaled = [sr(1, True, times), sr(2, True, times), sr(3, True, times), sr(4, True, times)]
algorithm = ['DT', 'RF', 'KNN', 'SVC']
diff = [scaled[i]-unscaled[i] for i in range(0, len(algorithm))]
result = pd.DataFrame(data=[algorithm, unscaled, scaled, diff], index=['algorithm', 'unscaled', 'scaled', 'diff(s-uns)'])
print(result)

                     0          1          2          3
algorithm           DT         RF        KNN        SVC
unscaled     91.608392  93.356643  74.125874  62.237762
scaled       91.608392  93.776224  97.202797  97.202797
diff(s-uns)        0.0    0.41958  23.076923  34.965035
