## Import Data

In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
df = pd.read_csv("./dataset/Cancer_Data.csv")
df = df.drop(["id","Unnamed: 32"],axis = 'columns')
df['diagnosis'].replace(['B', 'M'],[0, 1], inplace=True)
df.head(3)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


## 3 base model prepare

In [153]:
# Log Reg
def logR_tranAndPredict(X,y,model_dict):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=43)
    logclassifier = LogisticRegression()
    parameters = {
    'C': [0.1, 1, 10]
}
    predictor_log = GridSearchCV(logclassifier, parameters, cv=5)
    # getting the result
    y_pred = predictor_log.predict(X_val)
    accuracy_log_reg = accuracy_score(y_val, y_pred)
    model_dict['logistic_regression'] = accuracy_log_reg


In [157]:
#SVC
def Svc_tranAndPredict(X,y,model_dict):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=43)
    
    svc = SVC()
    parameters = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}
    predictor_svc = GridSearchCV(svc, parameters, cv=5)
    # getting the result
    y_pred = predictor_svc.predict(X_val)
    accuracy_svc = accuracy_score(y_val, y_pred)
    model_dict['SVC'] = accuracy_svc

In [155]:
#Random Forest Classifier
def Rfc_tranAndPredict(X,y,model_dict):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=43)
    classifier = RandomForestClassifier(random_state=42)
    predictor = classifier.fit(X_train, y_train)
    y_pred = predictor.predict(X_val)
    accuracy_rfc = accuracy_score(y_val, y_pred)
    model_dict['random_forest_classifier'] = accuracy_rfc

In [158]:
# X, y = df.drop('diagnosis', axis=1), df[['diagnosis']]
X, y = df[["radius_mean"]], df[['diagnosis']]
model_dict = {}

logR_tranAndPredict(X,y,model_dict)
Svc_tranAndPredict(X,y,model_dict)
Rfc_tranAndPredict(X,y,model_dict)

  y = column_or_1d(y, warn=True)


NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
model_accuracies_df = pd.DataFrame(columns=['Model', 'Accuracy'])
model_accuracies_df['Model'] = model_dict.keys()
model_accuracies_df['Default model'] = model_dict.values()
model_accuracies_df

Unnamed: 0,Model,Accuracy
0,logistic_regression,0.893617
1,SVC,0.909574
2,random_forest_classifier,0.797872


Feature Experiment 1: Feature Scaling

In [142]:
X, y = df[["radius_mean"]]**2, df[['diagnosis']]
model_dict = {}

logR_tranAndPredict(X,y,model_dict)
Svc_tranAndPredict(X,y,model_dict)
Rfc_tranAndPredict(X,y,model_dict)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  predictor = classifier.fit(X_train, y_train)


In [143]:
model_accuracies_df = pd.DataFrame(columns=['Model', 'Accuracy'])
model_accuracies_df['Model'] = model_dict.keys()
model_accuracies_df['Accuracy'] = model_dict.values()
model_accuracies_df

Unnamed: 0,Model,Accuracy
0,logistic_regression,0.904255
1,SVC,0.914894
2,random_forest_classifier,0.797872


Feature Experiment 2: Add New Features (add texture_mean)

In [128]:
# X, y = df.drop('diagnosis', axis=1), df[['diagnosis']]
X, y = df[["radius_mean","texture_mean"]], df[['diagnosis']]

model_dict = {}

logR_tranAndPredict(X,y,model_dict)
Svc_tranAndPredict(X,y,model_dict)
Rfc_tranAndPredict(X,y,model_dict)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  predictor = classifier.fit(X_train, y_train)


In [129]:
model_accuracies_df = pd.DataFrame(columns=['Model', 'Accuracy'])
model_accuracies_df['Model'] = model_dict.keys()
model_accuracies_df['Accuracy'] = model_dict.values()
model_accuracies_df

Unnamed: 0,Model,Accuracy
0,logistic_regression,0.914894
1,SVC,0.888298
2,random_forest_classifier,0.87234


## Experiment 3: Transform Features via PCA

### Log

### Svc

In [147]:
from sklearn.decomposition import PCA
X, y = df.drop('radius_mean', axis=1), df[['diagnosis']]
pipeline = Pipeline([
    ('pca', PCA()),
    ('svc', SVC())
])

parameters = {
    'pca__n_components': [2, 5, 10],
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}
grid_search = GridSearchCV(pipeline, parameters, cv=5)


array([[1160.1425737 , -293.91754364],
       [1269.12244319,   15.63018184],
       [ 995.79388896,   39.15674324],
       ...,
       [ 314.50175618,   47.55352518],
       [1124.85811531,   34.12922497],
       [-771.52762188,  -88.64310636]])

Experiment 4: Preprocessing Features (Normalize with min max scaling)

In [130]:
from sklearn.preprocessing import MinMaxScaler
#fit and transforming MinMaxScaler the dataframe 
mm = MinMaxScaler()
df_new_mm = pd.DataFrame(mm.fit_transform(df), columns=df.columns)
X, y = df_new_mm[["radius_mean"]], df_new_mm[['diagnosis']]

model_dict = {}

logR_tranAndPredict(X,y,model_dict)
Svc_tranAndPredict(X,y,model_dict)
Rfc_tranAndPredict(X,y,model_dict)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  predictor = classifier.fit(X_train, y_train)


In [131]:
model_accuracies_df = pd.DataFrame(columns=['Model', 'Accuracy'])
model_accuracies_df['Model'] = model_dict.keys()
model_accuracies_df['Accuracy'] = model_dict.values()
model_accuracies_df

Unnamed: 0,Model,Accuracy
0,logistic_regression,0.898936
1,SVC,0.909574
2,random_forest_classifier,0.803191


Experiment 5: Noisy Indicators
