In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Importing Dataset


In [None]:
# Column headers for the raw file, in file order. They are supplied through
# `names=` because the headers are provided here rather than read from the file.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

data = pd.read_csv(
    '/kaggle/input/breast-cancer/breast canser/breast-cancer-wisconsin.data',
    names=column_names,
)

# Preview the first rows as the cell output.
data.head()

In [None]:
# Show the dtype pandas inferred for each column of the raw frame.
# NOTE(review): 'Bare Nuclei' is presumably `object` here, since later cells
# call str methods on its values -- confirm against the output.
data.dtypes

# Preprocessing


In [None]:


def is_non_numrix(x):
    """Return True when ``x`` is not made up solely of numeric characters.

    The value is converted with ``str()`` first, so cells that pandas parsed
    as numbers (or NaN) no longer raise ``AttributeError``; for string input
    the result is unchanged.

    Parameters
    ----------
    x : object
        A single cell value, typically a string such as ``'10'`` or ``'?'``.

    Returns
    -------
    bool
        ``False`` if ``str(x).isnumeric()`` is true, ``True`` otherwise.
        Empty strings, ``'?'``, ``'nan'``, and strings containing a sign or a
        decimal point are therefore all reported as non-numeric.
    """
    return not str(x).isnumeric()



In [None]:
# Flag every row whose 'Bare Nuclei' entry fails the numeric check.
bare_nuclei = data['Bare Nuclei']
mask = bare_nuclei.apply(is_non_numrix)

# Keep the flagged rows aside so they can be inspected.
data_non_numeric = data.loc[mask]

data_non_numeric.head()

In [None]:
# Keep only the rows that passed the numeric check. An explicit copy makes
# `data_numeric` an independent frame, so later column assignments on it do
# not write into a slice of `data` (which raises SettingWithCopyWarning).
data_numeric = data[~mask].copy()

In [None]:
# Row counts before and after filtering, one per line.
print(len(data), len(data_numeric), sep='\n')

In [None]:
# Show the column dtypes of the filtered frame.
data_numeric.dtypes

In [None]:
# Cast 'Bare Nuclei' to int64. assign() returns a new frame that is rebound
# to `data_numeric`, rather than writing a column into a frame that may be a
# slice of `data`; this avoids pandas' SettingWithCopyWarning and keeps the
# column in its original position.
data_numeric = data_numeric.assign(
    **{'Bare Nuclei': data_numeric['Bare Nuclei'].astype('int64')}
)

In [None]:
# Target vector: the 'Class' column.
data_output = data_numeric['Class']

# Feature matrix: every column except the sample identifier and the label.
non_feature_columns = ['Sample code number', 'Class']
data_input = data_numeric.drop(non_feature_columns, axis=1)

In [None]:
# Preview the feature matrix (identifier and label columns removed).
data_input.head()

In [None]:
# Preview the target labels taken from the 'Class' column.
data_output.head()

# Splitting data


In [None]:
from sklearn.model_selection import train_test_split
# Two-stage hold-out split; both stages share the same fraction and seed.
# Stage 1 carves the test split off the full data, stage 2 divides the
# remainder into training and validation splits.
split_kwargs = dict(test_size=1/3, random_state=2)

x, x_test, y, y_test = train_test_split(data_input, data_output, **split_kwargs)
x_train, x_val, y_train, y_val = train_test_split(x, y, **split_kwargs)

# Comparison between algorithms


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report,fbeta_score




# Candidate classifiers to compare. random_state is pinned on the estimators
# that use randomness during fitting so that a Restart & Run All reproduces
# the same numbers (2 matches the seed used elsewhere in this notebook).
models = {
    # max_iter raised above the default of 100 to give the solver room to
    # converge on these unscaled features.
    "LR": LogisticRegression(max_iter=1000),
    "RF": RandomForestClassifier(n_estimators=100, max_depth=7, random_state=2),
    "DT": DecisionTreeClassifier(random_state=2),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, max_depth=7, random_state=2),
    #"XGB": XGBClassifier(n_estimators=100),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
}

# Fit each candidate on the training split and score it on the VALIDATION
# split. The test split is deliberately not touched here: using it to pick a
# model would leak it into model selection and make the final test score
# optimistic.
for name, model in models.items():
    print(f'Training Model {name} \n-----------------------------------------------')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    print(f'Training Accuracy: {model.score(x_train, y_train)}')
    print(f'Validation Accuracy: {accuracy_score(y_val, y_pred)}')
    print(f'Validation Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}')


# Training a Decision Tree (DT) model


In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree: depth capped at 3, seeded for repeatable fits.
model = DecisionTreeClassifier(random_state=2, max_depth=3)
# fit() stays the last expression so the fitted estimator is the cell output.
model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the training and validation splits with the fitted baseline tree.
y_pred_train, y_pred_val = (model.predict(features) for features in (x_train, x_val))

In [None]:
from sklearn.metrics import  accuracy_score
# Print training accuracy first, then validation accuracy.
for y_true, y_hat in ((y_train, y_pred_train), (y_val, y_pred_val)):
    print(accuracy_score(y_true, y_hat))

# Choosing the best max_depth


In [None]:


# Sweep tree depths 1..8 and record training / validation accuracy for each.
max_depth_values = list(range(1, 9))
train_accuracy_values, val_accuracy_values = [], []

for max_depth_val in max_depth_values:
    # Same seed for every depth so that depth is the only thing that varies.
    model = DecisionTreeClassifier(random_state=2, max_depth=max_depth_val)
    model.fit(x_train, y_train)

    y_pred_train, y_pred_val = model.predict(x_train), model.predict(x_val)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_val = accuracy_score(y_val, y_pred_val)

    train_accuracy_values += [acc_train]
    val_accuracy_values += [acc_val]



In [None]:


import matplotlib.pyplot as plt
%matplotlib inline
 
# Training vs. validation accuracy for each tree depth in max_depth_values.
plt.plot(max_depth_values, train_accuracy_values, label='acc train')
# This curve is the validation accuracy; it was previously mislabelled
# 'val train' in the legend.
plt.plot(max_depth_values, val_accuracy_values, label='acc val')
plt.legend()
plt.grid(axis='both')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.title('Effect of max_depth on accuracy')
plt.show()



In [None]:


# Final tree. random_state=2 matches the seed of the trees scored on the
# validation split, so the model evaluated on the test split is the same
# configuration that was validated (it previously used random_state=0).
final_model = DecisionTreeClassifier(max_depth=3, random_state=2)
# fit() stays the last expression so the fitted estimator is the cell output.
final_model.fit(x_train, y_train)



In [None]:


# Score the final tree on the held-out test split.
y_pred_test = final_model.predict(X=x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred_test))



# Visualizing Tree


In [None]:


from sklearn import tree
# Feature labels for the tree nodes, in the order of the feature columns.
tree_feature_names = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]

# Draw the fitted final tree on a large canvas, with filled (coloured) nodes.
plt.figure(figsize=(20, 15))
tree.plot_tree(
    final_model,
    feature_names=tree_feature_names,
    class_names=['benign', 'malignant'],
    filled=True,
)
plt.show()



# Feature importances


In [None]:
# Display the fitted tree's importance scores (one value per input feature).
final_model.feature_importances_

In [None]:


# Labels for the bars, in the order of the feature columns.
feature_names=[
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses'
    ]

# Bar chart of the final tree's importance score for each feature.
plt.bar(feature_names,final_model.feature_importances_)
plt.xlabel('features')
plt.xticks(rotation=90)  # rotate the long feature names so they stay legible
plt.ylabel('importance')

# Render the figure explicitly; a bare `plt` as the last expression only
# echoes the module repr as cell output.
plt.show()



# Model evaluation


In [None]:
from sklearn.metrics import confusion_matrix

# Predictions of the final tree on the held-out test split.
y_pred_test = final_model.predict(X=x_test)

# Confusion matrix of actual vs. predicted test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test)

# Show the raw matrix.
print('Confusion Matrix:\n', cm)

In [None]:


import seaborn as sns

# Heatmap of the test-set confusion matrix.
plt.figure(figsize=(5,4))
# fmt='d' prints the cell counts as plain integers; the default '.2g' format
# renders counts of 100 or more in scientific notation (e.g. 1.4e+02).
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Class')
plt.ylabel('Actual Class')
plt.title('Confusion Matrix for Final Model')
plt.show()



In [None]:
from sklearn.metrics import classification_report
# Per-class text report from scikit-learn.
report = classification_report(y_test, y_pred_test)

accuracy = accuracy_score(y_test, y_pred_test)

# Manual metrics for the class at index 1 of the confusion matrix
# (rows = actual, columns = predicted).
# NOTE(review): index 1 is presumably the malignant class, matching the
# class_names order used for the tree plot -- confirm.
precision = cm[1,1]/(cm[0,1]+cm[1,1])
recall = cm[1,1]/(cm[1,0]+cm[1,1])
# Stored as `f1_value`, not `f1_score`: assigning to `f1_score` would shadow
# the sklearn.metrics function of that name imported earlier, breaking any
# later (or re-run) cell that calls it.
f1_value = 2*precision*recall/(precision+recall)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1_value)
print('Classification Report:\n', report)