In [2]:
from sklearn import tree
import pandas as pd
import os

In [34]:
#Rename columns to show what the values represent.
df = pd.read_csv(os.path.join("Data/bc.csv"))
df = df.rename(index=str, columns={"Column1": "Sample", "Column2": "Clump Thickness","Column3": "Uniformity of Cell Size",
                              "Column4": "Uniformity of Cell Shape", "Column5": "Marginal Adhesion","Column6": "Single Epithelial Cell Size"
                              ,"Column7": "Bare Nuclei", "Column8": "Bland Chromatin","Column9": "Normal Nucleoli", "Column10":"Mitoses", "Column11":"Class"})
df.head()

Unnamed: 0,Sample,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
#Make sure all values are numerical for machine learning model
df['Bare Nuclei'] = (df['Bare Nuclei'] !='n').astype(int)
df.dtypes
df2 = df


In [6]:
#The goal is to predict weather the Breast Cancer is benign or malignant.
target = df["Class"]
target_names = ["benign", "malignent"]

In [7]:
#Drop the Class column from all other features.
data = df.drop("Class", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,Sample,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,1000025,5,1,1,1,2,1,3,1,1
1,1002945,5,4,4,5,7,1,3,2,1
2,1015425,3,1,1,1,2,1,3,1,1
3,1016277,6,8,8,1,3,1,3,7,1
4,1017023,4,1,1,3,2,1,3,1,1


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=33)

In [25]:
#Decision Tree Classifier model recieved a score of around 91% accuracy.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9142857142857143

In [26]:
#Random Forest Classifier model recieved a score of 96% accuracy!
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)
# rf.predict(X_test[3])

0.96

In [27]:
#List the most important features that determine the models outcome.
#We can see that 'Uniformity of Cell Size', and 'Uniformity of Cell Shape' are the top two.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.24158591609577174, 'Uniformity of Cell Shape'),
 (0.20410491445955128, 'Uniformity of Cell Size'),
 (0.15680602674980734, 'Bland Chromatin'),
 (0.15154062934281717, 'Single Epithelial Cell Size'),
 (0.08801426739853069, 'Normal Nucleoli'),
 (0.08491033530803659, 'Clump Thickness'),
 (0.039734786282513215, 'Marginal Adhesion'),
 (0.02476223149018399, 'Sample'),
 (0.008540892872787923, 'Mitoses'),
 (0.0, 'Bare Nuclei')]

In [28]:
#Drop the least important features.
X = df2.drop(["Class","Bare Nuclei","Mitoses","Sample","Marginal Adhesion","Clump Thickness","Normal Nucleoli","Single Epithelial Cell Size", "Bland Chromatin"], axis=1)
y = df2["Class"]
print(X.shape, y.shape)

#Split the data for training and testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3, stratify=y)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

(699, 2) (699,)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
#Fit the training data to the Logisitic Regression model.
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [30]:
#We recieve around 94% accuracy with top two features compared to only 64% using all of the features.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9465648854961832
Testing Data Score: 0.9428571428571428


In [33]:
pd.DataFrame({"Prediction": model_predict
              , "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,4,4
1,4,4
2,2,2
3,2,2
4,2,2
5,2,2
6,2,2
7,4,4
8,2,2
9,2,2


In [None]:
#We see the Random Forest Classifier model gave us the best results for this specific classification problem.
#The next step would would be to build a Neural Network to see if the accuracy can be improved.