### Importing the libraries needed to complete the assignment  

In [2]:
import numpy as np 
import pandas as pd 
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier

#Reading in the CSV file, ensuring the missing values are recognized when reading in the file and naming the columns 
#Viewing the first couple of columns of the data to endure colum names appear and see if missing values are recognized 

In [17]:
missing_values = ["n/a","na",'?']
column_names=['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity'] 

#reading in the csv file 
data = pd.read_csv(r"ADD_YOUR_FILENAME_HERE.csv", na_values = missing_values,names=column_names, header = None)

#displaying the first couple of rows
data.head()


Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,yes
1,4.0,43.0,1.0,1.0,,yes
2,5.0,58.0,4.0,5.0,3.0,yes
3,4.0,28.0,1.0,1.0,3.0,no
4,5.0,74.0,1.0,5.0,,yes


#### Preprocessing the Data: Dropping the bi-rads column, dealing with missing values..(I used the median of eaxh column to fill in the missing  values of each column) and converting the content in the severity column to numeric (1 & 0)

In [18]:
# preprocessing the data 
#dropping the BI-RADS column 
data1 = data.drop(columns="BI-RADS")

#checking to ensure the bi-rads column has been dropped 
data1.head(10)

#dealing with the missing values by using the median values in each column 
data1[data1.isnull().any(axis=1)]

data2=data1.fillna(data1.median())

data2.isnull()

# converting the severity column to 1 and 0
data2['Severity'] = data2['Severity'].map({'yes': 1, 'no': 0 })

#displaying output after cleaning/preprocessing the data 
data2.head()


Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67.0,3.0,5.0,3.0,1
1,43.0,1.0,1.0,3.0,1
2,58.0,4.0,5.0,3.0,1
3,28.0,1.0,1.0,3.0,0
4,74.0,1.0,5.0,3.0,1


#### Defining the column we are going to be predicting and normalizing the data 

In [6]:
#identifying the column we are predicting which is severity
target=['Severity']
not_target=['Age','Shape', 'Margin', 'Density']
 
#normalizing the data 
Y=data2[target]

normX=data2[not_target]

normX.head()
Y.head()

norm=Normalizer().fit(normX)
X = norm.transform(normX)



##### Splitting the dataset into training set and test test.  Assigning test set 25%.  

In [9]:
#splitting the dataset into training set and test set.  Setting aside 25% for test set and 75% for training set

x_train,x_test, y_train,y_test= train_test_split(X,Y,test_size=0.25, train_size=0.75, random_state=1)

### Decision Tree Model 

In [10]:
#Decision tree & k-fold cross validation
dtree = DecisionTreeClassifier()

# training the model
dtree = dtree.fit(x_train,y_train)

#predicting the test dataset value
y_pred = dtree.predict(x_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.7468879668049793


##### Decision tree model accuracy result was 74.69% as seen above 

#####  Running Decision Model again using the K-Fold Cross validation to measure accuracy of 10


In [11]:
#using cross_val_score
dtree=DecisionTreeClassifier()
treeresult= cross_val_score(dtree,X,Y, cv=10)
print(np.mean(treeresult))

0.7585375972146862


##### As you will see above, the result is 75.85%  thus indicating that the k-fold cross validation improved the results slightly 

### Random Forest Model 

In [12]:
#RANDOM FOREST
rf=RandomForestClassifier(n_estimators=10, max_depth=2)
rfcross= cross_val_score(rf,X,np.ravel(Y), cv=10)
print(np.mean(treeresult))


0.7585375972146862


#####  As you see above, the result of the random forest using the k fold cross validation is 75.85%

### KNN Model 

In [14]:
# KNN model 
#creating a for loop k to run values 1 to 50:
# the range is set to 51 because knn will give you 1 less result so in order to get the loop to run and have 50 k's
# I needed to increase k to 51

for k in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn_res= cross_val_score(knn,X,np.ravel(Y), cv=10)
    print("k:",k,"Acc:",np.mean(knn_res))

k: 1 Acc: 0.6909956592512209
k: 2 Acc: 0.6916788750226079
k: 3 Acc: 0.7512011665762344
k: 4 Acc: 0.7282276406221739
k: 5 Acc: 0.7625196690179056
k: 6 Acc: 0.7594054078495207
k: 7 Acc: 0.772959169831796
k: 8 Acc: 0.7657319361548199
k: 9 Acc: 0.7710167977934527
k: 10 Acc: 0.7793076279616566
k: 11 Acc: 0.7813043724000723
k: 12 Acc: 0.7782549963827092
k: 13 Acc: 0.7823896726351963
k: 14 Acc: 0.7771372535720744
k: 15 Acc: 0.7771806610598662
k: 16 Acc: 0.776171663049376
k: 17 Acc: 0.7772350334599385
k: 18 Acc: 0.7834857117019353
k: 19 Acc: 0.7740229924036897
k: 20 Acc: 0.7802951483089167
k: 21 Acc: 0.7782661873756557
k: 22 Acc: 0.7761609242177608
k: 23 Acc: 0.7772135557967083
k: 24 Acc: 0.7772135557967083
k: 25 Acc: 0.7793512615301139
k: 26 Acc: 0.7783095948634473
k: 27 Acc: 0.7772352595406041
k: 28 Acc: 0.7741100334599385
k: 29 Acc: 0.7762043317055526
k: 30 Acc: 0.7751517001266052
k: 31 Acc: 0.7741641797793453
k: 32 Acc: 0.7731115482003978
k: 33 Acc: 0.7741641797793453
k: 34 Acc: 0.77414247

##### The result of the of the knn model when the accuracy (k=10) is 77.93%

##### The best performance I could get out of KNN was K=18 and the result was 78.34%

### Naive Bayes Model 

In [15]:
#Naive Bayes model
new1 = MultinomialNB()
bayes= cross_val_score(new1,X,np.ravel(Y), cv=10)
print(np.mean(treeresult))


0.7585375972146862


###  The result of the model is 75.85%