In [4]:
#import packages 

import pandas as pd

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Read in Data

In [8]:
# read in data

url = "https://raw.githubusercontent.com/ali-rivera/WiDS24_Coding101/main/breast-cancer-wisconsin-data.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Look at data

In [11]:
# look at column names/spread
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,569.0,30371830.0,125020600.0,8670.0,869218.0,906024.0,8813129.0,911320500.0
radius_mean,569.0,14.12729,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.28965,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.96903,24.29898,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.8891,351.9141,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636028,0.01406413,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.05281276,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.08879932,0.07971981,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.04891915,0.03880284,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.1811619,0.02741428,0.106,0.1619,0.1792,0.1957,0.304


In [15]:
df.diagnosis.value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

## Preproccessing

In [18]:
# turn diagnosis into a boolean 
df['cancer'] = df.diagnosis.replace(['B', 'M'], [False, True])

In [19]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,cancer
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,True
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,True
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,True
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,True
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,True


In [20]:
df.drop(['diagnosis', 'id'], axis=1, inplace = True)

In [25]:
#split

X_train, X_test, y_train, y_test = train_test_split(df.drop('cancer', axis=1), df['cancer'], test_size=0.33, random_state=42)

In [None]:
MMscale = MinMaxScaler()

In [54]:
X_train_scale = MMscale.fit_transform(X_train)
X_test_scale = MMscale.transform(X_test)

## Model building

### Logistic Regression

<img src="https://miro.medium.com/v2/resize:fit:1400/1*KZQYpR-aWsSF2Zl7JFRI5A.png" width = 700>

[image source](https://ai.plainenglish.io/why-is-logistic-regression-called-regression-if-it-is-a-classification-algorithm-9c2a166e7b74)

A logistic regression works by fitting a simoid function to calculate the probability of an observation being in a spcified class for the target variable. This probability is compared to a threshold value, and if the probability is above the threshold is will be categorized as a positive outcome. 

For more information on logistic regressions, check out this [IBM page](https://www.ibm.com/topics/logistic-regression#:~:text=Logistic%20regression%20estimates%20the%20probability,given%20dataset%20of%20independent%20variables).

In [56]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
#y_pred = logreg.predict(X_test)
logreg.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9574468085106383

In [33]:
(y_pred == y_test).sum()/len(y_test)

0.9574468085106383

In [39]:
pd.DataFrame(logreg.predict_proba(X_test), columns = ["False", "True"])

Unnamed: 0,False,True
0,8.423000e-01,0.157700
1,6.745271e-10,1.000000
2,1.529971e-03,0.998470
3,9.960689e-01,0.003931
4,9.991234e-01,0.000877
...,...,...
183,1.299760e-04,0.999870
184,2.823872e-07,1.000000
185,1.766818e-02,0.982332
186,2.735051e-01,0.726495


### kNN (k Nearest Neighbor)

<img src="https://miro.medium.com/v2/resize:fit:810/0*rc5_e6-6AHzqppcr" width = 400>

[image source](https://medium.com/analytics-vidhya/k-nearest-neighbor-the-maths-behind-it-how-it-works-and-an-example-f1de1208546c)


kNN classifies a new observation by finding the k nearest neighbors to the point in n-dimensional space. 

k is the number of neighbors, and n is the number of features in your data. 

kNN uses the majority vote of the k neighbors to classify the point. The model can be tuned by choosing the k value to produce the best fit. 

Choosing too small of a k will overfit your model, causing it to be too specific to the training data and not generalizable to new data. Choosing too large of a k will underfit the model and may not capture the important trends in the data. Overfitting and underfitting both result in poor model performance.

In [43]:
kNN = KNeighborsClassifier()
kNN.fit(X_train, y_train)
y_predkNN = kNN.predict(X_test)

In [44]:
(y_predkNN == y_test).sum()/len(y_test)

0.9521276595744681

### Decision Tree

<img src="https://www.mastersindatascience.org/wp-content/uploads/sites/54/2022/05/tree-graphic.jpg" width = 500>

[image source](https://www.mastersindatascience.org/learning/machine-learning-algorithms/decision-tree/)

A decision tree model creates "splits" for predictors in the model to serpate 

In [49]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree.score(X_test, y_test)

0.8936170212765957

### Random Forest

<img src="https://miro.medium.com/v2/resize:fit:1400/1*jE1Cb1Dc_p9WEOPMkC95WQ.png" width = 500>

[image source](https://medium.com/@roiyeho/random-forests-98892261dc49)

In [51]:
rforest = RandomForestClassifier()
rforest.fit(X_train, y_train)
rforest.score(X_test, y_test)

0.9574468085106383