# Classifying the Wisconsin Breast Cancer Dataset (WBCD) using logistic regression

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

# Load the WBCD records; the first CSV column is used as the row index.
wbcd_csv = "wbcd.csv"
df = pd.read_csv(wbcd_csv, index_col=0)
df

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2,1,1,1,2
841769,2,1,1,1,2,1,1,1,1,2
888820,5,10,10,3,7,3,8,10,2,4
897471,4,8,6,4,3,4,10,6,1,4


> Recoding the class labels 2 and 4 as 0 and 1 so they can serve as the binary target for logistic regression

In [None]:
# Recode the two class labels (2 -> 0, 4 -> 1) to obtain a 0/1 target column.
class_recoding = {2: 0, 4: 1}
df['Class'] = df['Class'].replace(class_recoding)
df

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,0
1002945,5,4,4,5,7,10,3,2,1,0
1015425,3,1,1,1,2,2,3,1,1,0
1016277,6,8,8,1,3,4,3,7,1,0
1017023,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2,1,1,1,0
841769,2,1,1,1,2,1,1,1,1,0
888820,5,10,10,3,7,3,8,10,2,1
897471,4,8,6,4,3,4,10,6,1,1


### Method - 1 : Removing missing data
> Dropping the rows whose `Bare Nuclei` value is missing (recorded as `?`) and converting the column to integers

In [None]:
# Keep only the rows whose 'Bare Nuclei' entry is not the '?' placeholder,
# then cast that column to int64. Boolean selection followed by astype
# builds a new frame, so `df` itself is left unchanged.
has_bare_nuclei = df['Bare Nuclei'] != '?'
df_non_empty = df.loc[has_bare_nuclei].astype({'Bare Nuclei': 'int64'})
df_non_empty

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,0
1002945,5,4,4,5,7,10,3,2,1,0
1015425,3,1,1,1,2,2,3,1,1,0
1016277,6,8,8,1,3,4,3,7,1,0
1017023,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2,1,1,1,0
841769,2,1,1,1,2,1,1,1,1,0
888820,5,10,10,3,7,3,8,10,2,1
897471,4,8,6,4,3,4,10,6,1,1


> Summarising the predictor variables and their estimated effects on the class

In [None]:
import statsmodels.api as sm

# Predictors: the nine cytological measurements. Target: the 0/1 'Class' column.
X = df_non_empty[[
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]]
y = df_non_empty[['Class']]

# add_constant adds a 'const' column so the model gets an intercept term.
X_with_const = sm.add_constant(X)
logit = sm.Logit(y, X_with_const).fit()
logit.summary2()

Optimization terminated successfully.
         Current function value: 0.075321
         Iterations 10


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.884
Dependent Variable:,Class,AIC:,122.8882
Date:,2021-02-24 16:02,BIC:,168.1531
No. Observations:,683,Log-Likelihood:,-51.444
Df Model:,9,LL-Null:,-442.18
Df Residuals:,673,LLR p-value:,2.0773e-162
Converged:,1.0000,Scale:,1.0
No. Iterations:,10.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-10.1039,1.1749,-8.5999,0.0000,-12.4067,-7.8012
Clump Thickness,0.5350,0.1420,3.7672,0.0002,0.2567,0.8134
Uniformity of Cell Size,-0.0063,0.2091,-0.0300,0.9760,-0.4161,0.4035
Uniformity of Cell Shape,0.3227,0.2306,1.3994,0.1617,-0.1293,0.7747
Marginal Adhesion,0.3306,0.1235,2.6783,0.0074,0.0887,0.5726
Single Epithelial Cell Size,0.0966,0.1566,0.6171,0.5372,-0.2103,0.4036
Bare Nuclei,0.3830,0.0938,4.0815,0.0000,0.1991,0.5670
Bland Chromatin,0.4472,0.1714,2.6093,0.0091,0.1113,0.7831
Normal Nucleoli,0.2130,0.1129,1.8873,0.0591,-0.0082,0.4343


> Splitting into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and therefore the accuracy reported further down, is identical on every
# Restart & Run All instead of changing with each execution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_non_empty.Class,
    test_size=0.3,
    random_state=42,
)
type(X_test)

NameError: ignored

> Training logistic regression classifier model

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a default-configured logistic regression on the training split
# (fit returns the fitted estimator, so construction and fitting can be chained).
lr = LogisticRegression().fit(X_train, y_train)

# NOTE: despite its name, `model` holds the predicted class labels for the
# held-out rows, not the estimator itself.
model = lr.predict(X_test)
model

array([1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0])

> Accuracy on the test set

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
lr.score(X=X_test, y=y_test)

0.9512195121951219

### Method - 2 : Replacing missing data with 1

> Replacing the missing `Bare Nuclei` values (recorded as `?`) with 1

In [None]:
# Impute the '?' placeholders in 'Bare Nuclei' with 1, then cast the column
# to int64. `assign` returns a new frame, so `df` itself is left unchanged.
bare_nuclei_filled = df['Bare Nuclei'].replace({'?': 1})
df_replaced = (
    df.assign(**{'Bare Nuclei': bare_nuclei_filled})
      .astype({'Bare Nuclei': 'int64'})
)
df_replaced

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,0
1002945,5,4,4,5,7,10,3,2,1,0
1015425,3,1,1,1,2,2,3,1,1,0
1016277,6,8,8,1,3,4,3,7,1,0
1017023,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2,1,1,1,0
841769,2,1,1,1,2,1,1,1,1,0
888820,5,10,10,3,7,3,8,10,2,1
897471,4,8,6,4,3,4,10,6,1,1


> Splitting the second dataset into training and testing sets

In [None]:
# Target (as a one-column frame) and the nine predictor columns of the
# imputed dataset.
y_1 = df_replaced[['Class']]
X_1 = df_replaced[[
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]]

# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and therefore the accuracy reported further down, is identical on every
# Restart & Run All instead of changing with each execution.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1,
    df_replaced.Class,
    test_size=0.3,
    random_state=42,
)
y_test_1

code
1135090    0
142932     1
1333063    0
1202253    0
888820     1
          ..
640712     0
1099510    1
1318169    1
677910     0
1105257    1
Name: Class, Length: 210, dtype: int64

> Training the logistic regression classifier on the second dataset

In [None]:
# Fit a default-configured logistic regression on the training split of the
# imputed dataset (fit returns the fitted estimator, so the calls can be chained).
lr_1 = LogisticRegression().fit(X_train_1, y_train_1)

# NOTE: despite its name, `model` holds the predicted class labels for the
# held-out rows, not the estimator itself.
model = lr_1.predict(X_test_1)
model

array([0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1])

> Accuracy on the test set

In [None]:
# Mean accuracy of the second classifier on its held-out test split.
lr_1.score(X=X_test_1, y=y_test_1)

0.9666666666666667