Reference

    https://en.wikipedia.org/wiki/Diagnostic_odds_ratio
    http://www.science.smith.edu/~jcrouser/SDS293/labs/lab4-py.html



In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("./data/BreastCancer.csv")

In [3]:
data.head()

Unnamed: 0,Id,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,benign
1,1002945,5,4,4,5,7,10.0,3,2,1,benign
2,1015425,3,1,1,1,2,2.0,3,1,1,benign
3,1016277,6,8,8,1,3,4.0,3,7,1,benign
4,1017023,4,1,1,3,2,1.0,3,1,1,benign


In [4]:
labels = data['Class']
data = data.drop(columns=['Id'])

# Data Description

In [6]:
data.head()

Unnamed: 0,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,benign
1,5,4,4,5,7,10.0,3,2,1,benign
2,3,1,1,1,2,2.0,3,1,1,benign
3,6,8,8,1,3,4.0,3,7,1,benign
4,4,1,1,3,2,1.0,3,1,1,benign


In [7]:
labels.head()

0    benign
1    benign
2    benign
3    benign
4    benign
Name: Class, dtype: object

In [8]:
data.isnull().count()

Cl.thickness       699
Cell.size          699
Cell.shape         699
Marg.adhesion      699
Epith.c.size       699
Bare.nuclei        699
Bl.cromatin        699
Normal.nucleoli    699
Mitoses            699
Class              699
dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
Cl.thickness       699 non-null int64
Cell.size          699 non-null int64
Cell.shape         699 non-null int64
Marg.adhesion      699 non-null int64
Epith.c.size       699 non-null int64
Bare.nuclei        683 non-null float64
Bl.cromatin        699 non-null int64
Normal.nucleoli    699 non-null int64
Mitoses            699 non-null int64
Class              699 non-null object
dtypes: float64(1), int64(8), object(1)
memory usage: 54.7+ KB


In [10]:
data.describe()

Unnamed: 0,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses
count,699.0,699.0,699.0,699.0,699.0,683.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.643857,2.438364,3.053634,1.715078
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


# Handling Missing Values

In [11]:
data.isnull().sum()

Cl.thickness        0
Cell.size           0
Cell.shape          0
Marg.adhesion       0
Epith.c.size        0
Bare.nuclei        16
Bl.cromatin         0
Normal.nucleoli     0
Mitoses             0
Class               0
dtype: int64

In [12]:
data =data.fillna(0)

In [13]:
data.isnull().sum()

Cl.thickness       0
Cell.size          0
Cell.shape         0
Marg.adhesion      0
Epith.c.size       0
Bare.nuclei        0
Bl.cromatin        0
Normal.nucleoli    0
Mitoses            0
Class              0
dtype: int64

In [14]:
data.keys()

Index(['Cl.thickness', 'Cell.size', 'Cell.shape', 'Marg.adhesion',
       'Epith.c.size', 'Bare.nuclei', 'Bl.cromatin', 'Normal.nucleoli',
       'Mitoses', 'Class'],
      dtype='object')

In [15]:
data.columns=['Thickness','Size','Shape','Adhesion','Epith_Size','Bare_Nuclei','Cromatin','Normal_Nucleoli','Mitoses','Label']


In [16]:
data.keys()

Index(['Thickness', 'Size', 'Shape', 'Adhesion', 'Epith_Size', 'Bare_Nuclei',
       'Cromatin', 'Normal_Nucleoli', 'Mitoses', 'Label'],
      dtype='object')

# Splitting data

In [17]:
xtrain, xtest, ytrain, ytest = train_test_split(data, labels, test_size = 0.3)

In [18]:
len(xtrain),len(xtest)

(489, 210)

# Training GLM

In [19]:
def info(model):
    l= [model.params , model.pvalues , model.model.endog_names ]
    dd={"Coefficeients":l[0],"p-Values":l[1],"Dependent variables":l[2]}
    for i in dd.keys():
        print (i)
        #print("\n")
        print(dd[i])
        print("\n")
        
    
    

In [20]:
def glm_model(xtrain,given_formula):
    clf = smf.glm(formula = given_formula , data = xtrain, family = sm.families.Binomial()).fit()
    print(clf.summary())
    info(clf)
    return clf
    

In [21]:
form = 'Label ~ Thickness+Size+Shape+Adhesion+Epith_Size+Bare_Nuclei+Cromatin+Normal_Nucleoli+Mitoses'
model = glm_model(xtrain,form)

                           Generalized Linear Model Regression Results                           
Dep. Variable:     ['Label[benign]', 'Label[malignant]']   No. Observations:                  489
Model:                                               GLM   Df Residuals:                      479
Model Family:                                   Binomial   Df Model:                            9
Link Function:                                     logit   Scale:                          1.0000
Method:                                             IRLS   Log-Likelihood:                -32.088
Date:                                   Tue, 29 Oct 2019   Deviance:                       64.177
Time:                                           23:12:05   Pearson chi2:                     213.
No. Iterations:                                        8                                         
Covariance Type:                               nonrobust                                         
                    

# Doing Predictions

In [22]:
pred_val = model.predict(xtest)

In [23]:
pred_val.keys()

Int64Index([485, 306, 352, 653, 341, 615, 360, 490, 206, 142,
            ...
            353, 599, 276, 168, 335, 175, 321, 302, 398, 296],
           dtype='int64', length=210)

In [24]:
def make_pred_vector(pred_val):
    arr= []
    for i in pred_val.keys():
        arr.append(pred_val[i])
    
    for i in range(0,len(arr)):
        if arr[i] > 0.5:
            arr[i]= "benign"
        else:
            arr[i] = "malignant"
    return arr

In [25]:
vector = make_pred_vector(pred_val)

In [26]:
vector

['benign',
 'benign',
 'benign',
 'benign',
 'benign',
 'benign',
 'malignant',
 'benign',
 'malignant',
 'malignant',
 'benign',
 'malignant',
 'benign',
 'malignant',
 'malignant',
 'benign',
 'benign',
 'benign',
 'malignant',
 'malignant',
 'benign',
 'malignant',
 'malignant',
 'benign',
 'malignant',
 'benign',
 'benign',
 'benign',
 'benign',
 'benign',
 'benign',
 'malignant',
 'benign',
 'benign',
 'malignant',
 'malignant',
 'malignant',
 'malignant',
 'benign',
 'malignant',
 'benign',
 'benign',
 'malignant',
 'benign',
 'benign',
 'malignant',
 'benign',
 'malignant',
 'benign',
 'malignant',
 'malignant',
 'malignant',
 'malignant',
 'benign',
 'malignant',
 'benign',
 'malignant',
 'benign',
 'benign',
 'benign',
 'benign',
 'malignant',
 'benign',
 'malignant',
 'malignant',
 'malignant',
 'benign',
 'benign',
 'malignant',
 'benign',
 'benign',
 'malignant',
 'benign',
 'malignant',
 'benign',
 'benign',
 'benign',
 'benign',
 'benign',
 'malignant',
 'benign',
 'benig

# Doing Evaluation of Model

In [27]:
pred_lab = vector
print (confusion_matrix(ytest,pred_lab))

[[123   5]
 [  7  75]]


In [28]:
print (classification_report(ytest,pred_lab))

              precision    recall  f1-score   support

      benign       0.95      0.96      0.95       128
   malignant       0.94      0.91      0.93        82

    accuracy                           0.94       210
   macro avg       0.94      0.94      0.94       210
weighted avg       0.94      0.94      0.94       210



# Odd Ratio

In [30]:
(tn,fp,fn,tp) = confusion_matrix(ytest,pred_lab).ravel()

In [31]:
(tn,fp,fn,tp)

(123, 5, 7, 75)

In [32]:
dor = (float(tp)/fp) /  (float(fn)/tn)
x1 = 1.0/tp
x2 = 1.0/fp
x3 = 1.0/fn
x4 = 1.0/tn
se = np.sqrt(x1+x2+x3+x4)
log_dor1 = np.log(dor) - 1.96 *se
log_dor2 = np.log(dor) + 1.96 *se
log_dor1,log_dor2


(4.391288544201915, 6.757360270636713)