In [2]:
# Load some packages
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeClassifier 
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Read the data: the "diabetes_data_upload" CSV served by the UCI ML archive
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00529/diabetes_data_upload.csv'
df = pd.read_csv(DATA_URL)

# Organize our data
# Keep the column index around; later cells slice it to pick feature subsets
feature_names = df.columns
# Print the schema (column names, non-null counts, dtypes) ...
df.info()
# ... and show the first five rows as the cell's displayed result
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [3]:
# Categorical values to 0 or 1
# Every column after the first one ("Age") holds string categories; each is
# replaced in place by integer codes. The one encoder object is re-fitted per
# column, so afterwards it only holds the mapping of the last column it saw.
lb_make = LabelEncoder()
categorical_cols = feature_names[1:]
for col in categorical_cols:
    codes = lb_make.fit_transform(df[col])
    df[col] = codes
df.head(5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [5]:
# Get statistics/insights about the data
# Encoded values used below: Gender 1 = Men, 0 = Women; class 1 = Positive, 0 = Negative.

# Men vs women: boolean row masks, reused by later cells to slice per-gender results
vecM = df["Gender"] == 1
vecW = df["Gender"] == 0
nM = vecM.sum()
nW = vecW.sum()
print("Men:", nM, "Women:", nW)
print("Positive:", (df["class"] == 1).sum(), "Negative:", (df["class"] == 0).sum(), len(df))

# Gender-by-class count table. confusion_matrix orders rows and columns by
# sorted label value, so row 0 = Women, row 1 = Men and
# column 0 = Negative, column 1 = Positive.
confusionmat = metrics.confusion_matrix(df["Gender"], df["class"])
# The header follows that column order: Neg first, then Pos
print("  Neg\tPos\nW",end=' ')
print(*confusionmat[0], sep='\t')
print("M",end=' ')
print(*confusionmat[1], sep='\t')

# Row-normalized table: share of each class within one gender
print("Class Distribution (Neg, Pos)")
print("- Men:", end = '\t')
print(*["%1.3f "% i for i in confusionmat[1]/confusionmat[1].sum()])
print("- Women:", end = '')
print(*["%1.3f "% i for i in confusionmat[0]/confusionmat[0].sum()])
# Column-normalized table: share of each gender within one class
print("Gender Distribution (Women, Men)")
print("- Positive:", end = ' ')
print(*["%1.3f "% i for i in confusionmat[:,1]/confusionmat[:,1].sum()])
print("- Negative:", end = ' ')
print(*["%1.3f "% i for i in confusionmat[:,0]/confusionmat[:,0].sum()])

Men: 328 Women: 192
Positive: 320 Negative: 200 520
  Pos	Neg
W 19	173
M 181	147
Class Distribution (Neg, Pos)
- Men:	0.552  0.448 
- Women:0.099  0.901 
Gender Distribution (Women, Men)
- Positive: 0.541  0.459 
- Negative: 0.095  0.905 


In [7]:
print(feature_names.tolist())
feature_cols = feature_names[:5] # Select the first 5 features only
# you want all rows, and the feature_cols' columns
X = df.loc[:, feature_cols] # features
Y = df["class"]  # true label 0 or 1

# Create Decision Tree classifier object.
# random_state is pinned so the fitted tree is the same on every run;
# without it, ties between equally good splits may be broken differently each time.
clf = DecisionTreeClassifier(random_state=0)
# Train Decision Tree classifier
clf = clf.fit(X,Y)
# Predict on the same rows the tree was trained on (no held-out test set is
# used here), so every metric derived from Ypred is a training-set figure.
Ypred = clf.predict(X)

['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']


In [8]:
# Per-gender confusion matrices (rows = true class, columns = predicted class).
# labels=[0, 1] pins the shape to 2x2 even when a subgroup contains a single
# class, so the [0, 0] / [0, 1] lookups below can never go out of bounds.
cmM = metrics.confusion_matrix(Y[vecM], Ypred[vecM], labels=[0, 1])
cmW = metrics.confusion_matrix(Y[vecW], Ypred[vecW], labels=[0, 1])
# Model Accuracy: how often is the classifier correct?
print("Accuracy Overall: %1.3f" % metrics.accuracy_score(Y, Ypred))
print("Accuracy for M: %1.3f" % metrics.accuracy_score(Y[vecM], Ypred[vecM]))
print("Accuracy for W %1.3f" % metrics.accuracy_score(Y[vecW], Ypred[vecW]))
# False positive rate = FP / (FP + TN): share of truly negative rows (matrix row 0)
# that were predicted positive (column 1)
print("False positive rate for M: %1.3f" % (cmM[0,1]/(cmM[0,1]+cmM[0,0])))
print("False positive rate for W: %1.3f" % (cmW[0,1]/(cmW[0,1]+cmW[0,0])))

Accuracy Overall: 0.973
Accuracy for M: 0.970
Accuracy for W 0.979
False positive rate for M: 0.011
False positive rate for W: 0.053


In [9]:
# (Disabled) Random down-sampling: draw the same number of rows from every class, equal to the size of the smallest class
# g = dff.groupby('class')
# df = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))
#men vs women
# vecMnew = dfnew["Gender"] == 1
# vecWnew = dfnew["Gender"] == 0
# nMnew = sum(vecMnew)
# nWnew = sum(vecWnew)
# print("Men:", nMnew, "Women:", nWnew)
# print("Positive:", sum(dfnew["class"] == 1), "Negative:", sum(dfnew["class"] == 0), len(dfnew))
# confusionmatnew = metrics.confusion_matrix(dfnew["Gender"], dfnew["class"])#.tolist()
# print("  Pos\tNeg\nW",end=' ')
# print(*confusionmatnew[0], sep='\t')
# print("M",end=' ')
# print(*confusionmatnew[1], sep='\t')

# print("Class Distribution (Neg, Pos)")
# print("- Men:", end = '\t')
# print(*["%1.3f "% i for i in confusionmatnew[1]/sum(confusionmatnew[1])])
# print("- Women:", end = '')
# print(*["%1.3f "% i for i in confusionmatnew[0]/sum(confusionmatnew[0])])
# print("Gender Distribution (Women, Men)")
# print("- Positive:", end = ' ')
# print(*["%1.3f "% i for i in confusionmatnew[:,1]/sum(confusionmatnew[:,1])])
# print("- Negative:", end = ' ')
# print(*["%1.3f "% i for i in confusionmatnew[:,0]/sum(confusionmatnew[:,0])])

# Let's not use the "Gender" attribute!

In [11]:
feature_cols = feature_names[:5].tolist()
# Do not use the gender. Drop it by name rather than by position, so a change
# in column order raises an error instead of silently removing another feature.
feature_cols.remove("Gender")
print("Features used:", feature_cols)
X = df.loc[:, feature_cols] # you want all rows, and the feature_cols' columns
Y = df["class"]

# Create Decision Tree classifier object.
# random_state is pinned so the fitted tree is the same on every run;
# without it, ties between equally good splits may be broken differently each time.
clf = DecisionTreeClassifier(random_state=0)
# Train Decision Tree classifier
clf = clf.fit(X,Y)
# Predict on the same rows the tree was trained on (no held-out test set is
# used here), so every metric derived from Ypred is a training-set figure.
Ypred = clf.predict(X)

Features used: Age Polyuria Polydipsia sudden weight loss


In [12]:
# Per-gender confusion matrices (rows = true class, columns = predicted class).
# labels=[0, 1] pins the shape to 2x2 even when a subgroup contains a single
# class, so the [0, 0] / [0, 1] lookups below can never go out of bounds.
cmM = metrics.confusion_matrix(Y[vecM], Ypred[vecM], labels=[0, 1])
cmW = metrics.confusion_matrix(Y[vecW], Ypred[vecW], labels=[0, 1])
# Model Accuracy: how often is the classifier correct?
print("Accuracy Overall: %1.3f" % metrics.accuracy_score(Y, Ypred))
print("Accuracy for M: %1.3f" % metrics.accuracy_score(Y[vecM], Ypred[vecM]))
print("Accuracy for W %1.3f" % metrics.accuracy_score(Y[vecW], Ypred[vecW]))
# False positive rate = FP / (FP + TN): share of truly negative rows (matrix row 0)
# that were predicted positive (column 1)
print("False positive rate for M: %1.3f" % (cmM[0,1]/(cmM[0,1]+cmM[0,0])))
print("False positive rate for W: %1.3f" % (cmW[0,1]/(cmW[0,1]+cmW[0,0])))

Accuracy Overall: 0.950
Accuracy for M: 0.960
Accuracy for W 0.932
False positive rate for M: 0.017
False positive rate for W: 0.053


In [13]:
# cmMnew = metrics.confusion_matrix(Y[vecMnew], Ypred[vecMnew])
# cmWnew = metrics.confusion_matrix(Y[vecWnew], Ypred[vecWnew])
# # Model Accuracy: how often is the classifier correct?
# print("Accuracy Overall: %1.3f" % metrics.accuracy_score(Y, Ypred))
# print("Accuracy for M: %1.3f" % metrics.accuracy_score(Y[vecMnew], Ypred[vecMnew]))
# print("Accuracy for W %1.3f" % metrics.accuracy_score(Y[vecWnew], Ypred[vecWnew]))
# print("False positive rate for M: %1.3f" % (cmMnew[0,1]/(cmMnew[0,1]+cmMnew[0,0])))
# print("False positive rate for W: %1.3f" % (cmWnew[0,1]/(cmWnew[0,1]+cmWnew[0,0])))

# Let's not use the "Age" attribute!

In [14]:
feature_cols = feature_names[:5].tolist()
# Do not use the age. Drop it by name rather than by position, so a change
# in column order raises an error instead of silently removing another feature.
feature_cols.remove("Age")
print("Features used:", feature_cols)
X = df.loc[:, feature_cols] # you want all rows, and the feature_cols' columns
Y = df["class"]

# Create Decision Tree classifier object.
# random_state is pinned so the fitted tree is the same on every run;
# without it, ties between equally good splits may be broken differently each time.
clf = DecisionTreeClassifier(random_state=0)
# Train Decision Tree classifier
clf = clf.fit(X,Y)
# Predict on the same rows the tree was trained on (no held-out test set is
# used here), so every metric derived from Ypred is a training-set figure.
Ypred = clf.predict(X)

Features used: ['Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss']


In [15]:
# Per-gender confusion matrices (rows = true class, columns = predicted class).
# labels=[0, 1] pins the shape to 2x2 even when a subgroup contains a single
# class, so the [0, 0] / [0, 1] lookups below can never go out of bounds.
cmM = metrics.confusion_matrix(Y[vecM], Ypred[vecM], labels=[0, 1])
cmW = metrics.confusion_matrix(Y[vecW], Ypred[vecW], labels=[0, 1])
# Model Accuracy: how often is the classifier correct?
print("Accuracy Overall: %1.3f" % metrics.accuracy_score(Y, Ypred))
print("Accuracy for M: %1.3f" % metrics.accuracy_score(Y[vecM], Ypred[vecM]))
print("Accuracy for W %1.3f" % metrics.accuracy_score(Y[vecW], Ypred[vecW]))
# False positive rate = FP / (FP + TN): share of truly negative rows (matrix row 0)
# that were predicted positive (column 1)
print("False positive rate for M: %1.3f" % (cmM[0,1]/(cmM[0,1]+cmM[0,0])))
print("False positive rate for W: %1.3f" % (cmW[0,1]/(cmW[0,1]+cmW[0,0])))

Accuracy Overall: 0.894
Accuracy for M: 0.890
Accuracy for W 0.901
False positive rate for M: 0.127
False positive rate for W: 1.000


In [29]:
# Show the women's confusion matrix one row per line: the leading 0/1 is the
# true class, the Neg/Pos header names the predicted class.
table_header = "  Neg  Pos\n0 "
table_rows = '\n1 '.join(str(row) for row in cmW)
print(table_header + table_rows)

  Neg  Pos
0 [ 0 19]
1 [  0 173]


In [30]:
# Show the men's confusion matrix one row per line: the leading 0/1 is the
# true class, the Neg/Pos header names the predicted class.
table_header = "  Neg  Pos\n0 "
table_rows = '\n1 '.join(str(row) for row in cmM)
print(table_header + table_rows)

  Neg  Pos
0 [158  23]
1 [ 13 134]
