In [467]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import statistics

In [497]:
dataset = pd.read_csv('diabetes.csv')
dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [498]:
#statistics.median(student_ages)
print(statistics.median(dataset[["Glucose"]].values),
statistics.median(dataset[["BloodPressure"]].values),
statistics.median(dataset[["SkinThickness"]].values),
statistics.median(dataset[["Insulin"]].values),
statistics.median(dataset[["BMI"]].values))


[117.] [72.] [23.] [30.5] [32.]


# Replace 0 values with median

In [528]:
dataset.loc[dataset.Glucose == 0,'Glucose'] = 117
dataset.loc[dataset.BloodPressure == 0,'BloodPressure'] = 72 
dataset.loc[dataset.SkinThickness == 0,'SkinThickness'] = 23
dataset.loc[dataset.Insulin == 0,'Insulin'] = 30
dataset.loc[dataset.BMI == 0,'BMI'] = 32


In [529]:
dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,30,33.6,0.627,50,1
1,1,85,66,29,30,26.6,0.351,31,0
2,8,183,64,23,30,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,30,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,23,30,30.1,0.349,47,1


In [530]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [531]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [532]:
log = LogisticRegression(random_state = 0)
log.fit(X_train, y_train)

DT = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DT.fit(X_train, y_train)

RF = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RF.fit(X_train, y_train)

SV = SVC(kernel = 'linear', random_state = 0)
SV.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVC(kernel='linear', random_state=0)

In [533]:
y_log = log.predict(X_test)
y_DT = DT.predict(X_test)
y_RF = RF.predict(X_test)
y_SV = SV.predict(X_test)

print("LOG=",accuracy_score(y_test, y_log))
print("DT=",accuracy_score(y_test, y_DT))
print("rf=",accuracy_score(y_test, y_RF))
print("sv=",accuracy_score(y_test, y_SV))

LOG= 0.8116883116883117
DT= 0.7467532467532467
rf= 0.7727272727272727
sv= 0.8051948051948052


In [534]:
cm1 = confusion_matrix(y_test, y_log)
cm2 = confusion_matrix(y_test, y_DT)
cm3 = confusion_matrix(y_test, y_RF)
cm4 = confusion_matrix(y_test, y_SV)

print("CM1= ",cm1)
print("CM2= ",cm2)
print("CM3= ",cm3)
print("CM4= ",cm4)


CM1=  [[98  9]
 [20 27]]
CM2=  [[81 26]
 [13 34]]
CM3=  [[90 17]
 [18 29]]
CM4=  [[96 11]
 [19 28]]


# select best 4 features

In [535]:
d = dataset[["Glucose","BloodPressure", "BMI","Age","Outcome"]]

In [536]:
d

Unnamed: 0,Glucose,BloodPressure,BMI,Age,Outcome
0,148,72,33.6,50,1
1,85,66,26.6,31,0
2,183,64,23.3,32,1
3,89,66,28.1,21,0
4,137,40,43.1,33,1
...,...,...,...,...,...
763,101,76,32.9,63,0
764,122,70,36.8,27,0
765,121,72,26.2,30,0
766,126,60,30.1,47,1


# replacing the values in each feature to 5 categories

In [537]:
d.loc[d.Age <= 30,'Age'] = 1
d.loc[(d.Age >30) & (d.Age <= 40),'Age'] = 2
d.loc[(d.Age >40) & (d.Age <= 50),'Age'] = 3
d.loc[(d.Age >50) & (d.Age <= 60),'Age'] = 4
d.loc[d.Age > 60,'Age'] = 5

d.loc[d.Glucose <= 60,'Glucose'] = 1
d.loc[(d.Glucose >60) & (d.Glucose <= 80),'Glucose'] = 2
d.loc[(d.Glucose >80) & (d.Glucose <= 140),'Glucose'] = 3
d.loc[(d.Glucose >140) & (d.Glucose <= 180),'Glucose'] = 4
d.loc[d.Glucose > 180,'Glucose'] = 5

d.loc[d.BloodPressure <= 60,'BloodPressure'] = 1
d.loc[(d.BloodPressure >60) & (d.BloodPressure <= 75),'BloodPressure'] = 2
d.loc[(d.BloodPressure >75) & (d.BloodPressure <= 90),'BloodPressure'] = 3
d.loc[(d.BloodPressure >90) & (d.BloodPressure <= 100),'BloodPressure'] = 4
d.loc[d.BloodPressure > 100,'BloodPressure'] = 5

d.loc[d.BMI < 19,'BMI'] = 1
d.loc[(d.BMI >= 19) & (d.BMI <= 24),'BMI'] = 2
d.loc[(d.BMI >24) & (d.BMI <= 30),'BMI'] = 3
d.loc[(d.BMI >30) & (d.BMI <= 40),'BMI'] = 4
d.loc[d.BMI > 40,'BMI'] = 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

In [538]:
d

Unnamed: 0,Glucose,BloodPressure,BMI,Age,Outcome
0,4,2,4.0,3,1
1,3,2,3.0,2,0
2,5,2,2.0,2,1
3,3,2,3.0,1,0
4,3,1,5.0,2,1
...,...,...,...,...,...
763,3,3,4.0,5,0
764,3,2,4.0,1,0
765,3,2,3.0,1,0
766,3,1,4.0,3,1


In [539]:
X = d.iloc[:, :-1].values
y = d.iloc[:, -1].values

In [540]:
X

array([[4., 2., 4., 3.],
       [3., 2., 3., 2.],
       [5., 2., 2., 2.],
       ...,
       [3., 2., 3., 1.],
       [3., 1., 4., 3.],
       [3., 2., 4., 1.]])

In [541]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [542]:
X_train

array([[4., 3., 4., 4.],
       [3., 1., 3., 1.],
       [4., 3., 5., 1.],
       ...,
       [3., 2., 3., 1.],
       [3., 2., 4., 2.],
       [3., 3., 4., 5.]])

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [544]:
log2 = LogisticRegression(random_state = 0)
log2.fit(X_train, y_train)

DT2 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DT2.fit(X_train, y_train)

RF2 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RF2.fit(X_train, y_train)

SV2 = SVC(kernel = 'linear', random_state = 0)
SV2.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

In [545]:
y_log2 = log2.predict(X_test)
y_DT2 = DT2.predict(X_test)
y_RF2 = RF2.predict(X_test)
y_SV2 = SV2.predict(X_test)

print("LOG=",accuracy_score(y_test, y_log2))
print("DT=",accuracy_score(y_test, y_DT2))
print("rf=",accuracy_score(y_test, y_RF2))
print("sv=",accuracy_score(y_test, y_SV2))

LOG= 0.7987012987012987
DT= 0.7987012987012987
rf= 0.8116883116883117
sv= 0.7857142857142857


In [546]:
cm_1 = confusion_matrix(y_test, y_log2)
cm_2 = confusion_matrix(y_test, y_DT2)
cm_3 = confusion_matrix(y_test, y_RF2)
cm_4 = confusion_matrix(y_test, y_SV2)

print("CM1=",cm_1)
print("CM2=",cm_2)
print("CM3=",cm_3)
print("CM4=",cm_4)


CM1= [[96 11]
 [20 27]]
CM2= [[100   7]
 [ 24  23]]
CM3= [[96 11]
 [18 29]]
CM4= [[96 11]
 [22 25]]
