In [None]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the diabetes CSV into a pandas DataFrame.
# The location is hard-coded; adjust DATASET_PATH if the file lives elsewhere.
DATASET_PATH = '/content/Dataset.csv'
diabetes_dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the top five records of the dataset
diabetes_dataset.head(n=5)

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,80.0,0,1,25.19,6.6,140,0
1,54.0,0,0,27.32,6.6,80,0
2,28.0,0,0,27.32,5.7,158,0
3,36.0,0,0,23.45,5.0,155,0
4,76.0,1,1,20.14,4.8,155,0


In [None]:
# Dataset dimensions as (row count, column count)
row_count, column_count = diabetes_dataset.shape
(row_count, column_count)


(100000, 7)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
summary_statistics = diabetes_dataset.describe()
summary_statistics

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Tally how many records fall into each value of the 'diabetes' label
class_counts = diabetes_dataset['diabetes'].value_counts()
class_counts

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [None]:
# Show the dtype pandas assigned to every column
column_dtypes = diabetes_dataset.dtypes
print(column_dtypes)


age                    float64
hypertension             int64
heart_disease            int64
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object


In [None]:
# Per-class mean of every numeric feature, grouped by the 'diabetes' label.
numeric_columns = diabetes_dataset.select_dtypes(include=['number']).columns
# The label column is numeric as well; leave it out of the aggregated columns so
# the result does not carry a redundant 'diabetes' column that merely repeats
# the group key.
feature_columns = numeric_columns.drop('diabetes')
diabetes_dataset.groupby('diabetes')[feature_columns].mean()

Unnamed: 0_level_0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,40.115187,0.058984,0.029235,26.887163,5.396761,132.85247,0.0
1,60.946588,0.245647,0.149059,31.988382,6.934953,194.094706,1.0


In [None]:
# separating the features (X) from the target labels (Y)
# `columns=` already names the axis, so no separate `axis=1` is needed
X = diabetes_dataset.drop(columns='diabetes')
Y = diabetes_dataset['diabetes']

In [None]:
# Hold out 20% of the rows for testing; stratify on Y so both parts keep the
# same class proportions. A fixed random_state makes the split (and every
# metric computed from it) repeatable across kernel restarts.
RANDOM_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=RANDOM_SEED
)

In [None]:
# Report the shapes of the full feature matrix and of its train / test parts
print(*(features.shape for features in (X, X_train, X_test)))

(100000, 6) (80000, 6) (20000, 6)


In [None]:
# Linear-kernel support vector classifier.
# The features sit on very different scales (0/1 flags next to glucose readings
# in the hundreds) and SVMs are not scale-invariant, so standardize the inputs
# inside a pipeline. The scaler is fitted by classifier.fit() and re-applied
# automatically by classifier.predict().
classifier = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))


In [None]:
# Fit the classifier on the training split
classifier.fit(X=X_train, y=Y_train)

In [None]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Display the accuracy measured on the training split
training_label = 'Accuracy score of the training data : '
print(training_label, training_data_accuracy)

Accuracy score of the training data :  0.9595125


In [None]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Display the accuracy measured on the held-out test split
test_label = 'Accuracy score of the test data : '
print(test_label, test_data_accuracy)

In [None]:
# Feature values for one person, listed in the same order as the columns of X
input_data = (80,0,1,25.19,6.6,140)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Shape the values as a single row and label them with the training column
# names. The classifier was fitted on a DataFrame, so passing a bare array
# would trigger scikit-learn's "X does not have valid feature names" warning
# and skip its column-name consistency check.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1), columns=X.columns
)

prediction = classifier.predict(input_data_reshaped)
print(prediction)

# Label 0 means not diabetic; any other label is reported as diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

In [None]:
import pickle


In [None]:
# Persist the trained classifier to disk. The context manager guarantees the
# file is flushed and closed, even if pickling raises.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# loading the saved model
# NOTE: pickle.load can execute arbitrary code, so only load files you created
# yourself or otherwise trust. The context manager closes the file after reading.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Feature values for one person, listed in the same order as the columns of X
input_data = (80,0,1,25.19,6.6,140)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Shape the values as a single row and label them with the training column
# names. The pickled model was fitted on a DataFrame, so passing a bare array
# would trigger scikit-learn's "X does not have valid feature names" warning
# and skip its column-name consistency check.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1), columns=X.columns
)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

# Label 0 means not diabetic; any other label is reported as diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')