In [2]:
import numpy as np
import pandas as pd

# to standardize the data to a common range
from sklearn.preprocessing import StandardScaler

# to split the data into training and test data
from sklearn.model_selection import train_test_split

# Support Vector Machine Model
from sklearn import svm

# Accuracy score
from sklearn.metrics import accuracy_score

In [3]:
# Read the Pima Indians diabetes CSV into a pandas DataFrame
diabetes_dataset = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')

# Preview the first 5 rows.
# 'Outcome' is the label column: 1 = diabetic, 0 = non-diabetic
diabetes_dataset.head(5)

In [4]:
# (number of rows, number of columns) of the dataset
diabetes_dataset.shape

In [5]:
# Summary statistics for every numeric column.
# std is the standard deviation; a percentile row such as 25% gives the value that 25% of the rows fall below
diabetes_dataset.describe()

In [6]:
# Count how many rows carry each label, i.e. diabetic (1) versus non-diabetic (0) people
diabetes_dataset['Outcome'].value_counts()

In [7]:
# Mean of every column computed separately for each distinct value of 'Outcome' (here 1 and 0),
# i.e. the average measurements of diabetic and non-diabetic people side by side
diabetes_dataset.groupby('Outcome').mean()

In [8]:
# Split the frame into the label vector (Y) and the feature matrix (X).
# `columns=` already targets the column axis, so no separate `axis` argument is required.
Y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns='Outcome')

In [9]:
# Show the feature matrix to confirm the 'Outcome' column is no longer part of it
print(X)

In [10]:
# Show the label vector (the 'Outcome' column)
print(Y)

In [11]:
# Data standardization: the feature columns live on very different scales (e.g. pregnancy counts are
# single digits while glucose readings are in the hundreds), which can dominate a distance-based model.
# StandardScaler rescales every column to zero mean and unit variance; it does NOT squeeze values
# into a fixed interval such as [0, 1].

# taking an instance of StandardScaler
scaler = StandardScaler()

# Fit on the raw feature columns taken straight from the DataFrame instead of the global `X`:
# `X` is rebound to the standardized array in the next cell, so fitting on `X` would silently fit
# on already-scaled data if this cell were re-run.
raw_features = diabetes_dataset.drop(columns='Outcome')

# NOTE(review): the scaler is fitted on every row before the train/test split further down, so
# statistics of the test rows influence the scaling used for training. For a stricter evaluation,
# fit the scaler on the training split only.
# scaler.fit_transform() would do both steps at once; they are kept separate here for clarity.
scaler.fit(raw_features)

# apply the learned per-column mean and standard deviation
standardized_data = scaler.transform(raw_features)

print(standardized_data)

In [12]:
# From here on X holds the standardized feature array and Y the 'Outcome' labels
X, Y = standardized_data, diabetes_dataset['Outcome']

In [13]:
# Hold out part of the data for evaluation.
#   test_size    - fraction of rows reserved for testing (0.2 = 20 percent)
#   stratify     - keeps the diabetic / non-diabetic proportion the same in both splits; without it
#                  one class could end up over-represented in the training or the test set
#   random_state - seed for the shuffle; the same value reproduces the same split,
#                  a different value gives a different split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [14]:
# Shapes of the full feature array, the training split and the test split, in that order
print(*(part.shape for part in (X, X_train, X_test)))

In [19]:
# Create the model (it is trained in the next step).
# svm = support vector machine, SVC = support vector classifier; kernel='linear' selects a linear model
classifier = svm.SVC(kernel='linear')

In [20]:
# Train the support vector machine classifier on the training split
classifier.fit(X_train, Y_train)

In [22]:
# Model evaluation with the accuracy score (fraction of correctly classified rows).
# For a dataset this small, a score above roughly 0.75 is treated as good here;
# further tuning could push it higher.

# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data : ', training_data_accuracy)

In [23]:
# accuracy score on the held-out test data
X_test_prediction = classifier.predict(X_test)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)

In [None]:
# Observed above: the accuracy is about 78% on the training data and about 77% on the test data.
# The two scores are close, which is good: it suggests the model is not overfitted. An overfitted
# model scores high on the training data but noticeably lower on unseen test data.

In [None]:
# Making a Predictive System

# Measurements of one person; assumed to follow the column order of the training features
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape to a single row, since we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# A scaler fitted on a DataFrame records the column names and warns when it is later handed a bare
# array. Label the row with those names when the scaler has them; otherwise pass the array as is.
scaler_columns = getattr(scaler, 'feature_names_in_', None)
if scaler_columns is None:
    row_to_scale = input_data_reshaped
else:
    row_to_scale = pd.DataFrame(input_data_reshaped, columns=scaler_columns)

# standardize the input with the statistics the scaler learned earlier
std_data = scaler.transform(row_to_scale)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

# prediction is an array containing one label: 0 = non-diabetic, 1 = diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')