In [1]:
# diabetes_prediction_model.ipynb

# Import Dependencies
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle
import joblib

# Data Collection and Analysis
# Load the CSV (expected in the current working directory) and print a short
# exploratory overview: preview rows, shape, summary statistics, label counts,
# and the per-label feature means.
diabetes_dataset = pd.read_csv("diabetes.csv")  # Make sure you have the dataset

# Each header and its table are emitted by one print call; sep="\n" puts the
# table on the line after the header.
print("First 5 rows:", diabetes_dataset.head(), sep="\n")

print(f"\nDataset shape: {diabetes_dataset.shape}")

print("\nStatistical summary:", diabetes_dataset.describe(), sep="\n")

print("\nOutcome distribution:", diabetes_dataset['Outcome'].value_counts(), sep="\n")

print("\nGrouped means:", diabetes_dataset.groupby('Outcome').mean(), sep="\n")

# Separating the data and labels
# `features` keeps the raw (unscaled) predictor columns; `y` is the label column.
features = diabetes_dataset.drop(columns='Outcome')
y = diabetes_dataset['Outcome']

# Train Test Split
# Split BEFORE scaling so that the held-out rows cannot influence the scaler's
# mean/variance (fitting the scaler on the full dataset leaks test-set
# statistics into preprocessing and makes the test accuracy optimistic).
# The split is stratified on the label and seeded, so the row assignment is
# reproducible.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, stratify=y, random_state=2
)

# Standardize the data
# Fit on the training partition only, then apply the same transform everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)
X_test = scaler.transform(features_test)

# Standardized view of the full feature matrix (using the training statistics),
# kept under the original names for any later code that refers to them.
standardized_data = scaler.transform(features)
X = standardized_data

print(f"\nData shapes - X: {X.shape}, X_train: {X_train.shape}, X_test: {X_test.shape}")

# Training the Model
# Fit a support vector classifier with a linear kernel on the training split.
classifier = svm.SVC(kernel='linear')
classifier.fit(X_train, y_train)

# Accuracy score on training data
# accuracy_score is documented as accuracy_score(y_true, y_pred): the ground
# truth goes first. Accuracy happens to be symmetric, but keeping the documented
# order avoids silently wrong results if this pattern is reused with an
# asymmetric metric (precision, recall, ...).
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print(f"Training Data Accuracy: {training_data_accuracy}")

# Accuracy score on test data
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(y_test, X_test_prediction)
print(f"Test Data Accuracy: {testing_data_accuracy}")

# Save the model and scaler
# Both objects are persisted: inference needs the scaler to preprocess inputs
# the same way the classifier's training data was preprocessed.
for artifact, artifact_path in (
    (classifier, 'diabetes_model.pkl'),
    (scaler, 'scaler.pkl'),
):
    joblib.dump(artifact, artifact_path)
print("Model and scaler saved successfully!")

# Test prediction
# Single hand-entered sample, one value per feature column in dataset order.
# NOTE: these are the same feature values as the dataset row at index 3 (whose
# Outcome is 0), so this is a smoke test of the pipeline, not evidence of
# generalization.
input_data = (1, 89, 66, 23, 94, 28.1, 0.167, 21)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)  # one row, all features

# The scaler was fitted on a DataFrame, so give it a DataFrame with the same
# column names. Passing a bare ndarray makes recent scikit-learn emit an
# "X does not have valid feature names" warning and loses the column-alignment
# check.
feature_names = diabetes_dataset.columns.drop('Outcome')
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)

std_data = scaler.transform(input_frame)
prediction = classifier.predict(std_data)

# Label 0 means not diabetic; any other label is reported as diabetic.
if prediction[0] == 0:
    print("The patient is not diabetic")
else:
    print("The patient is diabetic")

First 5 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset shape: (768, 9)

Statistical summary:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.36

