# Import the Dependencies 

In [2]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

%matplotlib inline
warnings.filterwarnings('ignore')

## Understanding the Features

-Pregnancies: This attribute represents the number of times a patient has been pregnant. It can be an important predictor of diabetes risk, as women who have had multiple pregnancies may be at higher risk.

-Glucose: This attribute represents the patient's plasma glucose concentration two hours into an oral glucose tolerance test, measured in milligrams per deciliter (mg/dL). It is a key indicator of diabetes risk, as high glucose levels can indicate impaired glucose tolerance or insulin resistance.

-BloodPressure: This attribute represents the patient's diastolic blood pressure, measured in millimeters of mercury (mmHg). High blood pressure can be a risk factor for diabetes, as it can damage blood vessels and impair insulin sensitivity.

-SkinThickness: This attribute represents the thickness of the patient's skinfold at the triceps, measured in millimeters. While skin thickness is not directly related to diabetes risk, it can be a useful predictor of insulin resistance.

-Insulin: This attribute represents the patient's serum insulin level, measured in microunits per milliliter (μU/mL). High insulin levels can be a sign of insulin resistance, which is a key risk factor for diabetes.

-BMI: This attribute represents the patient's body mass index, calculated as weight in kilograms divided by height in meters squared. Obesity is a major risk factor for diabetes, and BMI is a useful measure of overall body fatness.

-DiabetesPedigreeFunction: This attribute represents the patient's family history of diabetes, calculated as a function of the patient's relatives who have diabetes. A family history of diabetes can be a strong predictor of diabetes risk.

-Age: This attribute represents the patient's age in years. Age is an important predictor of diabetes risk, as older individuals are generally at higher risk.

-Outcome: This attribute represents whether or not the patient has been diagnosed with diabetes (1 if yes, 0 if no). This is the target variable that the machine learning model will be trained to predict.

In [3]:
# Raw string: in a normal literal the backslashes in this Windows path form
# invalid escape sequences (\D, \d), which newer Python versions warn about.
df = pd.read_csv(r'F:\Diabetes Prediction\diabetes.csv')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# (rows, columns) of the frame.
df.shape

In [6]:
# Print the column dtypes and non-null counts.
df.info()

In [7]:
# Number of missing values in each column.
df.isnull().sum()

In [8]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [9]:
# Pairwise correlation between the columns.
df.corr()

In [10]:
# Distinct values present in the Pregnancies column.
df["Pregnancies"].unique()

In [11]:
# Column labels of the frame.
df.columns

In [12]:
# Summary statistics for each column.
df.describe()

In [13]:
# Keep only the rows whose Pregnancies value is not 14, 15 or 17.
excluded_pregnancy_counts = [14, 15, 17]
df = df[~df['Pregnancies'].isin(excluded_pregnancy_counts)]


In [14]:
# Print the frequency table of every column, one column at a time.
for col, column_values in df.items():
    values_counts = column_values.value_counts()
    print(f'the values counts in {col} is \n{values_counts}')


In [None]:
# Pairwise scatter plots of all columns, with histograms on the diagonal.
# Selecting df[df.columns] yields every column, so the frame is passed as-is.
sns.pairplot(df, diag_kind='hist')


In [None]:
# Mean of every column within each Outcome group.
df.groupby("Outcome").mean()

In [None]:
def Heat_Map(data=None):
    """Plot an annotated heatmap of the pairwise column correlations.

    Args:
        data: DataFrame to analyse. When omitted, the notebook-level ``df``
            is used, so existing ``Heat_Map()`` calls behave as before.
    """
    frame = df if data is None else data
    corr = frame.corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    # Draw on the axes created above rather than relying on pyplot's
    # implicit "current axes" state.
    sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f', ax=ax)
    ax.set_title('Correlation Matrix Heatmap')
    fig.tight_layout()
    plt.show()


Heat_Map()

In [None]:
# Define a function to create a box plot for each column
def plot_outliers_boxplot(data):
    """Draw one box plot per column of ``data``, side by side in one figure.

    Args:
        data: DataFrame whose columns are plotted. A frame with no columns
            produces no figure.
    """
    columns = list(data.columns)
    if not columns:
        # plt.subplots rejects ncols=0, and there is nothing to draw anyway.
        return

    # squeeze=False always returns a 2-D array of axes, so a single-column
    # frame no longer yields a bare Axes object that cannot be indexed.
    fig, axs = plt.subplots(ncols=len(columns), figsize=(20, 5), squeeze=False)

    # One row of axes: pair each axes with the column it displays.
    for ax, col in zip(axs[0], columns):
        sns.boxplot(x=data[col], ax=ax)
        ax.set_xlabel(col)

    fig.tight_layout()
    plt.show()


plot_outliers_boxplot(df)

In [None]:
# Split the rows by the Outcome label: df1 holds Outcome == 1, df0 holds Outcome == 0.
df1 = df.loc[df['Outcome'] == 1]
df0 = df.loc[df['Outcome'] == 0]


In [None]:
# Glucose vs. insulin for both Outcome groups. Axis labels and a legend are
# added so the two marker styles can be told apart.
plt.scatter(df1['Glucose'], df1['Insulin'], color='blue', marker='+', label='Outcome = 1')
plt.scatter(df0['Glucose'], df0['Insulin'], color='red', marker='.', label='Outcome = 0')
plt.xlabel('Glucose')
plt.ylabel('Insulin')
plt.legend()

In [None]:
# Target: the Outcome column. Features: every other column.
y = df['Outcome']
x = df.drop(columns=['Outcome'])

In [None]:
# Scaler instance used to standardise the feature matrix.
Scaler = StandardScaler()

In [None]:
# Fit the scaler on x and return the standardised features.
# NOTE(review): the train/test split in this file is given the unscaled `x`,
# so `x_standrize` is not consumed by the model - confirm whether that is intended.
x_standrize = Scaler.fit_transform(x)

In [None]:
# Stratified 80/20 split with a fixed seed.
# The feature splits are bound to X_train / X_test (capital X) because those
# are the names the training and evaluation cells read.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2
)

In [None]:
# Shape of the held-out feature matrix.
X_test.shape

In [None]:
# Linear-kernel support vector classifier. The file imports SVC directly
# (`from sklearn.svm import SVC`), so no `svm` name is bound here.
clf = SVC(kernel='linear')

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train , y_train)

In [None]:
# Accuracy on the data the model was trained on.
# Despite its name, X_train_acc holds the predicted labels for X_train.
X_train_acc = clf.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): true labels come first.
training_acc = accuracy_score(y_train, X_train_acc)
training_acc

In [None]:
# Predicted labels for the held-out split.
y_predict = clf.predict(X_test)

In [None]:
# Print the per-class metrics report for the held-out predictions.
print(classification_report(y_test,y_predict))


In [None]:
# Classify one hand-written sample.
# Presumably the eight values follow the column order of `x` - verify.
input_data = (11,138,74,26,144,36.1,0.557,50)
input_data_array = np.array(input_data)
# Add a leading axis so the sample is a single row: shape (1, n_values).
input_data_array_reshape = input_data_array[np.newaxis, :]
prediction = clf.predict(input_data_array_reshape)
print(prediction)

# Saving Model

In [None]:
# Persist the trained classifier to disk. The context manager closes the file
# even if pickling fails. pickle.dump returns None, so its result is not kept.
with open('diabetes.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the classifier from disk; the context manager closes the file.
# NOTE: unpickling can execute arbitrary code - only load files from a
# trusted source.
with open('diabetes.sav', 'rb') as model_file:
    loadind_model = pickle.load(model_file)

In [None]:
# Classify one hand-written sample with the reloaded model and report the
# result in words.
input_data = (5,116,74,0,0,25.6,0.201,30)
input_data_array = np.array(input_data)
# Add a leading axis so the sample is a single row: shape (1, n_values).
input_data_array_reshape = input_data_array[np.newaxis, :]
prediction = loadind_model.predict(input_data_array_reshape)
verdict = (
    "this is a non-diabetic person"
    if prediction[0] == 0
    else "this is a diabetic person"
)
print(verdict)