# Diabetes risk estimation

# Importing libraries and dataset

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

df = pd.read_csv('C:/Users/USER/Desktop/dataset/diabetes/diabetes-cumulative.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,yes
1,1,85,66,29,0,26.6,0.351,31,no
2,8,183,64,0,0,23.3,0.672,32,yes
3,1,89,66,23,94,28.1,0.167,21,no
4,0,137,40,35,168,43.1,2.288,33,yes
...,...,...,...,...,...,...,...,...,...
2763,10,101,76,48,180,32.9,0.171,63,no
2764,2,122,70,27,0,36.8,0.340,27,no
2765,5,121,72,23,112,26.2,0.245,30,no
2766,1,126,60,0,0,30.1,0.349,47,yes


# Preprocessing

In [8]:
outcome_counts = df['Outcome'].value_counts()
outcome_counts

no     1715
yes    1053
Name: Outcome, dtype: int64

In [9]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [10]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [11]:
#YES~1 , NO~0
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
a = ['Outcome']
df[a] = df[a].apply(le.fit_transform)
df[a]

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
2763,0
2764,0
2765,0
2766,1


# Training and Evaluating the Machine Learning Model for Diabetes Prediction

In [13]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# تقسیم داده ها به داده های آموزشی و تست
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# استانداردسازی داده ها
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# استفاده از SMOTE برای افزایش نمونه های کلاس اقلیت
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

#یا

# استفاده از RandomUnderSampler برای کاهش نمونه های کلاس اکثریت
#rus = RandomUnderSampler(random_state=42)
#X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# انتخاب مدل (در اینجا جنگل تصادفی)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# آموزش مدل
model.fit(X_train_resampled, y_train_resampled)

# پیشبینی با استفاده از داده های تست
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# ارزیابی مدل بر روی داده های آموزشی
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f'Accuracy on training data: {accuracy_train:.3f}')

# ارزیابی مدل بر روی داده های تست
accuracy_test = accuracy_score(y_test, y_pred_test)
print(f'Accuracy on test data: {accuracy_test:.3f}')

# نمایش ماتریس اغتشاش و گزارش طبقه بندی برای داده های تست
cm = confusion_matrix(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

print('Confusion Matrix:')
print(cm)
print('Classification Report:')
print(report)
# استفاده از Cross-Validation برای بررسی تعمیم‌یافتگی مدل
scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Mean cross-validation score: {scores.mean():.3f}')

Accuracy on training data: 1.000
Accuracy on test data: 0.969
Confusion Matrix:
[[338   5]
 [ 12 199]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       343
           1       0.98      0.94      0.96       211

    accuracy                           0.97       554
   macro avg       0.97      0.96      0.97       554
weighted avg       0.97      0.97      0.97       554

Cross-validation scores: [0.99638989 1.         1.         1.         0.88788427]
Mean cross-validation score: 0.977
