**Task 3: Diabetes Prediction**

In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Column groups, named once so the feature matrix and the transformer
# below are built from the same lists.
NUMERIC_FEATURES = [
    'age', 'hypertension', 'heart_disease',
    'bmi', 'HbA1c_level', 'blood_glucose_level',
]
CATEGORICAL_FEATURES = ['gender', 'smoking_history']
# Order of the columns in the feature matrix X.
FEATURE_ORDER = [
    'gender', 'age', 'hypertension', 'heart_disease',
    'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level',
]

# Load the dataset and print the first rows as a quick look at the data.
df = pd.read_csv('/content/diabetes_prediction_dataset.csv')
print(df.head())

X = df[FEATURE_ORDER]  # Features
y = df['diabetes']     # Target label

# Standardise the numeric columns; one-hot encode the categorical ones,
# dropping the first level of each.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_FEATURES),
        ('cat', OneHotEncoder(drop='first'), CATEGORICAL_FEATURES),
    ]
)

# Splitting the data.
# The diabetes classes are imbalanced, so stratify on y to keep the same
# positive/negative ratio in the training and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# One unfitted estimator per display name. The randomised learners get a
# fixed random_state so repeated runs report the same metrics.
classifiers = {
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
}

# Pipeline does not copy its steps, so every pipeline gets its own clone of
# the preprocessor rather than sharing (and re-fitting) a single instance.
models = {
    label: Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('classifier', estimator)])
    for label, estimator in classifiers.items()
}

# Train the models
# Fit each pipeline on the training split, predict on the held-out split,
# and print accuracy, confusion matrix and the per-class report.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f'{name} Accuracy: {accuracy:.2f}')
    print(f'Confusion Matrix:\n{matrix}\n')
    print(f'Classification Report:\n{report}\n')

def predict_diabetes(input_data, model, *, feature_columns=None):
    """Classify a single patient record with a fitted model.

    Args:
        input_data: Feature values for one patient, ordered like
            ``feature_columns``.
        model: Fitted estimator or pipeline whose ``predict`` accepts a
            one-row DataFrame.
        feature_columns: Column labels to attach to ``input_data``. When
            omitted, the columns of the module-level feature frame ``X``
            are used.

    Returns:
        ``'Diabetes'`` when the predicted label equals 1, otherwise
        ``'No Diabetes'``.
    """
    if feature_columns is None:
        # Fall back to the module-level feature frame so existing callers
        # that pass only (input_data, model) keep working.
        feature_columns = X.columns
    # Wrap the record in a one-row frame so column-name based transformers
    # can locate each feature.
    input_df = pd.DataFrame([input_data], columns=feature_columns)
    prediction = model.predict(input_df)
    return 'Diabetes' if prediction[0] == 1 else 'No Diabetes'

# Example: classify one hand-written record with the random forest pipeline.
# Values are listed in the same order as the columns of X.
example_input = [
    'Male',         # gender
    67,             # age
    0,              # hypertension
    1,              # heart_disease
    'not current',  # smoking_history
    27.32,          # bmi
    6.5,            # HbA1c_level
    200,            # blood_glucose_level
]
best_model = models['Random Forest Classifier']
result = predict_diabetes(input_data=example_input, model=best_model)
print(f'The prediction for the example input is: {result}')


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
Random Forest Classifier Accuracy: 0.97
Confusion Matrix:
[[18224    68]
 [  528  1180]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.95      0.69      0.80      1708

    accu