<a href="https://colab.research.google.com/github/aroy2025/Projects/blob/main/arijit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.impute import KNNImputer

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific `google.colab` module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): bare reference to the function object (no call) - it loads nothing and this cell can be removed.
pd.read_csv

In [None]:
# Load the dataset from the CSV file stored on the mounted Drive.
diabetes_dataset=pd.read_csv('/content/drive/MyDrive/data.csv')

In [None]:
# Treat a literal 0 in any of these columns as a missing value and replace it with NaN.
cols_missing_val = [
  'Glucose',
  'BloodPressure',
  'SkinThickness',
  'Insulin',
  'BMI',
  'DiabetesPedigreeFunction',
  'Age',
]
zero_as_nan = diabetes_dataset[cols_missing_val].replace(0, np.nan)
diabetes_dataset[cols_missing_val] = zero_as_nan

In [None]:
# Fill every remaining NaN with the mean of its column computed within the
# row's own Outcome class. `transform('mean')` broadcasts each class mean back
# onto the rows of that class, so a single vectorised `fillna` replaces the
# nested loops. The three repeated assignments that used to follow the inner
# loop reused the last `label`/`mean_value` and had nothing left to fill, so
# they are dropped.
# NOTE(review): this imputation reads the target label (`Outcome`), so the
# filled values carry label information into the features used for training.
class_means = diabetes_dataset.groupby('Outcome')[cols_missing_val].transform('mean')
diabetes_dataset[cols_missing_val] = diabetes_dataset[cols_missing_val].fillna(class_means)


In [None]:
# Partition the rows into the two Outcome classes.
outcome = diabetes_dataset['Outcome']
class_0 = diabetes_dataset.loc[outcome == 0]
class_1 = diabetes_dataset.loc[outcome == 1]

In [None]:
# Take independent copies of each class subset ('Outcome' column included).
# No imputation is performed in this cell.
data_class_0, data_class_1 = class_0.copy(), class_1.copy()

In [None]:
# Stack the two class subsets again and sort by index to restore the row order.
recombined = pd.concat([data_class_0, data_class_1])
diabetes_dataset = recombined.sort_index()

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
diabetes_dataset = diabetes_dataset.reset_index(drop=True)

In [None]:
# Preview the first rows with the notebook's rich table rendering, as the
# other cells do, instead of a wrapped plain-text print.
display(diabetes_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            4     83.0           88.0           25.0    156.0  20.6   
1            1    124.0           59.0            3.0     39.0  34.1   
2            3    119.0           73.0           27.0    140.0  36.3   
3            3    116.0           64.0           25.0      7.0  23.4   
4            1    116.0           70.0           36.0    149.0  28.3   

   DiabetesPedigreeFunction   Age  Gender FamilyHistory Smoking  \
0                      0.56  47.0  Female           Yes      No   
1                      0.60  58.0    Male            No      No   
2                      0.71  50.0    Male            No     Yes   
3                      0.49  67.0  Female            No      No   
4                      0.80  56.0  Female           Yes     Yes   

  PhysicalActivity  SleepHours  HbA1c  
0              Low         5.0    6.2  
1           Medium         6.4    8.3  
2             High         8.4    6.2  
3   

In [None]:
# (rows, columns) of the dataset
diabetes_dataset.shape

(5000, 14)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,SleepHours,HbA1c
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,1.9792,120.017,69.6578,19.660004,98.994786,29.8676,0.505444,48.4664,6.53502,6.01092
std,1.391242,30.220393,12.035318,9.528948,49.86018,7.147856,0.294807,17.768951,1.510115,1.188555
min,0.0,10.0,23.0,-16.0,-76.0,-1.3,-0.53,18.0,0.3,1.5
25%,1.0,100.0,62.0,13.0,65.0,24.9,0.3,33.0,5.5,5.2
50%,2.0,120.0,70.0,20.0,98.994786,29.9,0.51,49.0,6.5,6.0
75%,3.0,141.0,78.0,26.0,134.0,34.7,0.71,64.0,7.6,6.8
max,8.0,254.0,117.0,52.0,284.0,56.1,1.56,79.0,11.7,10.2


In [None]:
# Number of rows in each Outcome class
diabetes_dataset['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
1,2817
0,2183


In [None]:
# Per-class mean of every numeric column
outcome_means = diabetes_dataset.groupby('Outcome').mean(numeric_only=True)
display(outcome_means)

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,SleepHours,HbA1c
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.963811,111.073294,69.761338,19.43497,99.394855,26.922446,0.505845,48.721942,6.525103,5.587998
1,1.991125,126.947817,69.577565,19.834391,98.684758,32.149911,0.505132,48.268371,6.542705,6.338658


In [None]:
# Split the frame into the target vector Y and the feature matrix X.
Y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns=['Outcome'])

In [None]:
# Preview the first feature rows with rich rendering rather than printing the whole frame.
display(X.head())

      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0               4     83.0           88.0           25.0    156.0  20.6   
1               1    124.0           59.0            3.0     39.0  34.1   
2               3    119.0           73.0           27.0    140.0  36.3   
3               3    116.0           64.0           25.0      7.0  23.4   
4               1    116.0           70.0           36.0    149.0  28.3   
...           ...      ...            ...            ...      ...   ...   
4995            3     94.0           75.0           18.0    151.0  35.6   
4996            0    132.0           79.0           17.0    157.0  24.8   
4997            4    120.0           57.0           17.0     31.0  24.3   
4998            1    110.0           92.0           10.0    130.0  23.7   
4999            0     92.0           75.0           34.0    140.0  38.1   

      DiabetesPedigreeFunction   Age  Gender FamilyHistory Smoking  \
0                         0.5

In [None]:
# Preview the first target values rather than printing the whole series.
display(Y.head())

0       0
1       1
2       0
3       0
4       0
       ..
4995    1
4996    0
4997    0
4998    0
4999    1
Name: Outcome, Length: 5000, dtype: int64


In [None]:
# Standardiser for the numeric features; it is fitted in the next cell.
scaler = StandardScaler()

In [None]:
# Select only the numeric columns for scaling (non-numeric columns are excluded by dtype)
X_numeric = X.select_dtypes(include=np.number)

# Fit the scaler on the numeric data
# NOTE(review): the fit uses every row of X, i.e. it happens before the train/test split.
scaler.fit(X_numeric)

In [None]:
# Standardise the numeric columns with the already-fitted scaler and show the
# resulting array. The non-numeric columns are not part of this output.
numeric_only = X.select_dtypes(include=np.number)
X_numeric_scaled = scaler.transform(numeric_only)
display(X_numeric_scaled)

array([[ 1.45266030e+00, -1.22502384e+00,  1.52418367e+00, ...,
        -8.25342287e-02, -1.01659370e+00,  1.59099824e-01],
       [-7.03901904e-01,  1.31811599e-01, -8.85632297e-01, ...,
         5.36585054e-01, -8.94193442e-02,  1.92612769e+00],
       [ 7.33806233e-01, -3.36561377e-02,  2.77727135e-01, ...,
         8.63164847e-02,  1.23511545e+00,  1.59099824e-01],
       ...,
       [ 1.45266030e+00, -5.62590306e-04, -1.05182650e+00, ...,
        -1.03935494e+00, -1.01659370e+00,  9.16397482e-01],
       [-7.03901904e-01, -3.31498064e-01,  1.85657208e+00, ...,
        -1.65847422e+00, -1.94376806e+00,  4.11532376e-01],
       [-1.42275597e+00, -9.27181918e-01,  4.43921340e-01, ...,
         8.74286480e-01,  1.09260875e-01, -7.66486203e-01]])

In [None]:
# Stratified 80/20 train/test split of the raw feature frame X (seeded with random_state=2).
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [None]:
# Shapes of the full, training and test feature sets
print(X.shape, X_train.shape, X_test.shape)

(5000, 14) (4000, 14) (1000, 14)


In [None]:
# Random forest with a fixed seed; it is fitted in a later cell.
classifier = RandomForestClassifier(random_state=42)

# Task
Explain the error in the selected code. If possible, fix the error and incorporate the changes into the existing code. Otherwise, try to diagnose the error.

## Handle categorical features

### Subtask:
Convert the non-numeric columns in the dataset to a numerical format using one-hot encoding.


**Reasoning**:
Identify and one-hot encode the non-numeric columns in the DataFrame X.



In [None]:
# One-hot encode every non-numeric column of X; numeric columns pass through unchanged.
X_non_numeric = X.select_dtypes(exclude=np.number)
categorical_columns = list(X_non_numeric.columns)
X_encoded = pd.get_dummies(X, columns=categorical_columns)
display(X_encoded.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,SleepHours,HbA1c,Gender_Female,Gender_Male,FamilyHistory_No,FamilyHistory_Yes,Smoking_No,Smoking_Yes,PhysicalActivity_High,PhysicalActivity_Low,PhysicalActivity_Medium
0,4,83.0,88.0,25.0,156.0,20.6,0.56,47.0,5.0,6.2,True,False,False,True,True,False,False,True,False
1,1,124.0,59.0,3.0,39.0,34.1,0.6,58.0,6.4,8.3,False,True,True,False,True,False,False,False,True
2,3,119.0,73.0,27.0,140.0,36.3,0.71,50.0,8.4,6.2,False,True,True,False,False,True,True,False,False
3,3,116.0,64.0,25.0,7.0,23.4,0.49,67.0,6.9,6.2,True,False,True,False,True,False,False,False,True
4,1,116.0,70.0,36.0,149.0,28.3,0.8,56.0,2.7,6.0,True,False,False,True,False,True,False,False,True


## Combine features

### Subtask:
Combine the scaled numeric features with the one-hot encoded categorical features to create the final feature set.


**Reasoning**:
Separate numeric and non-numeric columns, scale the numeric columns, one-hot encode the non-numeric columns, and then concatenate them to create the final feature set.



In [None]:
# Build the model-ready feature matrix: standardised numeric columns plus
# one-hot encoded non-numeric columns.
X_numeric = X.select_dtypes(include=np.number)
X_non_numeric = X.select_dtypes(exclude=np.number)

# `scaler.transform` returns a bare array, so rebuild the frame with X's own
# index as well as its column names. `pd.concat(axis=1)` aligns on row labels;
# without the index the scaled frame would get 0..n-1 labels and the join
# would produce NaN rows whenever X's index is anything else.
X_numeric_scaled = pd.DataFrame(
  scaler.transform(X_numeric),
  columns=X_numeric.columns,
  index=X_numeric.index,
)

# Apply one-hot encoding
X_non_numeric_encoded = pd.get_dummies(X_non_numeric)

# Concatenate the scaled numeric and encoded non-numeric columns
X_processed = pd.concat([X_numeric_scaled, X_non_numeric_encoded], axis=1)

# Display the head of the final feature set
display(X_processed.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,SleepHours,HbA1c,Gender_Female,Gender_Male,FamilyHistory_No,FamilyHistory_Yes,Smoking_No,Smoking_Yes,PhysicalActivity_High,PhysicalActivity_Low,PhysicalActivity_Medium
0,1.45266,-1.225024,1.524184,0.560453,1.143416,-1.296686,0.185076,-0.082534,-1.016594,0.1591,True,False,False,True,True,False,False,True,False
1,-0.703902,0.131812,-0.885632,-1.748532,-1.203381,0.592181,0.320772,0.536585,-0.089419,1.926128,False,True,True,False,True,False,False,False,True
2,0.733806,-0.033656,0.277727,0.770361,0.822486,0.899996,0.693934,0.086316,1.235115,0.1591,False,True,True,False,False,True,True,False,False
3,0.733806,-0.132937,-0.470147,0.560453,-1.84524,-0.904921,-0.052391,1.043137,0.241714,0.1591,True,False,True,False,True,False,False,False,True
4,-0.703902,-0.132937,0.028436,1.714946,1.003009,-0.219332,0.999248,0.424018,-2.539809,-0.009189,True,False,False,True,False,True,False,False,True


## Split data

### Subtask:
Split the combined feature set into training and testing sets.


**Reasoning**:
Split the processed feature set and the target variable into training and testing sets.



In [None]:
# Stratified 80/20 split of the processed features, then report the four shapes.
split_parts = train_test_split(X_processed, Y, test_size=0.2, stratify=Y, random_state=2)
X_train_processed, X_test_processed, Y_train, Y_test = split_parts
print(X_train_processed.shape, X_test_processed.shape, Y_train.shape, Y_test.shape)

(4000, 19) (1000, 19) (4000,) (1000,)


## Train model

### Subtask:
Train the RandomForestClassifier model using the preprocessed training data.


**Reasoning**:
The previous step failed because the input data to the classifier contained string values. The data has been preprocessed to handle categorical features and the preprocessed data `X_train_processed` is available. Now, train the RandomForestClassifier model using the preprocessed training data.



In [None]:
# Fit the random forest on the processed training features.
classifier.fit(X_train_processed, Y_train)

In [None]:
# Pipeline: mean imputation -> standardisation -> seeded random forest.
pipeline_steps = (
  SimpleImputer(strategy='mean'),
  StandardScaler(),
  RandomForestClassifier(random_state=42),
)
pipeline = make_pipeline(*pipeline_steps)

In [None]:
# Run 5 fold cross validation on the training split
# NOTE(review): the numeric columns of X_train_processed were already standardised
# with `scaler`, so the pipeline's own StandardScaler rescales scaled data.
scores = cross_val_score(pipeline, X_train_processed, Y_train, cv=5)

In [None]:
# Show the score obtained on each of the five folds
print("Cross-validated Accuracy Scores :", scores)

Cross-validated Accuracy Scores : [1. 1. 1. 1. 1.]


**Reasoning**:
Separate numeric and non-numeric columns, scale the numeric columns, one-hot encode the non-numeric columns, and then concatenate them to create the final feature set.

In [None]:
# Rebuild the model-ready feature matrix: standardised numeric columns plus
# one-hot encoded non-numeric columns.
X_numeric = X.select_dtypes(include=np.number)
X_non_numeric = X.select_dtypes(exclude=np.number)

# Keep X's row labels on the scaled frame: `scaler.transform` returns a bare
# array, and `pd.concat(axis=1)` aligns on the index, so a default 0..n-1
# index would misalign the two halves whenever X is indexed differently.
X_numeric_scaled = pd.DataFrame(
  scaler.transform(X_numeric),
  columns=X_numeric.columns,
  index=X_numeric.index,
)

# Apply one-hot encoding
X_non_numeric_encoded = pd.get_dummies(X_non_numeric)

# Concatenate the scaled numeric and encoded non-numeric columns
X_processed = pd.concat([X_numeric_scaled, X_non_numeric_encoded], axis=1)

# Display the head of the final feature set
display(X_processed.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,SleepHours,HbA1c,Gender_Female,Gender_Male,FamilyHistory_No,FamilyHistory_Yes,Smoking_No,Smoking_Yes,PhysicalActivity_High,PhysicalActivity_Low,PhysicalActivity_Medium
0,1.45266,-1.225024,1.524184,0.560453,1.143416,-1.296686,0.185076,-0.082534,-1.016594,0.1591,True,False,False,True,True,False,False,True,False
1,-0.703902,0.131812,-0.885632,-1.748532,-1.203381,0.592181,0.320772,0.536585,-0.089419,1.926128,False,True,True,False,True,False,False,False,True
2,0.733806,-0.033656,0.277727,0.770361,0.822486,0.899996,0.693934,0.086316,1.235115,0.1591,False,True,True,False,False,True,True,False,False
3,0.733806,-0.132937,-0.470147,0.560453,-1.84524,-0.904921,-0.052391,1.043137,0.241714,0.1591,True,False,True,False,True,False,False,False,True
4,-0.703902,-0.132937,0.028436,1.714946,1.003009,-0.219332,0.999248,0.424018,-2.539809,-0.009189,True,False,False,True,False,True,False,False,True


**Reasoning**:
Split the processed feature set and the target variable into training and testing sets.

In [None]:
# Stratified 80/20 split of the processed features, then report the four shapes.
split_result = train_test_split(
  X_processed,
  Y,
  test_size=0.2,
  stratify=Y,
  random_state=2,
)
X_train_processed, X_test_processed, Y_train, Y_test = split_result
print(X_train_processed.shape, X_test_processed.shape, Y_train.shape, Y_test.shape)

(4000, 19) (1000, 19) (4000,) (1000,)


In [None]:
# Fit the random forest on the processed training features.
classifier.fit(X_train_processed, Y_train)

**Reasoning**:
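Evaluate the performance of the trained model on the held-out test data (`X_test_processed`) using accuracy, precision, recall, F1-score, and the confusion matrix. Note that the scaler is fitted on all rows before the split and missing values are imputed with class-specific means, so near-perfect scores here should be checked for data leakage before they are trusted.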

In [None]:
# Predict on the held-out split and compute the classification metrics.
Y_pred = classifier.predict(X_test_processed)

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)
class_report = classification_report(Y_test, Y_pred)

# Report the scalar metrics to four decimals, then the matrix and the full report.
for metric_name, metric_value in (
  ("Accuracy", accuracy),
  ("Precision", precision),
  ("Recall", recall),
  ("F1 Score", f1),
):
  print(f"{metric_name}: {metric_value:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

Confusion Matrix:
[[437   0]
 [  0 563]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       437
           1       1.00      1.00      1.00       563

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [None]:


# Function to take user input and predict
def predict_diabetic_status(model, scaler, numeric_cols, categorical_cols, trained_columns):
  """
  Prompt for one person's feature values, preprocess them like the training data, and predict.

  Numeric answers are re-prompted until they parse as a finite number.
  Categorical answers are matched, ignoring case and surrounding whitespace,
  against the categories that appear in `trained_columns` as
  '<column>_<category>' names, and are re-prompted until one matches. This
  prevents a mistyped or differently-cased category from being silently
  encoded as all zeros. If no such names exist for a column, the raw answer
  is accepted unchanged.

  Args:
    model: Trained model exposing `predict(frame)`.
    scaler: Fitted scaler exposing `transform(frame)` for the numeric columns.
    numeric_cols: List of the original numeric column names.
    categorical_cols: List of the original categorical column names.
    trained_columns: List of the final column names the model was trained on,
      in training order (numeric names followed by one-hot names).

  Returns:
    A string stating whether the person is predicted to be diabetic or
    non-diabetic, or a "Prediction failed: ..." message if `model.predict` raises.
  """
  print("Please enter the following information:")

  # Every trained column that is not an original numeric column is a one-hot column.
  trained_categorical_cols = [col for col in trained_columns if col not in numeric_cols]

  # Collect one validated answer per original column, numeric columns first.
  user_data = {}
  for col in numeric_cols + categorical_cols:
    if col in numeric_cols:
      user_data[col] = _read_numeric(col)
    else:
      options = _known_categories(col, trained_categorical_cols)
      user_data[col] = _read_categorical(col, options)

  user_df = pd.DataFrame([user_data])

  # Scale the numeric answers with the fitted scaler; keep the row label so the
  # column-wise concat below aligns with the encoded categorical frame.
  user_numeric_scaled_df = pd.DataFrame(
    scaler.transform(user_df[numeric_cols]),
    columns=numeric_cols,
    index=user_df.index,
  )

  # One-hot encode the answers, then add the dummy columns for the categories
  # the user did not pick (filled with 0) so the layout matches training.
  user_categorical_encoded = pd.get_dummies(user_df[categorical_cols])
  user_categorical_encoded = user_categorical_encoded.reindex(
    columns=trained_categorical_cols, fill_value=0
  )

  # Join both halves and put the columns in the exact training order.
  user_processed = pd.concat([user_numeric_scaled_df, user_categorical_encoded], axis=1)
  user_processed = user_processed[trained_columns]

  try:
    prediction = model.predict(user_processed)

    if prediction[0] == 1:
      return "The person is predicted to be diabetic."
    else:
      return "The person is predicted to be non-diabetic."
  except Exception as e:
    # Report the failure as text, matching the documented return contract.
    return f"Prediction failed: {e}"


def _known_categories(col, trained_categorical_cols):
  """Return the categories of `col` recovered from '<col>_<category>' column names."""
  prefix = f"{col}_"
  return [name[len(prefix):] for name in trained_categorical_cols if name.startswith(prefix)]


def _read_numeric(col):
  """Prompt until the answer for `col` parses as a finite float, then return it."""
  while True:
    try:
      value = float(input(f"Enter value for {col} (numeric): "))
      # float() accepts 'nan' and 'inf'; reject them like any other bad input.
      if not np.isfinite(value):
        raise ValueError("non-finite value")
      return value
    except ValueError:
      print(f"Invalid input for {col}. Please enter a numeric value.")


def _read_categorical(col, options):
  """Prompt until the answer for `col` matches one of `options`; return the matched option."""
  if not options:
    # No one-hot columns are known for this feature: accept the raw answer.
    return input(f"Enter value for {col} (e.g., Female, Male, Yes, No, High, Medium, Low): ")

  by_folded = {option.casefold(): option for option in options}
  choices = ", ".join(options)
  while True:
    answer = input(f"Enter value for {col} ({choices}): ").strip().casefold()
    if answer in by_folded:
      # Return the spelling used in training so get_dummies yields the trained column name.
      return by_folded[answer]
    print(f"Invalid input for {col}. Please enter one of: {choices}.")


# Column-name lists required by the interactive predictor: the original
# numeric and non-numeric feature names, and the processed training columns.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)
processed_column_names = list(X_processed.columns)

# Prompt for one person's values and print the model's verdict.
prediction_result = predict_diabetic_status(
  classifier,
  scaler,
  numeric_features,
  categorical_features,
  processed_column_names,
)
print(prediction_result)

Please enter the following information:
Enter value for Pregnancies (numeric): 55
Enter value for Glucose (numeric): 55
