In [4]:
# ============== FINAL CORRECTED TRAINING SCRIPT ==============

# --- 1. Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pickle
print("Libraries imported successfully.")

# --- 2. Load and Prepare Data (Milestone 2) ---
print("🔹 Loading dataset...")
# CORRECTED FILENAME HERE
data = pd.read_csv('chronickidneydisease.csv') 
# The guided project PDF shows an 'id' column which is not useful for prediction, let's drop it if it exists.
if 'id' in data.columns:
    data = data.drop('id', axis=1)

# --- 3. Rename Columns ---
# Manually rename columns as per the guided project to ensure consistency
data.columns = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell', 
    'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium', 
    'potassium', 'hemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count', 
    'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'pedal_edema', 'anemia', 'class'
]
print("✅ Columns renamed.")

# --- 4. Correcting Data Types and Errors ---
# These columns are meant to be numeric but contain non-numeric characters like '?'.
# We will convert them to numbers, and any values that can't be converted will become NaN (Not a Number).
data['packed_cell_volume'] = pd.to_numeric(data['packed_cell_volume'], errors='coerce')
data['white_blood_cell_count'] = pd.to_numeric(data['white_blood_cell_count'], errors='coerce')
data['red_blood_cell_count'] = pd.to_numeric(data['red_blood_cell_count'], errors='coerce')

# Correcting specific typos found in the categorical columns of the dataset
data['diabetes_mellitus'].replace(to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'}, inplace=True)
data['coronary_artery_disease'] = data['coronary_artery_disease'].replace(to_replace='\tno', value='no')
data['class'] = data['class'].replace(to_replace='ckd\t', value='ckd')
print("✅ Data types and entry errors corrected.")

# --- 5. Robust Missing Value Handling ---
print("🔧 Handling missing values...")
# Separate columns into categorical and numerical lists
cat_cols = [col for col in data.columns if data[col].dtype == 'object']
num_cols = [col for col in data.columns if data[col].dtype != 'object']

# Fill missing values in categorical columns with the mode (most frequent value)
for col in cat_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Fill missing values in numerical columns with the mean (average value)
for col in num_cols:
    data[col].fillna(data[col].mean(), inplace=True)
print("✅ Missing values handled.")

# --- 6. Label Encoding ---
# Convert all categorical text columns (like 'normal'/'abnormal') into numbers (0 or 1)
print("🔹 Applying Label Encoding...")
le = LabelEncoder()
for col in cat_cols:
    data[col] = le.fit_transform(data[col])
print("✅ Label Encoding complete.")

# --- 7. Feature Selection ---
# As per the guided project, a specific subset of features is used for the final model
sel_cols = [
    'red_blood_cells', 'pus_cell', 'blood_glucose_random', 'blood_urea', 'pedal_edema', 'anemia', 
    'diabetes_mellitus', 'coronary_artery_disease'
]
x = data[sel_cols]
y = data['class']
print(f"✅ Features selected.")

# --- 8. Train-Test Split ---
# Split the data into 80% for training and 20% for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
print("✅ Data split into training and testing sets.")

# --- 9. Model Training ---
# Train the Logistic Regression model, which was chosen as the best performer
print("🔹 Training Logistic Regression model...")
lgr = LogisticRegression(max_iter=1000) # Increased max_iter to help with convergence
lgr.fit(x_train, y_train)
print("✅ Model training complete.")

# --- 10. Save the Model ---
# Save the trained model to the main project folder so the Flask app can use it
model_save_path = "../CKD.pkl"
pickle.dump(lgr, open(model_save_path, 'wb'))
print(f"✅ Model saved successfully as '{model_save_path}'")
print("\n🎉 Training pipeline complete!")


Libraries imported successfully.
🔹 Loading dataset...
✅ Columns renamed.
✅ Data types and entry errors corrected.
🔧 Handling missing values...
✅ Missing values handled.
🔹 Applying Label Encoding...
✅ Label Encoding complete.
✅ Features selected.
✅ Data split into training and testing sets.
🔹 Training Logistic Regression model...
✅ Model training complete.
✅ Model saved successfully as '../CKD.pkl'

🎉 Training pipeline complete!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['diabetes_mellitus'].replace(to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never wor