<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/RanFor_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

In [2]:
# Step 1: Download dataset
path = kagglehub.dataset_download("marshalpatel3558/diabetes-prediction-dataset-legit-dataset")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/marshalpatel3558/diabetes-prediction-dataset-legit-dataset?dataset_version_number=1...


100%|██████████| 15.6k/15.6k [00:00<00:00, 22.3MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/marshalpatel3558/diabetes-prediction-dataset-legit-dataset/versions/1





In [3]:
# Step 2: Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [5]:
# Step 3: Load data
file_path = path + "/Dataset of Diabetes .csv"
df = pd.read_csv(file_path)

In [14]:
# Step 4: Preprocessing
df['CLASS'] = df['CLASS'].astype(str).str.strip().str.upper()
print("Unique CLASS values after cleaning:", df['CLASS'].unique())

valid_classes = ['N', 'P', 'Y']
df = df[df['CLASS'].isin(valid_classes)].copy()
print("Dataset shape after filtering:", df.shape)

# Clean CLASS values: strip whitespace and uppercase
df['CLASS'] = df['CLASS'].astype(str).str.strip().str.upper()

# Check again
print("Unique CLASS values after cleaning:", df['CLASS'].unique())

# Filter valid classes
valid_classes = ['N', 'P', 'Y']
df = df[df['CLASS'].isin(valid_classes)].copy()

# Encode Gender
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])

# Encode CLASS
class_mapping = {'N': 0, 'P': 1, 'Y': 2}
df['CLASS'] = df['CLASS'].map(class_mapping)
print("Any NaNs in CLASS after mapping?", df['CLASS'].isna().any())

Unique CLASS values after cleaning: []
Dataset shape after filtering: (0, 14)
Unique CLASS values after cleaning: []
Any NaNs in CLASS after mapping? False


In [15]:
# Step 5: Split data
X = df.drop('CLASS', axis=1)
y = df['CLASS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [8]:
# Step 6: Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

ValueError: Input y contains NaN.

In [None]:
# Step 7: Evaluate Model
y_pred = rf.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
# Step 8: Feature Importance Visualization
importances = rf.feature_importances_
features = X.columns
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 6))
sns.barplot(x=importances[indices], y=features[indices], palette="viridis")
plt.title("Feature Importances (Random Forest)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()