In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler


In [6]:
# Column names are supplied explicitly because the read below skips the
# file's leading lines and does not take a header from the file.
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
           'BMI', 'DiabetesPedigree', 'Age', 'Outcome']

# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = 'data/pima-indians-diabetes.csv'

# Number of leading lines to skip before the first record. With 9 the preview
# printed below starts at a data row and the columns parse as numbers.
N_SKIPPED_ROWS = 9

# header=None states explicitly that no line of the file is used as a header;
# the names come from `columns` instead.
df = pd.read_csv(DATA_PATH, skiprows=N_SKIPPED_ROWS, header=None, names=columns)

# Preview the first rows to confirm the file loaded with the expected layout.
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigree  Age  Outcome  
0             0.627   50        1  
1             0.351   31        0  
2             0.672   32        1  
3             0.167   21        0  
4             2.288   33        1  


In [7]:
# Keep only the rows whose 'Pregnancies' value parses as a number; anything
# that fails to parse becomes NaN under errors='coerce' and is filtered out.
is_numeric_row = pd.to_numeric(df['Pregnancies'], errors='coerce').notna()

# Filtering alone leaves the column dtypes unchanged, so a column that held a
# stray string would still be `object` afterwards. Convert every column to a
# numeric dtype after filtering (unparseable leftovers in other columns become
# NaN), and renumber the index without mutating a frame in place.
df = (
    df.loc[is_numeric_row]
    .apply(pd.to_numeric, errors='coerce')
    .reset_index(drop=True)
)

# Check the cleaned dataframe
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigree  Age  Outcome  
0             0.627   50        1  
1             0.351   31        0  
2             0.672   32        1  
3             0.167   21        0  
4             2.288   33        1  


In [8]:
# Select the target by its label rather than by position, so the split stays
# correct even if the column order of `df` changes.
target_column = 'Outcome'

# Features (X) - every column except the target
X = df.drop(columns=target_column)

# Target (y) - the 'Outcome' column
y = df[target_column]

# Check the first few rows of features and target
print(X.head())
print(y.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigree  Age  
0             0.627   50  
1             0.351   31  
2             0.672   32  
3             0.167   21  
4             2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [9]:
# Preview the first five feature rows (same preview as printed when X was created).
print(X.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigree  Age  
0             0.627   50  
1             0.351   31  
2             0.672   32  
3             0.167   21  
4             2.288   33  


In [10]:
# Preview the first five target values (same preview as printed when y was created).
print(y.head(n=5))

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64
