## Explanation

**NONE OF THE CODE IS GRADED**

In [None]:
# pip install -r requirements.txt

# Step a: Importing Libraries

import pandas as pd              # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization

from sklearn.ensemble import RandomForestClassifier  # For feature importance analysis
from sklearn.model_selection import train_test_split # For splitting the dataset

import statsmodels.api as sm     # For logistic regression and statistical analysis


In [None]:
# Load the dataset into a DataFrame.
# NOTE: change the path to wherever the CSV lives on your machine.
data = pd.read_csv('diabetes_merged.csv')

# Peek at the first few rows.
print("First 5 rows of the dataset:")
display(data.head())

# Overall size (rows, columns) and the column names.
print("\nDataset shape (rows, columns):", data.shape)
print("\nColumn names:", list(data.columns))

# The remaining summaries all follow the same "heading, then rich display"
# pattern: dtypes, NaN counts per column, and descriptive statistics.
summaries = [
    ("\nData types:", data.dtypes),
    ("\nMissing values in each column:", data.isnull().sum()),
    ("\nDescriptive statistics:", data.describe()),
]
for heading, summary in summaries:
    print(heading)
    display(summary)


In [None]:
# Tally the NaN entries in every column and print the result.
missing_per_column = data.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

**There appear to be no missing (NaN) values, which simplifies preprocessing. Zeros that stand in for missing measurements are checked further below.**


**HANDLING OUTLIERS**
We chose to cap (clip) outliers at the IQR-based lower and upper bounds rather than remove them.

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Numeric feature columns to cap (the target 'Outcome' is excluded).
num_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Columns in which a 0 is a placeholder for a missing measurement, not a real
# reading. These zeros must survive this step untouched: if they were included
# in the quartiles they would drag the bounds down, and if they were capped
# they would be rewritten to the lower bound and no longer be recognisable as
# missing by the zero-imputation step later in the notebook.
zero_placeholder_cols = {'Glucose', 'BloodPressure', 'SkinThickness',
                         'Insulin', 'BMI'}

for col in num_cols:
    # 1. Visualize the current distribution with a boxplot.
    plt.figure(figsize=(5, 3))
    sns.boxplot(x=data[col], color='skyblue')
    plt.title(f"{col} - Boxplot Before Outlier Handling")
    plt.show()

    # Work in float so the (fractional) bounds can be stored in the column.
    column = data[col].astype(float)

    # Mask of rows holding real measurements (everything except placeholder
    # zeros in the affected columns).
    if col in zero_placeholder_cols:
        is_real = (column != 0).to_numpy()
    else:
        is_real = np.ones(len(column), dtype=bool)

    # 2. Compute the IQR fences from real measurements only.
    observed = column[is_real]
    Q1 = observed.quantile(0.25)
    Q3 = observed.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print(f"{col} lower bound: {lower_bound:.2f}, upper bound: {upper_bound:.2f}")

    # 3. Cap (clip) real measurements to the fences; placeholder zeros keep
    #    their original value so they can still be imputed afterwards.
    capped = column.clip(lower=lower_bound, upper=upper_bound)
    data[col] = capped.where(is_real, column)

    # 4. Check the distribution again after capping. Placeholder zeros, where
    #    present, are still part of the column and will show up in this plot.
    plt.figure(figsize=(5, 3))
    sns.boxplot(x=data[col], color='lightgreen')
    plt.title(f"{col} - Boxplot After Outlier Handling")
    plt.show()

print("Outlier handling complete.")


# Is the data balanced?

In [None]:
# Pull the target column once and summarise it two ways.
outcome = data['Outcome']

# Absolute number of rows in each class.
class_counts = outcome.value_counts()
print("Class distribution:")
print(class_counts)

# Share of each class, expressed as a percentage of all rows.
class_percentages = outcome.value_counts(normalize=True) * 100
print("\nClass percentages:")
print(class_percentages)

# Visualize the distribution using a countplot
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each Outcome class, labelled through
# the returned Axes object.
ax = sns.countplot(x='Outcome', data=data, palette='pastel')
ax.set_title('Distribution of Outcome')
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
plt.show()


## After handling outliers, the next step is to prepare your data for modeling

Separating Features and Target:
Extract the independent variables (features) and the dependent variable (target). Drop the target column (e.g., Outcome) from the features.

Splitting the Data:
Divide the dataset into training and testing sets. This lets you train the model on one portion of the data and evaluate its performance on unseen data.

**DATA PROCESSING & SPLITTING**

In [None]:
# 'Outcome' is the target (dependent) variable; every other column is a feature.
target_column = "Outcome"

y = data[target_column]                 # target vector
X = data.drop(columns=[target_column])  # feature matrix without the target

# Report the dimensions of both pieces.
print("Features shape (X):", X.shape)
print("Target shape (y):", y.shape)

**SPLITTING DATA FOR TRAINING**

In [None]:
from sklearn.model_selection import train_test_split

# Split the data: 70% training and 30% testing.
# stratify=y keeps the proportion of each Outcome class the same in both
# splits, so an unlucky draw on an imbalanced target cannot skew training or
# evaluation. random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Confirm the split by printing the shapes of the resulting datasets
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)


**check for 0 values**

In [None]:
# Show the first five rows (the default for head()) as the cell output.
data.head()

In [None]:
# Count zero entries per feature column; zeros are treated as missing values
# here (the target 'Outcome' is deliberately not inspected).

print("Check how many other missing (zero) values\n")

print(f"total number of rows: {len(data)}")

# Columns to inspect, in the order they are reported.
zero_check_columns = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
for column_name in zero_check_columns:
    zero_rows = int((data[column_name] == 0).sum())
    print("number of rows missing {}: {}".format(column_name, zero_rows))


In [None]:
# Columns in which a zero value is considered invalid (missing)
columns_with_zero_missing = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI'
]

# Learn one fill value per column from the TRAINING rows only (ignoring the
# zeros themselves), so no information from the test rows leaks into the
# imputation.
zero_replacements = {}
for col in columns_with_zero_missing:
    train_col = X_train[col]
    mean_value = train_col[train_col != 0].mean()
    # Nested-dict form understood by DataFrame.replace: {column: {old: new}}.
    zero_replacements[col] = {0: mean_value}

# X, X_train and X_test were copied out of `data` in earlier cells, so
# imputing `data` alone would never reach the frames the model is trained and
# evaluated on. Apply the same replacements to every frame so they stay
# consistent. replace() returns new frames, so nothing is modified in place.
data = data.replace(zero_replacements)
X = X.replace(zero_replacements)
X_train = X_train.replace(zero_replacements)
X_test = X_test.replace(zero_replacements)

# Quick check to confirm no zeros remain in these columns
for col in columns_with_zero_missing:
    zero_count = (data[col] == 0).sum()
    print(f"Number of zeros in {col} after replacement: {zero_count}")

# Same check for the frames that actually feed the model.
split_zero_count = int(
    (X_train[columns_with_zero_missing] == 0).sum().sum()
    + (X_test[columns_with_zero_missing] == 0).sum().sum()
)
print(f"Number of zeros left in X_train/X_test after replacement: {split_zero_count}")


## Step D: Feature Importance Analysis with Random Forest

Model Training:
A RandomForestClassifier is created and trained using X_train and y_train.

Extracting Importances:
The feature_importances_ attribute gives a score for each feature. These scores indicate how much each feature contributes to the model’s predictions.

Displaying the Results:
A DataFrame is created to neatly display and sort the features by importance.

Visualization:
A horizontal bar chart is generated to visualize the feature importances, making it easier to compare the contribution of each feature.

This step will help you understand which variables play the most significant roles in predicting diabetes. After this, you can move on to further analysis—like using logistic regression to statistically test the significance of specific features such as BMI.

In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Fit a Random Forest on the training split (fit() returns the estimator,
# so construction and training can be chained).
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# One importance score per feature, in the same order as the feature columns.
feature_importances = rf.feature_importances_
features = X.columns

# Tabulate the scores, then rank them from most to least important.
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

# Horizontal bar chart of the ranked importances.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance from Random Forest")
ax.invert_yaxis()  # Put the most important feature at the top
fig.tight_layout()
plt.show()


**Still to do: logistic regression and model evaluation**