In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')
sns.set_style("whitegrid")

file_path = '/Users/aslinuraksakal/EE473/diabetes_dataset.csv'
try:
    df = pd.read_csv(file_path)
    print(">>> Dataset loaded successfully.")
except FileNotFoundError:
    # Fallback: look in the current directory if the absolute path fails
    try:
        df = pd.read_csv('diabetes_dataset.csv')
        print(">>> Dataset loaded successfully (from current directory).")
    except FileNotFoundError:
        print(f"ERROR: The file 'diabetes_dataset.csv' was not found at:")
        print(f"-> {file_path}")
        print("Please check if the file exists in that folder.")
        exit()

# --- 2. CALCULATE TOP 10 PARAMETERS ---
# Select only numerical columns (Categorical data cannot be correlated directly)
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Target variable: diagnosed_diabetes
# We calculate the absolute correlation of all columns with the target.
# abs() is used because negative correlation (e.g., more exercise -> less diabetes) is also important.
correlations = numeric_df.corrwith(df['diagnosed_diabetes']).abs()

# Remove the target itself (diagnosed_diabetes vs diagnosed_diabetes = 1.0)
correlations = correlations.drop('diagnosed_diabetes')

# Remove Risk Score if present, as it is a calculated result, not a raw parameter
if 'diabetes_risk_score' in correlations:
    correlations = correlations.drop('diabetes_risk_score')

# Find the top 10 highest correlations and sort them
top_10 = correlations.sort_values(ascending=False).head(10)

print("\n>>> Top 10 Most Influential Medical Parameters:")
print(top_10)

# --- 3. VISUALIZATION (Horizontal Bar Chart) ---
plt.figure(figsize=(12, 7))

# Create the bar plot
ax = sns.barplot(x=top_10.values, y=top_10.index, palette='viridis', hue=top_10.index, legend=False)

# Chart Aesthetics
plt.title('Top 10 Most Influential Medical Parameters for Diabetes Diagnosis', fontsize=16, fontweight='bold')
plt.xlabel('Correlation Coefficient (Strength of Relationship)', fontsize=12)
plt.ylabel('Medical Parameters', fontsize=12)

# Add values to the end of each bar for better readability
for i in ax.containers:
    ax.bar_label(i, fmt='%.3f', padding=3)

# Adjust layout
plt.tight_layout()

# --- KRİTİK DÜZELTME BURADA ---
# ÖNCE kaydet (bbox_inches='tight' kenarların kesilmesini önler)
plt.savefig('top10_diabetes_params.png', dpi=300, bbox_inches='tight')
print(">>> Plot saved successfully as 'top10_diabetes_params.png'")

# SONRA göster (Bu komut hafızayı temizler)
plt.show()

NameError: name 'df' is not defined