In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import os

In [None]:
# STEP 1: Load dataset with error handling
def load_dataset_safely(from_sklearn=True, path=None):
    """Load the Iris data from scikit-learn or from a CSV file.

    Parameters
    ----------
    from_sklearn : bool, default True
        When truthy, the data comes from ``load_iris(as_frame=True)``;
        otherwise the CSV at ``path`` is read with ``pd.read_csv``.
    path : optional
        Location handed to ``pd.read_csv``. Required when ``from_sklearn``
        is falsy.

    Returns
    -------
    tuple
        ``(frame, iris)`` for the scikit-learn branch, where ``frame`` is a
        copy of ``iris.frame`` with an extra ``species`` column and ``iris``
        is the object returned by ``load_iris``; ``(frame, None)`` for the
        CSV branch.

    Raises
    ------
    ValueError
        If ``from_sklearn`` is falsy and ``path`` is ``None``.
    FileNotFoundError
        If the file cannot be found; a hint is printed before re-raising.
    Exception
        Any other failure is printed and then re-raised unchanged.
    """
    try:
        if not from_sklearn:
            if path is None:
                raise ValueError("If from_sklearn=False you must provide path to CSV.")
            csv_frame = pd.read_csv(path)
            print("Loaded CSV from", path, "— shape:", csv_frame.shape)
            return csv_frame, None

        iris = load_iris(as_frame=True)
        frame = iris.frame.copy()
        # Translate each integer class code into its human-readable species name.
        frame['species'] = frame['target'].apply(lambda code: iris.target_names[code])
        print("Loaded Iris from sklearn.datasets — shape:", frame.shape)
        return frame, iris
    except FileNotFoundError:
        print("Error: dataset file not found. Check your path:", path)
        raise
    except Exception as err:
        # Note: the ValueError raised above also lands here, is reported, and re-raised.
        print("Unexpected error when loading dataset:", err)
        raise

# Load the data; `iris_meta` is the object returned by load_iris (the loader
# returns None in that slot when it reads a CSV instead).
df, iris_meta = load_dataset_safely(from_sklearn=True)

# STEP 2: Quick look & basic info
print("\nSTEP 2 — Quick look & basic info\n" + "-"*60)
# The rich-display helper is optional and neither `_HAS_DISPLAY_HELPER` nor
# `display_dataframe_to_user` is defined by the cells above, so referencing
# them directly raises NameError on a fresh kernel. Look both up defensively
# and fall back to plain printing when they are absent.
_display_helper = globals().get('display_dataframe_to_user')
if globals().get('_HAS_DISPLAY_HELPER', False) and callable(_display_helper):
    try:
        _display_helper("Iris - first 8 rows", df.head(8))
    except Exception:
        # Best-effort: any helper failure degrades to the plain-text preview.
        print(df.head(8))
else:
    print(df.head(8))

print('\nDataframe info:')
df.info()  # writes its summary straight to stdout
# Transposed so each column of the dataset becomes one row of statistics.
print(df.describe(include='all').T)

# STEP 3: Missing values & simple cleaning
print("\nSTEP 3 — Missing values and simple cleaning\n" + "-" * 60)
# Number of null cells in each column.
missing = df.isna().sum()
print(missing)
# Either confirm the data is complete or show how gaps could be handled.
print(
    "No missing values found in this dataset."
    if missing.sum() == 0
    else "If missing values existed you could fill them like this (example):\n df = df.fillna(df.mean()) or df = df.dropna()"
)

# STEP 4: Basic EDA (counts and distributions)
print("\nSTEP 4 — Basic EDA: counts & distributions\n" + "-" * 60)
species_counts = df['species'].value_counts()
print("Species counts:\n", species_counts)

# Histogram: distribution of a single feature (sepal length), 15 bins.
print('\nPlot 1: Histogram of Sepal Length (one feature)')
fig, ax = plt.subplots()
ax.hist(df['sepal length (cm)'], bins=15)
ax.set(
    title='Distribution of Sepal Length (cm)',
    xlabel='Sepal length (cm)',
    ylabel='Frequency',
)
fig.tight_layout()
plt.show()

# Scatter: sepal length vs sepal width, one series per species.
print('\nPlot 2: Scatter — Sepal length vs Sepal width (by species)')
fig, ax = plt.subplots()
# One scatter call per species lets matplotlib's colour cycle give each its own colour
# and produces one legend entry per species.
for species in df['species'].unique():
    in_species = df['species'] == species
    ax.scatter(
        df.loc[in_species, 'sepal length (cm)'],
        df.loc[in_species, 'sepal width (cm)'],
        label=species,
    )
ax.set(
    title='Sepal length vs Sepal width by species',
    xlabel='Sepal length (cm)',
    ylabel='Sepal width (cm)',
)
ax.legend()
fig.tight_layout()
plt.show()

# Boxplot grouped by species: Sepal length
print('\nPlot 3: Boxplot — Sepal length distribution by species')
plt.figure()
# Compute the species order once so the boxes and their tick labels are
# guaranteed to use the same ordering.
species_names = df['species'].unique()
data_to_plot = [df[df['species'] == s]['sepal length (cm)'] for s in species_names]
plt.boxplot(data_to_plot)
# Label the boxes through xticks instead of boxplot(labels=...): Matplotlib 3.9
# deprecated `labels` in favour of `tick_labels`, while older releases only
# accept `labels`. Boxes are drawn at x = 1..N by default, so this works on both.
plt.xticks(range(1, len(species_names) + 1), species_names)
plt.title('Sepal length distribution by species')
plt.ylabel('Sepal length (cm)')
plt.tight_layout()
plt.show()

# Correlation matrix (imshow) of every column except the two label columns.
print('\nPlot 4: Correlation matrix of numeric features')
corr = df.drop(columns=['target', 'species']).corr()
fig, ax = plt.subplots()
image = ax.imshow(corr, interpolation='none')
# One tick per feature on both axes; x labels are rotated so they do not overlap.
tick_positions = range(len(corr.columns))
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=45, ha='right')
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)
fig.colorbar(image, ax=ax)
ax.set_title('Correlation matrix of features')
fig.tight_layout()
plt.show()

# STEP 5: Short modeling demo (end-to-end)
print("\nSTEP 5 — Quick classification demo (end-to-end)\n" + "-" * 60)
# Features are everything except the numeric label and its text twin.
label_columns = ['target', 'species']
X = df.drop(columns=label_columns)
y = df['target']
# Fixed seed keeps the split repeatable; stratifying on y keeps class proportions
# the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)
model = LogisticRegression(max_iter=200)
try:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy on test set:', accuracy)
    print('\nClassification report:')
    report = classification_report(
        y_test,
        y_pred,
        target_names=iris_meta.target_names,
    )
    print(report)
except Exception as err:
    # Best-effort demo: report the problem instead of aborting the notebook.
    print('Error training/evaluating model:', err)

# Save a reusable template file for the user
out_path = '/mnt/data/iris_analysis_template.py'
# `save_template_file` is not defined by any cell above, so calling it directly
# raises NameError on a fresh kernel. Only call it (and only claim success)
# when the helper actually exists in this session.
_save_template = globals().get('save_template_file')
if callable(_save_template):
    _save_template(out_path)
    print(f"\nSaved a reusable template to: {out_path}")
else:
    print("\nSkipped saving template: save_template_file() is not defined in this notebook.")