In [None]:
# Data loading + clean EDA
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset: prefer a local CSV, otherwise fall back to scikit-learn's bundled iris data.
csv_path = 'sample_iris.csv'
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
else:
    from sklearn.datasets import load_iris
    iris = load_iris(as_frame=True)
    df = iris.frame
    # Pin the column names explicitly: the four feature names followed by 'target'.
    df.columns = list(iris.feature_names) + ['target']

# Structural overview: shape, first rows, dtypes, summary statistics, and null counts.
print('Shape:', df.shape)
display(df.head())
print('\nDtypes:\n', df.dtypes)
print('\nSummary statistics:')
display(df.describe(include='all'))
print('\nMissing values per column:')
print(df.isnull().sum())

# Histograms for numeric columns
numeric = df.select_dtypes(include=[np.number])
numeric.hist(figsize=(10,8))
plt.tight_layout()
plt.show()

# Correlation heatmap
plt.figure(figsize=(6,5))
sns.heatmap(numeric.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation matrix')
plt.show()

# Pairplot (may take some time)
# Colour by the label only when it exists (a user-supplied CSV may not have a 'target' column),
# and keep the label out of the plotted variables: a numeric label would otherwise be drawn as
# an axis in addition to being used for the hue.
hue_col = 'target' if 'target' in df.columns else None
pair_vars = [c for c in numeric.columns if c != hue_col]
sns.pairplot(df, vars=pair_vars, hue=hue_col)
plt.show()

# Capstone Project - Exploratory Data Analysis
This notebook covers data loading, a brief exploratory data analysis (EDA), baseline modeling, and report export. A previously corrupted EDA cell has been removed and replaced with a clean one.

In [None]:
# Modeling: preprocessing, baseline model training, evaluation, and save
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Reuse the `df` produced by the EDA cell when it is already defined; otherwise load it here
# so this cell can also run on its own.
if 'df' not in globals():
    import os
    if not os.path.exists('sample_iris.csv'):
        # No local CSV: fall back to the iris data bundled with scikit-learn.
        from sklearn.datasets import load_iris
        iris = load_iris(as_frame=True)
        df = iris.frame
        df.columns = list(iris.feature_names) + ['target']
    else:
        df = pd.read_csv('sample_iris.csv')

def preprocess_for_model(df_in, target_col='target', drop_thresh=0.5, max_card_onehot=10):
    """Clean a raw frame for modeling: drop sparse columns, impute, and encode features.

    The column named by ``target_col`` is treated as the label rather than a feature:
    it is never dropped for sparsity, never imputed, and never encoded, so it is
    returned under its original name with its original values. Rows whose label is
    missing are removed, because a label cannot be imputed meaningfully. If
    ``target_col`` is not a column of ``df_in``, every column is treated as a feature.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input data. It is copied, never modified in place.
    target_col : str, default 'target'
        Name of the label column to protect from feature preprocessing.
    drop_thresh : float, default 0.5
        Feature columns whose fraction of missing values is strictly greater than
        this threshold are dropped.
    max_card_onehot : int, default 10
        Object/category feature columns with at most this many distinct values are
        one-hot encoded (first level dropped); columns with more are label-encoded.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. One-hot columns are appended at the end with integer
        0/1 values.
    """
    df = df_in.copy()
    has_target = target_col in df.columns

    # Remove unlabeled rows first so that feature statistics (missing rate, median,
    # mode) are computed only on rows that can actually be used for training.
    if has_target:
        labelled = df[target_col].notna()
        if not labelled.all():
            print('Dropping rows with missing target:', int((~labelled).sum()))
            df = df.loc[labelled].copy()

    feature_cols = [c for c in df.columns if c != target_col]

    # Drop feature columns with too many missing values
    drop_cols = [c for c in feature_cols if df[c].isnull().mean() > drop_thresh]
    if drop_cols:
        print('Dropping columns with high missing rate:', drop_cols)
        df = df.drop(columns=drop_cols)

    # Select feature columns by dtype; the label is excluded so it stays untouched.
    features = df.drop(columns=[target_col]) if has_target else df
    num_cols = features.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = features.select_dtypes(include=['object', 'category']).columns.tolist()

    # Impute numeric with median, categorical with mode
    for c in num_cols:
        df[c] = df[c].fillna(df[c].median())
    for c in cat_cols:
        mode = df[c].mode()
        # mode() is empty when every value in the column is missing
        fill = mode.iloc[0] if not mode.empty else 'missing'
        df[c] = df[c].fillna(fill)

    # Encode categoricals: one-hot for low cardinality, label for high
    for c in cat_cols:
        if df[c].nunique() <= max_card_onehot:
            # dtype=int keeps the dummies numeric; newer pandas defaults to bool,
            # which a later select_dtypes(include=[np.number]) would silently discard.
            df = pd.get_dummies(df, columns=[c], prefix=c, drop_first=True, dtype=int)
        else:
            # Imported lazily: scikit-learn is only needed on this branch.
            from sklearn.preprocessing import LabelEncoder
            df[c] = LabelEncoder().fit_transform(df[c].astype(str))
    return df

print('Preprocessing...')
cleaned = preprocess_for_model(df, target_col='target')
# Prepare features and target
if 'target' in cleaned.columns:
    y = cleaned['target']
    X = cleaned.drop(columns=['target'])
else:
    # No column named 'target': fall back to treating the last column as the label.
    y = cleaned.iloc[:, -1]
    X = cleaned.iloc[:, :-1]

# Keep numeric features for baseline pipeline
X_numeric = X.select_dtypes(include=[np.number]).copy()
if X_numeric.shape[1] != X.shape[1]:
    dropped = set(X.columns) - set(X_numeric.columns)
    print('Dropping non-numeric columns for baseline model:', dropped)

# Train/test split
# Stratify only when it is possible: more than one class is needed, and
# train_test_split raises if any class has fewer than two samples.
_, class_counts = np.unique(y, return_counts=True)
stratify = y if len(class_counts) > 1 and class_counts.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42, stratify=stratify)

# Fit baseline pipeline
# `multi_class` is intentionally not passed: the argument is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial model for
# targets with three or more classes.
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, solver='lbfgs'))
pipe.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Baseline model accuracy: {acc:.4f}')
print(classification_report(y_test, y_pred))

# Confusion matrix plot (also written to disk as confusion_matrix.png)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Baseline')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

# Save model (the whole pipeline, so the scaler is persisted with the classifier)
model_path = 'baseline_model.pkl'
joblib.dump(pipe, model_path)
print(f'Saved baseline model to {model_path}')

In [5]:
# Run the exported script to generate Markdown and HTML reports
print('Running export_results.py from notebook kernel')
# The script is executed in the notebook's own namespace, so it can see variables
# defined by earlier cells. Reading it through a context manager closes the file
# handle deterministically, and compiling with the real filename makes tracebacks
# point at export_results.py instead of '<string>'.
with open('export_results.py', 'r', encoding='utf-8') as script_file:
    script_source = script_file.read()
exec(compile(script_source, 'export_results.py', 'exec'))

Running export_results.py from notebook kernel
Output directory: c:\Users\user\Downloads\Capstone project
Saved confusion matrix to c:\Users\user\Downloads\Capstone project\confusion_matrix.png
Wrote Markdown report to c:\Users\user\Downloads\Capstone project\results_report.md
Wrote HTML report to c:\Users\user\Downloads\Capstone project\results_report.html

Done.
