In [None]:
# -------- Import Libraries ----------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

# -------- Load Dataset ----------
# Raw string literal: the previous plain literal contained "\M" and "\I", which
# are not valid escape sequences and make Python emit a SyntaxWarning (slated to
# become an error). The raw form yields exactly the same path text.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
DATA_PATH = r"E:\ML Model\IRIS.csv"
df = pd.read_csv(DATA_PATH)

# -------- Dataset Annotations (Metadata Dictionary) ----------
# Maps each column header to a one-line, human-readable description.
dataset_annotations = {
    # categorical descriptors
    "Species": "Type of penguin species (Adelie, Chinstrap, Gentoo)",
    "Region": "Region where the data was collected",
    "Island": "Specific island of penguin habitat",
    "Stage": "Life stage (Adult or Egg stage)",
    "Clutch Completion": "Whether the penguin completed a clutch (Yes/No)",
    # body measurements
    "Culmen Length (mm)": "Bill length measured in millimeters",
    "Culmen Depth (mm)": "Bill depth measured in millimeters",
    "Flipper Length (mm)": "Flipper length measured in millimeters",
    "Body Mass (g)": "Body weight in grams",
    "Sex": "Biological sex of penguin (Male/Female)",
    # isotope measurements
    "Delta 15 N (o/oo)": "Stable nitrogen isotope measurement (diet indicator)",
    "Delta 13 C (o/oo)": "Stable carbon isotope measurement (foraging area indicator)",
}

# Emit the guide as one "<column> → <description>" line per entry, in dict order.
print("\n===== Dataset Annotation Guide =====")
print(
    "\n".join(
        f"{name} → {description}"
        for name, description in dataset_annotations.items()
    )
)

# -------- Preview Data ----------
# Show the leading rows of the loaded frame.
leading_rows = df.head()
print("\n===== Dataset Preview =====")
print(leading_rows)

# Show how many missing entries each column has.
missing_per_column = df.isna().sum()
print("\n===== Missing Values =====")
print(missing_per_column)

# -------- Drop Irrelevant Columns / Handle Missing Values ----------
# Columns to discard; errors='ignore' means any that are absent are skipped.
drop_cols = [
    'studyName',
    'Sample Number',
    'Date Egg',
    'Individual ID',
    'Comments',
]
# A row is kept only when every one of these fields is present.
required_fields = [
    "Sex",
    "Body Mass (g)",
    "Flipper Length (mm)",
    "Culmen Length (mm)",
]
df = (
    df
    .drop(columns=drop_cols, errors='ignore')
    .dropna(subset=required_fields)
)

# -------- Label Encoding ----------
# Every object-dtype column is replaced by integer codes. Each column gets its
# OWN fitted LabelEncoder, stored in `label_encoders`, so codes can be mapped
# back to the original labels later (e.g. label_encoders["Sex"].inverse_transform).
# Previously a single encoder was refit per column, which discarded every
# mapping except the last one.
label_encoders = {}
le = LabelEncoder()  # rebound per column below; ends up as the last column's encoder
cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Showing encoded meaning for Sex as annotation.
# The mapping is read from the fitted encoder (code i <-> classes_[i]) instead
# of being hard-coded: a fixed "Female = 0, Male = 1" note is only right when
# the column holds exactly those two labels.
if "Sex" in label_encoders:
    sex_mapping = ", ".join(
        f"{original} = {code}"
        for code, original in enumerate(label_encoders["Sex"].classes_)
    )
    print(f"\nEncoding Note: Sex has been encoded as → {sex_mapping}")

print("\n===== Cleaned Dataset =====")
print(df.head())

# -------- Visualization ----------
# Figure 1: flipper length against body mass, points coloured by the Sex column.
plt.figure(figsize=(7, 5))
scatter_ax = sns.scatterplot(
    x="Flipper Length (mm)",
    y="Body Mass (g)",
    hue="Sex",
    data=df,
)
scatter_ax.set_title("Flipper Length vs Body Mass")
plt.show()

# Figure 2: body-mass box plot for each value of the Species column.
plt.figure(figsize=(7, 5))
box_ax = sns.boxplot(
    x="Species",
    y="Body Mass (g)",
    data=df,
)
box_ax.set_title("Body Mass Distribution per Species")
plt.show()

# -------- Model: Predict Sex ----------
# Features are every remaining column except the target; the target is "Sex".
X = df.drop(columns=["Sex"])
y = df["Sex"]

# NOTE(review): upstream only filters NaN on four columns; if other feature
# columns can still hold NaN here, some scikit-learn releases of
# RandomForestClassifier will reject them — confirm against the data.

# Train-test split.
# Done BEFORE scaling so the scaler never sees the held-out rows. Fitting the
# scaler on the full data first leaks test-set statistics into preprocessing.
# The row partition is unchanged: same random_state and same number of rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling numeric features: statistics come from the training fold only and
# are then applied unchanged to the test fold.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix under the train-fitted scaler; retained because the
# name `X_scaled` was defined by this section before.
X_scaled = scaler.transform(X)

# Model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predictions
pred = model.predict(X_test)

print("\n===== Model Performance =====")
print("Accuracy Score:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

# -------- Confusion Matrix ----------
# Sorted union of the labels seen in the truth and in the predictions — the
# same label set and order confusion_matrix uses by default — made explicit so
# the heatmap ticks can show which class each row/column stands for.
class_labels = sorted(set(y_test) | set(pred))
cm = confusion_matrix(y_test, pred, labels=class_labels)

# Own figure so the heatmap never lands on a leftover axes.
plt.figure()
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# confusion_matrix puts true labels on rows and predicted labels on columns.
cm_ax.set_xlabel("Predicted label")
cm_ax.set_ylabel("True label")
cm_ax.set_title("Confusion Matrix")
plt.show()

# -------- Save Model ----------
joblib.dump(model, "penguin_model.pkl")
print("\nModel saved as penguin_model.pkl")

# The classifier was fitted on StandardScaler output, so new data must pass
# through the same fitted scaler before model.predict. Persist it next to the
# model; the model file itself is written exactly as before.
joblib.dump(scaler, "penguin_scaler.pkl")
print("Scaler saved as penguin_scaler.pkl")


  df = pd.read_csv("E:\ML Model\IRIS.csv")
  df = pd.read_csv("E:\ML Model\IRIS.csv")


ModuleNotFoundError: No module named 'pandas'

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Read the CSV into a frame, then print its first rows followed by the
# number of missing entries in each column.
csv_path = "E:/ML Model/IRIS.csv"
df = pd.read_csv(csv_path)
first_rows = df.head()
print(first_rows)
null_counts = df.isna().sum()
print(null_counts)
# Columns removed from the frame (drop raises KeyError if one is absent).
drop_cols = [
    'studyName',
    'Sample Number',
    'Date Egg',
    'Individual ID',
    'Comments',
]
# Rows missing any of these fields are discarded.
must_be_present = ["Sex", "Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]

df = df.drop(columns=drop_cols).dropna(subset=must_be_present)
# Integer-encode the categorical columns. Each column gets its own fitted
# LabelEncoder, stored in `label_encoders`, so the codes can be translated back
# with inverse_transform. Refitting one shared encoder six times kept only the
# mapping of the final column.
label_encoders = {}
label = LabelEncoder()  # rebound per column; ends as the encoder fitted on 'Sex'

for col in ['Species', 'Region', 'Island', 'Stage', 'Clutch Completion', 'Sex']:
    label = LabelEncoder()
    df[col] = label.fit_transform(df[col])
    label_encoders[col] = label
# Scatter plot: flipper length on x, body mass on y, colour keyed to the Sex column.
scatter_options = {
    "x": "Flipper Length (mm)",
    "y": "Body Mass (g)",
    "hue": "Sex",
}
sns.scatterplot(data=df, **scatter_options)
plt.show()
# Features: every column except the target. Target: the "Sex" column.
X = df.drop(columns=["Sex"])  # Input features
y = df["Sex"]                 # Target variable

# Hold out 20% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seed the forest as well: without random_state the bootstrap samples and
# feature subsets differ on every run, so the report below was not
# reproducible even though the split was.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))



ModuleNotFoundError: No module named 'pandas'