In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load Iris dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Split data
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(X_iris, y_iris, test_size=0.2, random_state=42)

# Default model with n_estimators=10
rf_default = RandomForestClassifier(n_estimators=10, random_state=42)
rf_default.fit(X_train_iris, y_train_iris)
default_score = rf_default.score(X_test_iris, y_test_iris)
print(f"Default n_estimators=10 Score: {default_score:.4f}")

# Fine-tune n_estimators
scores = {}
for n in range(1, 101):
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    rf.fit(X_train_iris, y_train_iris)
    scores[n] = rf.score(X_test_iris, y_test_iris)

best_n = max(scores, key=scores.get)
best_score = scores[best_n]
print(f"Best n_estimators={best_n} Score: {best_score:.4f}")



Default n_estimators=10 Score: 1.0000
Best n_estimators=1 Score: 1.0000


In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix

# Load Titanic data
train_df = pd.read_csv("train.csv")

# Drop irrelevant columns
train_df_cleaned = train_df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Features and target
X = train_df_cleaned.drop(columns="Survived")
y = train_df_cleaned["Survived"]

# Categorical and numerical columns
categorical_cols = ["Sex", "Embarked"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
numerical_transformer = SimpleImputer(strategy="mean")
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

# Full pipeline
clf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate
clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"\nTitanic Dataset Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)



Titanic Dataset Accuracy: 0.8156
Confusion Matrix:
[[91 14]
 [19 55]]
