In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Titanic survival baseline: load the seaborn "titanic" dataset, clean it,
# integer-encode the two remaining categorical columns, fit a random forest
# and report hold-out accuracy.

# One seed shared by the split and the model so the reported accuracy is
# repeatable from run to run (previously only the split was seeded).
SEED = 42

# Step 1: Load dataset
df = sns.load_dataset("titanic")

# Step 2: Drop irrelevant and high-NaN columns
# NOTE(review): several of these look like re-encodings of columns that are
# kept (e.g. "alive" vs. the "survived" target) -- confirm against the
# dataset description before re-adding any of them as features.
df = df.drop(columns=["deck", "embark_town", "alive", "class", "who", "adult_male", "alone"])

# Step 3: Drop rows with missing values
df = df.dropna()

# Step 4: Encode categorical columns
# Use one encoder per column: re-fitting a single shared encoder discards the
# earlier column's class mapping, making its codes impossible to
# inverse-transform later. LabelEncoder assigns codes in sorted class order.
encoders = {}
for column in ("sex", "embarked"):  # sex: female=0, male=1; embarked: C, Q, S
    encoders[column] = LabelEncoder()
    # fit_transform already yields integer codes, so no extra cast is needed.
    df[column] = encoders[column].fit_transform(df[column])

# Keep the original name bound to the last-fitted ("embarked") encoder so any
# later code that references `label_enc` behaves as before.
label_enc = encoders["embarked"]

# Step 5: Select features and target
X = df.drop(columns=["survived"])
y = df["survived"]

# Step 6: Train-test split (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Step 7: Train model
# Seeding the forest fixes its bootstrap samples and feature sub-sampling.
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

# Step 8: Evaluate on the held-out split
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Output
print("Shape after cleaning:", df.shape)
print("Accuracy:", acc)


Shape after cleaning: (712, 8)
Accuracy: 0.7692307692307693
