In [15]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
# Load
def load_data():
    train_df = pd.read_csv("datasets/train.csv")
    test_df = pd.read_csv("datasets/test.csv")
    return train_df, test_df

In [17]:
# Preprocess
def preprocess_data(df, is_train=True):
    df = df.copy()
    
    # Fill missing age values with median
    age_imputer = SimpleImputer(strategy="median")
    df["Age"] = age_imputer.fit_transform(df[["Age"]])
    
    # Fill missing embarked values with mode
    embarked_imputer = SimpleImputer(strategy="most_frequent")
    df["Embarked"] = embarked_imputer.fit_transform(df[["Embarked"]])
    
    # Convert categorical to numerical
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
    df = pd.get_dummies(df, columns=["Embarked"], drop_first=True)
    
    # Create new features
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)
    
    return df

In [18]:
# Training
def train_model(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model

In [19]:
# Evaluation
def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("Classification Report:\n", classification_report(y_val, y_pred))
    sns.heatmap(confusion_matrix(y_val, y_pred), annot=True, fmt="d", cmap="Blues")
    plt.show()

In [20]:
# Exec
train_df, test_df = load_data()
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df, is_train=False)

ValueError: 2

In [None]:
# Split
X = train_df.drop("Survived", axis=1)
y = train_df["Survived"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train/evaluate
model = train_model(X_train, y_train)
evaluate_model(model, X_val, y_val)

In [None]:
# Test
test_predictions = model.predict(test_df)
pd.DataFrame({"PassengerId": test_df.index, "Survived": test_predictions}).to_csv("submission.csv", index=False)