In [None]:
# Fixed category orders used for integer encoding.  They are alphabetical, so
# the codes equal what a LabelEncoder fitted on data containing every category
# would produce, but they no longer depend on which categories happen to be
# present in a particular split (train vs. test).
_SEX_ORDER = ('female', 'male')
_EMBARKED_ORDER = ('C', 'Q', 'S')
_TITLE_ORDER = ('Master', 'Miss', 'Mr', 'Mrs', 'Rare')

# Honorifics folded into a common title before encoding.
_TITLE_ALIASES = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    ),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}


def _encode_categories(values, known_order):
    """Return int64 codes for *values*, using *known_order* for the first codes.

    Values not listed in *known_order* are appended after the known ones in
    sorted order, so they still receive a distinct code.  Missing values are
    encoded as -1.
    """
    observed = values.dropna().unique()
    unseen = sorted(set(observed) - set(known_order), key=str)
    categories = list(known_order) + unseen
    codes = pd.Categorical(values, categories=categories).codes
    return codes.astype('int64')


def preprocess_titanic_data(df, is_train=True):
    """Clean and feature-engineer a Titanic passenger DataFrame.

    The input is never modified; all work happens on a copy.

    Steps:
      1. Add ``FamilySize`` (``SibSp + Parch + 1``) and ``IsAlone``.
      2. Fill missing ``Age``/``Fare`` with the column median and missing
         ``Embarked`` with the column mode (all computed on *df* itself).
      3. Integer-encode ``Sex`` and ``Embarked``.
      4. Extract the honorific from ``Name`` into ``Title``, merge rare and
         alias titles, and integer-encode it.  Names without a recognisable
         title are treated as ``'Rare'``.
      5. Drop ``Name``, ``Ticket`` and ``Cabin`` (ignored if absent).

    Args:
        df: DataFrame with at least the columns ``SibSp``, ``Parch``, ``Age``,
            ``Fare``, ``Embarked``, ``Sex`` and ``Name``.
        is_train: When true and a ``Survived`` column exists, split it off as
            the target.

    Returns:
        ``(X, y)`` when ``is_train`` is true and ``Survived`` is present,
        otherwise the single processed DataFrame (which still contains
        ``Survived`` if the input had it).
    """
    df_processed = df.copy()

    # 1. Feature engineering
    df_processed['FamilySize'] = df_processed['SibSp'] + df_processed['Parch'] + 1
    df_processed['IsAlone'] = (df_processed['FamilySize'] == 1).astype(int)

    # 2. Handle missing values.  Assign the result back instead of calling
    #    fillna(inplace=True) on a column selection: that chained form acts on
    #    a temporary copy under pandas Copy-on-Write / pandas 3.0.
    for column in ('Age', 'Fare'):
        df_processed[column] = df_processed[column].fillna(df_processed[column].median())
    embarked_modes = df_processed['Embarked'].mode()
    if not embarked_modes.empty:  # empty when every value is missing
        df_processed['Embarked'] = df_processed['Embarked'].fillna(embarked_modes.iloc[0])

    # 3. Encode categorical features
    df_processed['Sex'] = _encode_categories(df_processed['Sex'], _SEX_ORDER)
    df_processed['Embarked'] = _encode_categories(df_processed['Embarked'], _EMBARKED_ORDER)

    # 4. Extract the title: the word that directly precedes a period in Name
    titles = df_processed['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(_TITLE_ALIASES).fillna('Rare')
    df_processed['Title'] = _encode_categories(titles, _TITLE_ORDER)

    # 5. Drop columns that are not used as features
    df_processed = df_processed.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

    # 6. Separate features and target for training data
    if is_train and 'Survived' in df_processed.columns:
        y = df_processed['Survived']
        X = df_processed.drop('Survived', axis=1)
        return X, y

    return df_processed

# Run the preprocessing on both splits: the training split yields features and
# target, the test split yields features only.
X_train, y_train = preprocess_titanic_data(train_data, is_train=True)
X_test = preprocess_titanic_data(test_data, is_train=False)

# Attach the gender_submission labels to the test passengers for evaluation.
# y_test stays None when either frame is empty or no 'Survived' column results.
y_test = None
if not (gender_data.empty or test_data.empty):
    test_results = test_data[['PassengerId']].merge(
        gender_data,
        on='PassengerId',
        how='left',
    )
    if 'Survived' in test_results.columns:
        y_test = test_results['Survived']

# Report the resulting shapes.
print("\nData setelah preprocessing:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
if y_test is not None:
    print("y_test shape:", y_test.shape)


Data setelah preprocessing:
X_train shape: (891, 11)
y_train shape: (891,)
X_test shape: (418, 11)
y_test shape: (418,)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Age'].fillna(df_processed['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Fare'].fillna(df_processed['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the 