In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

train_set = pd.read_csv("/kaggle/input/titanic/train.csv")
test_set = pd.read_csv("/kaggle/input/titanic/test.csv")

# Save original PassengerId for submission
test_passenger_ids = test_set['PassengerId']

train_set.head()

# 2. Drop Columns
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_set.drop(columns=drop_cols, inplace=True)
test_set.drop(columns=drop_cols, inplace=True)

# 3. Fill Missing Values
train_set['Age'].fillna(train_set['Age'].median(), inplace=True)
train_set['Embarked'].fillna(train_set['Embarked'].mode()[0], inplace=True)
train_set['Fare'].fillna(train_set['Fare'].median(), inplace=True)

test_set['Age'].fillna(test_set['Age'].median(), inplace=True)
test_set['Embarked'].fillna(test_set['Embarked'].mode()[0], inplace=True)
test_set['Fare'].fillna(test_set['Fare'].median(), inplace=True)


# 4. Feature Engineering
train_set['Sex'] = train_set['Sex'].map({'male': 0, 'female': 1})
test_set['Sex'] = test_set['Sex'].map({'male': 0, 'female': 1})

train_set = pd.get_dummies(train_set, columns=['Embarked'], drop_first=True)
test_set = pd.get_dummies(test_set, columns=['Embarked'], drop_first=True)

train_set['FamilySize'] = train_set['SibSp'] + train_set['Parch']
test_set['FamilySize'] = test_set['SibSp'] + test_set['Parch']


# 5. Match Columns (align dummy variables)
missing_cols = set(train_set.columns) - set(test_set.columns) - {'Survived'}
for col in missing_cols:
    test_set[col] = 0
test_set = test_set[train_set.drop('Survived', axis=1).columns]



# 6. Prepare Data
X = train_set.drop('Survived', axis=1)
y = train_set['Survived']


# Optional: Split for local validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



# 7. Train Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


# 8. Evaluate
val_preds = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))


test_preds = model.predict(test_set)
print(test_preds[:10])



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_set['Age'].fillna(train_set['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_set['Embarked'].fillna(train_set['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

Accuracy: 0.8212290502793296
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

[0 0 0 1 0 0 0 0 1 0]
