In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from google.colab import files
# Ask for the two CSV uploads in order: the training file first, then the test file.
uploaded1, uploaded2 = files.upload(), files.upload()
# The first key of each upload result is used as the CSV path below.
file_name1 = list(uploaded1)[0]
file_name2 = list(uploaded2)[0]
# Read both CSVs into DataFrames (swap in a local path here if not uploading).
train, test = (pd.read_csv(csv_path) for csv_path in (file_name1, file_name2))

# Quick common preprocessing (do on both!)
def preprocess(df):
    """Return a model-ready feature frame built from a raw passenger table.

    The input frame is left untouched: all work is done on a copy, so the
    caller can still read the raw columns (e.g. ``PassengerId``) afterwards.

    Steps, in order:
      1. Extract a ``Title`` from ``Name`` and collapse uncommon titles.
      2. Fill missing ``Age`` with the median of the (Title, Pclass) group,
         falling back to the overall median.
      3. Fill missing ``Fare`` with the median and missing ``Embarked`` with 'S'.
      4. Add ``FamilySize`` and ``IsAlone``.
      5. Drop ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``.
      6. One-hot encode ``Sex``, ``Embarked``, ``Pclass`` and ``Title``
         (first level of each dropped).

    Args:
        df: DataFrame containing at least the columns Name, Pclass, Age, Fare,
            SibSp, Parch, Embarked, Sex, PassengerId, Ticket and Cabin.
            Any other column (such as ``Survived``) is passed through.

    Returns:
        A new DataFrame with the engineered and encoded features.

    Note:
        All medians are computed from ``df`` itself, and the dummy columns
        depend on the categories present in ``df``; two frames processed
        separately may therefore end up with different columns.
    """
    # Work on a copy so the caller's frame is not modified by the assignments below.
    df = df.copy()

    # Title extraction: the word immediately before a '.' in the name.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    # Fold spelling variants into their common equivalents.
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Age imputation: median within each (Title, Pclass) group ...
    df['Age'] = df.groupby(['Title', 'Pclass'])['Age'].transform(
        lambda ages: ages.fillna(ages.median())
    )
    # ... then the overall median for rows whose group had no usable age.
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Fare: fill gaps with the median fare.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Family features: the passenger plus siblings/spouses and parents/children.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Embarked: missing ports default to 'S'.
    df['Embarked'] = df['Embarked'].fillna('S')

    # Drop identifier / free-text / high-missing columns.
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # One-hot encoding of the categorical columns.
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Pclass', 'Title'], drop_first=True)

    return df

# Keep the test passenger ids before preprocessing drops the column, so the
# submission does not need to re-read the CSV from disk.
test_ids = test['PassengerId']

train = preprocess(train)
test  = preprocess(test)   # test has no Survived

# Now split train: features vs. target, then an 80/20 train/validation split.
X = train.drop('Survived', axis=1)
y = train['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate on the held-out 20%.
preds_val = model.predict(X_val)
print(classification_report(y_val, preds_val))
print("Validation accuracy:", accuracy_score(y_val, preds_val))

# Predict on real test set.
# Train and test were one-hot encoded independently, so a category present in
# only one of them yields different columns. Align the test features to the
# training columns (same names, same order); columns absent from test are
# filled with 0 and columns unknown to the model are discarded.
X_test = test.reindex(columns=X.columns, fill_value=0)
test_preds = model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': test_preds
})
submission.to_csv('submission_rf_basic.csv', index=False)
print("Submission file created!")

Saving train (1).csv to train (1) (2).csv


Saving test (1).csv to test (1) (2).csv
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       105
           1       0.78      0.84      0.81        74

    accuracy                           0.84       179
   macro avg       0.83      0.84      0.83       179
weighted avg       0.84      0.84      0.84       179

Validation accuracy: 0.8379888268156425
Submission file created!
