In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
# Load the Titanic dataset
df1 = pd.read_csv("titanic_dataset.csv")

# Handle missing values (for simplicity, we'll just drop rows with missing values)
df=df1.dropna(inplace=True)

# Feature engineering (You can customize this based on the available features and their relevance)
# Let's assume you choose the following features for the model
selected_features = ['Age', 'Fare','Pclass', 'Sex' ]

# Convert categorical variables to numeric (e.g., "male" -> 0, "female" -> 1)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Select features and target variable
X = df[selected_features]
y = df['Survived']

# Step 2: Model Training
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 3: Predictions
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Step 4: Generate Output File
# Load the test set (assuming the test set doesn't contain the 'Survived' column)
test_data = pd.read_csv("test.csv")
passenger_names = test_data['Name']

# Perform the same feature engineering steps as done for the training data
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})
X_submit = test_data[selected_features]

from sklearn.impute import SimpleImputer

# Handle missing values in the test data using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # You can choose a different imputation strategy if needed
X_submit = imputer.fit_transform(X_submit)

# Make predictions on the test set
y_submit = model.predict(X_submit)

# Step 4: Generate Output File
# Create a DataFrame to store the passenger names and their survival predictions
submission_df = pd.DataFrame({'PassengerName': test_data['Name'], 'Survived': y_submit})

# Save the output to a CSV file
submission_df.to_csv("titanic_predictions.csv", index=False)


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values