In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# Read the training CSV into a DataFrame (path is a Colab-style absolute path).
train_csv_path = '/content/train.csv'
df_train = pd.read_csv(train_csv_path)

In [None]:
# Preprocess training data:
#   1. drop identifier / free-text columns that are not used as features,
#   2. encode 'Sex' as an integer (female -> 0, male -> 1),
#   3. replace 'Embarked' with one indicator column per category.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
sex_codes = {'female': 0, 'male': 1}

df_train = df_train.drop(columns=unused_columns)
df_train['Sex'] = df_train['Sex'].map(sex_codes).astype(int)

# Indicator columns are appended after the remaining columns, so the final
# column order is: original columns (minus 'Embarked'), then 'Embarked_*'.
embarked_dummies = pd.get_dummies(df_train['Embarked'], prefix='Embarked')
df_train = pd.concat([df_train.drop(columns='Embarked'), embarked_dummies], axis=1)

In [None]:
# Fixed seed so the train / hold-out split is identical on every run.
SEED = 42

# Handle missing values in 'Age': fill them with the column mean.
# The result is kept in a new Series instead of being written back into
# df_train, so re-running this cell recomputes the same statistics rather
# than statistics of already-normalized data.
age_filled = df_train['Age'].fillna(df_train['Age'].mean())

# Normalization statistics for 'Age' and 'Fare'.
# These names are reused further down to scale the test data.
age_mean = age_filled.mean()
age_std = age_filled.std()
fare_mean = df_train['Fare'].mean()
fare_std = df_train['Fare'].std()

# Feature frame with z-scored 'Age' and 'Fare'; df_train itself is not modified.
# `assign` replaces existing columns in place, so column order is preserved.
train_features = df_train.drop(columns='Survived').assign(
    Age=(age_filled - age_mean) / age_std,
    Fare=(df_train['Fare'] - fare_mean) / fare_std,
)

# Convert labels to one-hot encoding (two classes)
train_labels = tf.keras.utils.to_categorical(df_train['Survived'], num_classes=2)

# Split the data: 80% for fitting, 20% held out. random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=SEED
)

# Convert data to numpy arrays and float32
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32)
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32)


In [None]:

# Seed TensorFlow's global random generator before any layer is created so that
# weight initialization and dropout masks are repeatable across runs.
tf.random.set_seed(42)

# Define the model: two hidden ReLU layers, each followed by dropout, and a
# 2-unit softmax output that matches the one-hot encoded labels.
model = tf.keras.Sequential([
    # Explicit Input object instead of the deprecated `input_shape=` layer argument.
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Regularization
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation='softmax')  # Output layer with softmax for probabilities
])

# Configure the optimizer / loss, then fit for 10 epochs while monitoring the
# held-out split after each epoch.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x=X_train, y=y_train, epochs=10, validation_data=(X_test, y_test))

# Score the trained model on the held-out split.
# Note: this is the same data that was passed as validation_data above.
loss, accuracy = model.evaluate(X_test, y_test)
accuracy_percent = accuracy * 100
print("Test accuracy:", accuracy_percent, "%")

# Load test data
df_test = pd.read_csv('/content/test.csv')

# Preserve 'PassengerId' for final output
passenger_ids = df_test['PassengerId']

# Preprocess test data with the same steps used for the training data
df_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin'])
df_test['Sex'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_test = pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')], axis=1)
df_test = df_test.drop(columns='Embarked')

# Handle missing values in 'Age' and 'Fare'
df_test['Age'] = df_test['Age'].fillna(age_mean)  # Use the mean from training data
df_test['Fare'] = df_test['Fare'].fillna(fare_mean)  # Use the mean from training data

# Normalize 'Age' and 'Fare' with the training statistics
df_test['Age'] = (df_test['Age'] - age_mean) / age_std
df_test['Fare'] = (df_test['Fare'] - fare_mean) / fare_std

# Ensure the columns in test set match the training set.
# reindex keeps exactly the training feature columns, in training order:
# columns absent from the test set are created and filled with 0, and extra
# columns (e.g. 'PassengerId') are dropped.
train_columns = df_train.drop(columns='Survived').columns
df_test = df_test.reindex(columns=train_columns, fill_value=0)

# Convert test data to numpy array and float32.
# A separate name is used so the hold-out split stored in X_test is not
# overwritten; otherwise re-running the training/evaluation code would pair
# these features with the hold-out labels in y_test.
X_submission = np.asarray(df_test, dtype=np.float32)

# Predict labels for test data
predictions = model.predict(X_submission)
predicted_classes = np.argmax(predictions, axis=1)  # Get class indices

# Create output for Kaggle submission
output = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': predicted_classes,
})
output.to_csv('./prediction.csv', index=False)

# Reset the global state Keras has accumulated (this does not "close" a session).
# Optional housekeeping at the end of the notebook.
tf.keras.backend.clear_session()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 83.2402229309082 %
