In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense

In [34]:

# Step 1: Read in Titanic.csv and preprocess the data
df = pd.read_csv('/content/sample_data/Titanic.csv')

In [35]:
# Fill missing values with the mean of the same feature
df['age'].fillna(df['age'].mean(), inplace=True)
df['fare'].fillna(df['fare'].mean(), inplace=True)

In [36]:
# Convert categorical features to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)
class_mapping = {'1st': 1, '2nd': 2, '3rd': 3}
df['pclass'] = df['pclass'].map(class_mapping)

In [37]:
# Split data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [38]:
# Reset index for training and test sets
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)


In [39]:
# Step 2: Fit a neural network
# Prepare input features and target variable for training set
X_train = train_df[['pclass', 'sex_male', 'age', 'sibsp']].values
y_train = train_df['survived'].values


In [40]:
# Prepare input features and target variable for test set
X_test = test_df[['pclass', 'sex_male', 'age', 'sibsp']].values
y_test = test_df['survived'].values


In [41]:
# Scale the input features using standard scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [42]:
# Build neural network model
model = Sequential()
model.add(Dense(units=16, activation='sigmoid', input_dim=4))
model.add(Dense(units=8, activation='sigmoid'))
model.add(Dense(units=1, activation='sigmoid'))


In [43]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [44]:

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
# Step 3: Evaluate the model
# Predict on test set
y_test_pred_probs = model.predict(X_test)

# Convert probabilities to binary predictions using threshold of 0.5
y_pred = (y_train_pred_probs >= 0.5).astype(int)
y_pred = y_pred.flatten()
y_pred = y_pred.flatten()



In [45]:
# Calculate out-of-sample accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Out-of-sample Accuracy: {:.2f}%".format(accuracy * 100))

Out-of-sample Accuracy: 74.81%


In [46]:
# Step 4: Compare with Random Forest
# Prepare input features for random forest
X_train_rf = train_df[['pclass', 'sex_male', 'age', 'sibsp']].values
X_test_rf = test_df[['pclass', 'sex_male', 'age', 'sibsp']].values

In [47]:
# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_rf, y_train)


In [48]:
# Predict on test set
y_pred_rf = rf.predict(X_test_rf)

# Calculate out-of-sample accuracy for Random Forest
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Out-of-sample Accuracy (Random Forest): {:.2f}%".format(accuracy_rf * 100))

Out-of-sample Accuracy (Random Forest): 75.57%


bonus question


In [64]:




# Step 2: Define the neural network with backpropagation
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Function to initialize weights
def initialize_weights(input_size, hidden_sizes, output_size):
    np.random.seed(0)
    weights = []
    layer_sizes = [input_size] + hidden_sizes + [output_size]
    for i in range(len(layer_sizes)-1):
        weight = np.random.uniform(low=-0.1, high=0.1, size=(layer_sizes[i], layer_sizes[i+1]))
        weights.append(weight)
    return weights

# Function to forward propagate
def forward_propagate(X, weights):
    activations = [X]
    for i in range(len(weights)):
        activation = sigmoid(np.dot(activations[-1], weights[i]))
        activations.append(activation)
    return activations

# Function to compute loss
def compute_loss(y_true, y_pred):
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Function to backward propagate
def backward_propagate(X, y_true, activations, weights):
    gradients = []
    delta = activations[-1] - y_true
    gradients.append(np.dot(activations[-2].T, delta))
    for i in range(len(weights)-1, 0, -1):
        delta = np.dot(delta, weights[i].T) * sigmoid_derivative(activations[i])
        gradients.append(np.dot(activations[i-1].T, delta))
    return gradients[::-1]

# Function to update weights
def update_weights(weights, gradients, learning_rate):
    for i in range(len(weights)):
        weights[i] -= learning_rate * gradients[i]
    return weights

# Set hyperparameters
input_size = X_train_scaled.shape[1]
hidden_sizes = [16, 8]  # Example hidden layer sizes
output_size = 1
learning_rate = 0.01
num_iterations = 500

In [69]:
# Initialize weights
weights = initialize_weights(input_size, hidden_sizes, output_size)

# Training loop
for i in range(num_iterations):
    y_train_array = y_train.values.reshape(-1, 1)

    # Perform forward propagation
    activations, caches = forward_propagate(X_train_scaled, weights)

    # Compute loss
    y_pred = activations[-1]
    loss = compute_loss(y_train_array, y_pred)

    # Backward propagate
    grads = backward_propagation(y_train_array, activations, caches)

    # Update weights
    weights = update_weights(weights, gradients, learning_rate)

    # Print loss for tracking progress
    if i % 100 == 0:
        print(f'Iteration: {i}, Loss: {loss:.4f}')

# Step 3: Evaluate the model
# Forward propagate on test set
activations_test = forward_propagate(X_test_scaled, weights)
y_pred_test = activations_test[-1]
y_pred_test = np.round(y_pred_test)  # Convert probabilities to binary predictions

# Calculate accuracy on test set
accuracy = accuracy_score(y_test, y_pred_test)
print(f'Accuracy on test set: {accuracy:.4f}')


ValueError: ignored