let's do it properly this time!

In [1]:
import torch
from pathlib import Path
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

torch.manual_seed(442)

np.printoptions(linewidth=140)
torch.set_printoptions(linewidth=140, sci_mode=False, edgeitems=7)
pd.set_option("display.width", 140)

In [2]:
path = Path("./input")
train_path = path/"train.csv"
trn_df = pd.read_csv(train_path)

nothing to do with the data anymore, i don't think do some feature engineering here will help improve  our model though, it will add some noises, remember "sometimes less is more!"

In [3]:
def preprocess_data(df):
    age_imputer = KNNImputer(n_neighbors=5)
    df["Age"] = age_imputer.fit_transform(df[["Age", "Pclass", "SibSp", "Parch"]])[:, 0]
    df["LogFare"] = np.log1p(df["Fare"])
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['AgeBin'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, np.inf], labels=[1, 2, 3, 4, 5])
    
    # One-hot encoding
    df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked', 'AgeBin'], drop_first=True, dtype=float)
    return df

# Preprocess training data
trn_df = preprocess_data(trn_df)

def get_columns(name):
    return [col for col in trn_df.columns if col.startswith(name)]
added_cols = get_columns(("Sex_", "Pclass", "Embarked_", "AgeBin_"))

indep_cols = ['Title', 'Age', 'SibSp', 'Parch', 'LogFare', 'FamilySize', 'IsAlone'] + added_cols

let's improve our architecture

this time let's try the pytorch way

In [4]:
class SimpleNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SimpleNN, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.bn1 = torch.nn.BatchNorm1d(hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = torch.sigmoid(self.fc2(x))
        return x

def train_model(model, X_train, y_train, X_val, y_val, epochs=300, lr=0.01, batch_size=32):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)  # L2 regularization
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)  # Step LR schedule
    
    best_val_loss = float('inf')
    patience = 20
    counter = 0
    
    for epoch in range(epochs):
        model.train()
        for i in range(0, len(X_train), batch_size):
            batch_X = X_train[i:i+batch_size]
            batch_y = y_train[i:i+batch_size]
            
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = F.binary_cross_entropy(outputs, batch_y.unsqueeze(1))
            loss.backward()
            optimizer.step()
        
        scheduler.step()
        
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = F.binary_cross_entropy(val_outputs, y_val.unsqueeze(1))
            
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict()
            counter = 0
        else:
            counter += 1
        
        if counter > patience:
            print(f"Early stopping at epoch {epoch}")
            break
        
        if epoch % 10 == 0:
            print(f"Epoch {epoch}: Train Loss {loss.item():.4f}, Val Loss {val_loss.item():.4f}")
    
    model.load_state_dict(best_model)
    return model

that's it for our arch, let's test it to see how far it will go

In [5]:
# Prepare the data
scaler = StandardScaler()
X = scaler.fit_transform(trn_df[indep_cols].values)
y = trn_df['Survived'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train)
X_val = torch.FloatTensor(X_val)
y_val = torch.FloatTensor(y_val)

# Initialize and train the model
input_size = X_train.shape[1]
hidden_size = 10
model = SimpleNN(input_size, hidden_size)

trained_model = train_model(model, X_train, y_train, X_val, y_val, epochs=50, lr=0.02)

# Evaluate the model
model.eval()
with torch.no_grad():
    val_outputs = model(X_val)
    val_preds = (val_outputs > 0.5).float()
    accuracy = (val_preds.squeeze() == y_val).float().mean()
    print(f"Validation Accuracy: {accuracy.item():.4f}")

Epoch 0: Train Loss 0.3662, Val Loss 0.4901
Epoch 10: Train Loss 0.2244, Val Loss 0.4254
Epoch 20: Train Loss 0.2155, Val Loss 0.4271
Early stopping at epoch 29
Validation Accuracy: 0.8268


alright, let's make it

In [6]:
## Load and preprocess test data
test_df = pd.read_csv(path / "test.csv")
test_df = preprocess_data(test_df)

for col in indep_cols:
    if col not in test_df.columns:
        test_df[col] = 0  

X_test = test_df[indep_cols].values
X_test = scaler.transform(X_test)
X_test = torch.FloatTensor(X_test)

make predictions on testset

In [7]:
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_preds = (test_outputs > 0.5).int()
test_df["Survived"] = test_preds

In [8]:
sub_df = test_df[['PassengerId', 'Survived']]
print(sub_df["Survived"].sum())
print(sub_df["Survived"].value_counts())

149
Survived
0    269
1    149
Name: count, dtype: int64


In [9]:
sub_df.to_csv("sub.csv", index=False)

In [10]:
!head sub.csv

PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1


this time i hit 0.7871 acc, which is i think pretty good for titanic comp(don't count those who use extra data here)

alright, see ya, happy codding, btw!