In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

LOAD DATA

In [2]:
train_file = pd.read_csv('./train.csv')
test_file = pd.read_csv('./test.csv')

ANALYZE DATA

In [3]:
total_data = pd.concat([train_file,test_file],axis=0) #Concatenate for efficient feature engineering.
total_data.reset_index(inplace=True)
total_data = total_data.drop(columns=['index','PassengerId'])


In [4]:
total_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
total_data["Cabin"].unique()
X_test['Deck'] = X_test['Cabin'].str.extract('([A-Za-z])')
X_test.drop(columns=['Cabin'], inplace=True)

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [4]:
# title_col = pd.DataFrame([i.split(',')[1].split('.')[0].strip() for i in total_data.Name],columns=['Title'])

extract = lambda n : n.split(',')[1].split('.')[0].strip()

total_data['Title']  = total_data['Name'].map(extract)

In [None]:
total_data.Title.value_counts()

In [5]:
title_dict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Royalty",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

total_data['Title']= total_data.Title.map(title_dict)

In [None]:
total_data.Title.value_counts()

In [None]:
plt.hist([total_data[total_data['Survived'] == 1]['Fare'], total_data[total_data['Survived'] == 0]['Fare']],color=['b','r'], label=['Survived','Dead'])
plt.legend()

In [6]:
categorical_features = ["Sex","Embarked","Title"] # ["Sex","Ticket","Cabin","Embarked"]
numerical_features = ['Age','Pclass', 'SibSp', 'Parch', 'Fare']

VISUALIZE and MANIPULATE DATA

In [None]:
#BOX PLOT, SCATTER PLOT etc

In [7]:
class Transforms():
    def __init__(self, categorical_features : list[str], numerical_features : list[str]) -> None:
        self.cat = categorical_features
        self.num = numerical_features
        self.imputer = SimpleImputer(strategy='mean')
        self.imputer_cat = SimpleImputer(strategy='most_frequent') 
        self.scaler = StandardScaler()
        self.encoder = OneHotEncoder(handle_unknown='ignore',min_frequency=5)

    
    def __call__(self, file : pd.DataFrame):
        file[self.num] = self.imputer.fit_transform(file[self.num])
        file[self.num] = self.scaler.fit_transform(file[self.num])
        file[self.cat] = self.imputer_cat.fit_transform(file[self.cat])
        encoded = self.encoder.fit_transform(file[self.cat])
        col_names = self.encoder.get_feature_names_out(self.cat)
        encoded = pd.DataFrame(encoded.toarray(),columns=col_names)
        features = pd.concat([file, encoded], axis=1).drop(self.cat, axis=1)
        
        return features

class CustomDataset(Dataset):
    def __init__(self,features : pd.DataFrame ,transform = None, training= True, **kwargs) -> None:  #labels : pd.Series
        super().__init__()
        self.features = features
        if training:
            self.labels = kwargs.get('labels',None)
        self.transform = transform
        self.training = training
    
    def __getitem__(self, index):
        if self.training:
            sample = {'features': torch.tensor(self.features.values[index],dtype=torch.float32),'labels': torch.tensor(self.labels.iloc[index],dtype=torch.float32)}
        else:
            sample = {'features': torch.tensor(self.features.values[index],dtype=torch.float32)}
        
        return sample

    def __len__(self):
        return len(self.features)

In [15]:

transform = Transforms(categorical_features=categorical_features,numerical_features=numerical_features)

y_total = total_data["Survived"]
X_total = total_data.drop(columns=['Survived','Name'])

X_total = transform(X_total)

#TRAIN DATA
X = X_total[:891]
y = y_total[:891]

#TEST DATA
X_test = X_total[891:]
X_test.drop(columns=['Ticket','Cabin'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=['Ticket','Cabin'],inplace=True)


In [17]:
X_test.shape

(418, 16)

In [18]:
X = X.drop(columns=['Cabin','Ticket'])

In [19]:
X.shape

(891, 16)

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=35)

LOAD DATA

In [21]:
train_dataset = CustomDataset(X_train,  labels= y_train, transform=transform)
val_dataset = CustomDataset(X_val, labels = y_val, transform=transform)

# Use DataLoader for batching and shuffling
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [37]:
class TitanicModel(nn.Module):
    def __init__(self, input_size):
        super(TitanicModel, self).__init__()
        self.l1 = nn.Linear(input_size, 16)
        self.bn1 = nn.BatchNorm1d(16)  #Batchnorm best when applied before activation function
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(p=0.2)  #use dropout after activation  CONV / Dense -> BN -> ReLU -> Dropout
        self.l2 = nn.Linear(16,32)
        self.bn2 = nn.BatchNorm1d(32)
        self.f1= nn.Flatten()
        self.l3 = nn.Linear(32, 16)  # Output layer with 1 neuron for binary classification
        
        self.l4 = nn.Linear(16,1)

    def forward(self, x):
        x = self.l1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.l2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.f1(x)
        x = self.l3(x)
        x = self.relu(x)
        x = self.l4(x)
        
        return x


In [38]:
#Train Loop

input_size = X_train.shape[1]
model = TitanicModel(input_size)

loss = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(),lr=1e-3)


epochs = 1000
for epoch in range(epochs):
    model.train()
    for item, train_set in enumerate(train_loader):
        optimizer.zero_grad()
        feature,label = train_set['features'],train_set['labels']
        output = model(feature)
        l = loss(output.squeeze(),label)

        l.backward()
        optimizer.step()
    # print(f'Epoch [{epoch + 1}/{epochs}], Loss: {l.item()}')

    if (epoch+1) % 100 == 0:
        model.eval()
        for item, val_set in enumerate(val_loader):
            v_feature,v_label = val_set['features'],val_set['labels']
            val_output = model(v_feature)
            v_l = loss(val_output.squeeze(),v_label)
        print(f'Val epoch : {epoch+1}, val loss : {v_l.item()}')
        

Val epoch : 100, val loss : 0.17704473435878754
Val epoch : 200, val loss : 0.18148790299892426
Val epoch : 300, val loss : 0.21065418422222137
Val epoch : 400, val loss : 0.22178776562213898
Val epoch : 500, val loss : 0.21202929317951202
Val epoch : 600, val loss : 0.19529074430465698
Val epoch : 700, val loss : 0.2454066276550293
Val epoch : 800, val loss : 0.21798807382583618
Val epoch : 900, val loss : 0.19890768826007843
Val epoch : 1000, val loss : 0.20251309871673584


In [39]:
test_dataset = CustomDataset(X_test,training=False)
test_loader = DataLoader(test_dataset,shuffle=False)

In [None]:
X_test

In [40]:
# test_data_transformed = transform(test_data)

# test_data_transformed = test_data_transformed.drop(columns=['Name','Ticket','Cabin','PassengerId'])



predictions = [] 
model.eval()
for i,item in enumerate(test_loader):
    op = model(item['features'])
    prob = torch.sigmoid(op)  #Calc probabilities
    # print(prob)
    binary_pred = (prob>0.5).int()
    predictions.append(binary_pred)

predictions = torch.cat(predictions,dim=0)
# print(predictions)

In [41]:
pass_id = test_file['PassengerId'].values
pass_id = pass_id.reshape(-1,1)
fin = np.concatenate([pass_id,predictions],axis=1)
fin = pd.DataFrame(fin,columns=['PassengerId','Survived'])
fin.to_csv(path_or_buf='submission_10.csv',index=False)