Data and task description (Kaggle Titanic competition): https://www.kaggle.com/competitions/titanic/overview

# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense

# Checking the data

In [None]:
# Load the training split from the working directory.
df = pd.read_csv('train.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [None]:
# Derive the class balance from the data instead of hard-coding the counts,
# so the percentages stay correct if the input file ever changes.
survived_counts = df["Survived"].value_counts()
print(survived_counts)
total = survived_counts.sum()
# .get(label, 0) keeps the cell working even if one class is absent.
class_0_ratio = (survived_counts.get(0, 0) / total) * 100
class_1_ratio = (survived_counts.get(1, 0) / total) * 100

print(f"Class 0 (Not Survived): {class_0_ratio:.2f}%")
print(f"Class 1 (Survived): {class_1_ratio:.2f}%")

Survived
0    549
1    342
Name: count, dtype: int64
Class 0 (Not Survived): 61.62%
Class 1 (Survived): 38.38%


In [None]:
# Extract the honorific from each name: take the text after the first comma
# and keep its first whitespace-separated token.
after_surname = df['Name'].str.split(",").str[1]
df['Title'] = after_surname.str.split().str[0]
print(df['Title'].unique())

['Mr.' 'Mrs.' 'Miss.' 'Master.' 'Don.' 'Rev.' 'Dr.' 'Mme.' 'Ms.' 'Major.'
 'Lady.' 'Sir.' 'Mlle.' 'Col.' 'Capt.' 'the' 'Jonkheer.']


In [None]:
# Count the missing values in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64


In [None]:
# Preview the first rows of the frame.
first_rows = df.head()
print(first_rows)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  Title  
0      0         A/5 21171   7.2500   NaN        S    Mr.  
1      0          PC 17599  71.2833   C85        C   Mrs.  
2      0  STON/O2. 3101282   7.9250   NaN        S  Miss.  
3      0            113803  53.1000  C123        S   Mrs.  
4      0            

In [None]:
# List the column names, including the derived 'Title' column.
column_names = df.columns.tolist()
print(column_names)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title']


In [None]:
# Report the frame dimensions as (rows, columns).
frame_shape = df.shape
print(frame_shape)

(891, 13)


In [None]:
# Summary statistics for the numeric columns.
summary_stats = df.describe()
print(summary_stats)

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [None]:
# Per-column null counts, checked again right before preprocessing.
null_counts = df.isnull().sum()
print(null_counts)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64


# Data Preprocessing

In [None]:
def map_title(title):
    """Collapse a raw honorific into one of four canonical title groups.

    Parameters
    ----------
    title : str
        First whitespace-separated token after the comma in a passenger
        name, e.g. "Mr.", "Mlle." or "the".

    Returns
    -------
    str
        "Mr.", "Mrs.", "Miss." or "Master." for recognised titles. Any other
        value is returned unchanged so the caller can decide how to handle it.
    """
    groups = {
        "Mr.": ("Mr.", "Don.", "Rev.", "Dr.", "Major.", "Sir.", "Col.", "Capt.", "Jonkheer."),
        # "Dona." is the feminine form of "Don."; it was previously unhandled
        # and fell through to the pass-through branch.
        # "the" is what the first-token split yields for "the Countess."; a
        # countess is female, so it belongs here rather than with "Mr.".
        # NOTE(review): confirm against the raw Name values.
        "Mrs.": ("Mrs.", "Mme.", "Lady.", "Dona.", "the"),
        "Miss.": ("Miss.", "Ms.", "Mlle."),
        "Master.": ("Master.",),
    }
    for canonical, raw_titles in groups.items():
        if title in raw_titles:
            return canonical
    return title

In [None]:
def preprocess_data(df):
    """Turn a raw passenger frame into a purely numeric feature frame.

    Expects the columns 'SibSp', 'Parch', 'Name', 'Age', 'Sex', 'Fare',
    'Cabin', 'Ticket' and 'Embarked' (all are read or dropped below).

    Steps:
      * FamilySize / IsAlone from SibSp and Parch.
      * Title parsed from Name, grouped with ``map_title`` and encoded as an
        integer code in 'Mapped_Title'.
      * Missing Age filled with the mean age of the passenger's title group.
      * Sex encoded (male=1, female=0); missing Fare filled with 0.
      * AgeBand (fixed bins), FareBand (quartiles) and Fare_log added.
      * float64/int64 columns standardised with the mean/std of *this* frame,
        so two frames passed separately are scaled independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered features and without SibSp, Parch,
        Cabin, Name, Ticket, Embarked, Title and Fare.
    """
    # Work on a copy: the helper columns were previously assigned straight
    # onto the caller's frame before `df` was rebound by the first drop().
    df = df.copy()

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # The old mapping also carried a "Dr." key, but map_title never returns
    # "Dr." (it folds it into "Mr."), so that entry was dead.
    title_codes = {"Mr.": 0, "Mrs.": 1, "Miss.": 2, "Master.": 3}

    df['Title'] = df['Name'].str.split(",").str[1].str.split().str[0]
    df["Mapped_Title"] = df["Title"].apply(map_title)

    # Any title map_title passes through unchanged would become a NaN code in
    # the .map() below and reach the model as a NaN feature. Fall back to a
    # sex-based adult title instead (Sex still holds 'male'/'female' here).
    unknown_title = ~df["Mapped_Title"].isin(list(title_codes))
    df.loc[unknown_title, "Mapped_Title"] = (
        df.loc[unknown_title, "Sex"].map({"male": "Mr.", "female": "Mrs."})
    )

    # Impute Age with the title-group mean; if a whole group has no known age
    # the group mean is NaN, so fall back to the overall mean. Otherwise the
    # .astype(int) on AgeBand below would raise.
    group_mean_age = df.groupby("Mapped_Title")["Age"].transform("mean")
    df["Age"] = df["Age"].fillna(group_mean_age).fillna(df["Age"].mean())
    df["Mapped_Title"] = df["Mapped_Title"].map(title_codes)

    df = df.drop(['SibSp', 'Parch', 'Cabin'], axis=1)

    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

    df['Fare'] = df['Fare'].fillna(0)

    df['AgeBand'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, np.inf], labels=[0, 1, 2, 3, 4]).astype(int)

    df['FareBand'] = pd.qcut(df['Fare'], q=4, labels=[0, 1, 2, 3]).astype(int)

    df['Fare_log'] = np.log1p(df['Fare'])

    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    # A constant column has std 0; dividing by it would turn the whole column
    # into NaN. Dividing by 1 instead leaves it at 0 after centring.
    column_std = df[numeric_columns].std().replace(0, 1)
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / column_std

    df = df.drop(['Name', 'Ticket', 'Embarked','Title', 'Fare'], axis=1)

    return df

# Model

In [None]:
# Use PassengerId as the row index. The guard keeps the cell re-runnable:
# after the first run the column already lives in the index, and an
# unconditional set_index('PassengerId') would raise KeyError.
if 'PassengerId' in df.columns:
    df = df.set_index('PassengerId')
X = df.drop(['Survived'],axis=1)
y = df['Survived']

X = preprocess_data(X)

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the scaler on the training rows only, then reuse it for the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Seed the Python, NumPy and TensorFlow generators so weight initialisation
# and batch shuffling are repeatable across kernel restarts.
keras.utils.set_random_seed(42)

# Small fully connected binary classifier: 16 -> 8 -> 1 (sigmoid).
# The input shape is declared with an explicit keras.Input layer; passing
# input_shape= to the first Dense layer is what triggered the Keras warning
# captured in this cell's output.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Train for 50 epochs in mini-batches of 8. Class 1 gets the larger loss
# weight, and the held-out split doubles as validation data during fitting.
fit_options = dict(
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=8,
    class_weight={0: 0.80, 1: 1.33},
)
model.fit(X_train, y_train, **fit_options)

# Final loss/accuracy on the held-out split.
val_loss, val_acc = model.evaluate(X_test, y_test)

Epoch 1/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5312 - loss: 0.6862 - val_accuracy: 0.7654 - val_loss: 0.6096
Epoch 2/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7948 - loss: 0.5960 - val_accuracy: 0.7933 - val_loss: 0.5104
Epoch 3/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7844 - loss: 0.5439 - val_accuracy: 0.7709 - val_loss: 0.4724
Epoch 4/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7466 - loss: 0.5308 - val_accuracy: 0.7654 - val_loss: 0.4538
Epoch 5/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7907 - loss: 0.4658 - val_accuracy: 0.7654 - val_loss: 0.4475
Epoch 6/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8005 - loss: 0.4620 - val_accuracy: 0.7598 - val_loss: 0.4445
Epoch 7/50
[1m89/89[0m [32m━━━━━━━━━━

In [None]:
# Report the held-out accuracy computed by the evaluate() call above.
validation_summary = f"Validation Accuracy: {val_acc:.4f}"
print(validation_summary)

Validation Accuracy: 0.8156


In [None]:
# Load the competition test split and build the same features as for training.
test_data = pd.read_csv('test.csv').set_index('PassengerId')
test_features = preprocess_data(test_data)
# The network was trained on features passed through `scaler`; the test
# features previously skipped that step, so the model saw inputs on a
# different scale at inference time. Keep `test_features` as a DataFrame
# because its index is needed later for the submission file.
test_scaled = scaler.transform(test_features)
# Threshold the sigmoid output at 0.5 to get 0/1 labels.
predictions = (model.predict(test_scaled) > 0.5).astype(int)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


In [None]:
# Score the trained network on the held-out split and print both metrics.
holdout_metrics = model.evaluate(X_test, y_test)
loss, accuracy = holdout_metrics
print(f"Test Loss: {loss:.4f}", f"Test Accuracy: {accuracy:.4f}", sep="\n")

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8316 - loss: 0.4175 
Test Loss: 0.4391
Test Accuracy: 0.8156


In [None]:
# `y_pred` is not assigned in any visible cell, so on a fresh kernel this
# cell raised NameError (or silently reused a stale variable). Compute the
# held-out predictions here with the same 0.5 threshold used for the test set.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.87      0.88       105
           1       0.82      0.84      0.83        74

    accuracy                           0.85       179
   macro avg       0.85      0.85      0.85       179
weighted avg       0.86      0.85      0.86       179



# Submission

In [None]:
# Build the two-column submission table (PassengerId, Survived) and write it
# to submission.csv without the row index.
pred = predictions.flatten()
submission_columns = {'PassengerId': test_features.index, 'Survived': pred}
submission = pd.DataFrame(submission_columns).fillna(0)
submission.to_csv('submission.csv', index=False, sep=',')