# Feature Engineering on the **Titanic dataset**.

In [64]:
import pandas as pd
import numpy as np

df = pd.read_csv("data_titanic_kaggle.csv")
df.head()

Unnamed: 0,passengerId,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


| Column          | Meaning                                                              |
| --------------- | -------------------------------------------------------------------- |
| **PassengerId** | Unique ID assigned to each passenger                                 |
| **Survived**    | Survival outcome (0 = No, 1 = Yes)                                   |
| **Pclass**      | Passenger class (1 = 1st, 2 = 2nd, 3 = 3rd)                          |
| **Name**        | Full name of the passenger (often includes title)                    |
| **Sex**         | Gender of the passenger                                              |
| **Age**         | Age in years (some values missing)                                   |
| **SibSp**       | Number of siblings or spouses aboard                                 |
| **Parch**       | Number of parents or children aboard                                 |
| **Ticket**      | Ticket number                                                        |
| **Fare**        | Amount paid for the ticket                                           |
| **Cabin**       | Cabin number (many missing values)                                   |
| **Embarked**    | Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton) |


# Initial Data Inspection

In [65]:
print(df.shape) # rows, col

(891, 12)


# Missing values

In [67]:
# 1) How many missing values ?

missing = df.isna().sum()
print(missing)

passengerId      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64


In [68]:
### AGE → Median (robust to outliers)
median = df["age"].median()

df["age"] = df["age"].fillna(median)

In [69]:
### EMBARK_TOWN → Mode

df["embarked"] = df["embarked"].fillna(
    df["embarked"].mode()[0]
)

In [70]:
missing = df.isna().sum()
print(missing)

passengerId      0
survived         0
pclass           0
name             0
sex              0
age              0
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         0
dtype: int64


In [71]:
### NOTE: Generally we DROP columns with too many missing values: deck was missing >50% percent
## NOTE: Here we would not drop this column. We can use missing info to gain some insight

# df = df.drop(columns=["cabin"])

## Feature from Domain Knowledge

### **Family Size**

**Why?**
Passengers traveling with family behaved differently during evacuation.


In [72]:
df['family_size'] = df['sibsp'] + df['parch'] + 1

#### NOTE: 
* `+1` includes the passenger
* Size = 1 → traveling alone

---

### **Is Alone (Binary Feature)**

In [73]:
df['is_alone'] = (df['family_size'] == 1).astype(int) # 0 means not alone

print(df.sample(5))

     passengerId  survived  pclass                       name     sex   age  \
708          709         1       1       Cleaver, Miss. Alice  female  22.0   
847          848         0       3         Markoff, Mr. Marin    male  35.0   
258          259         1       1           Ward, Miss. Anna  female  35.0   
477          478         0       3  Braund, Mr. Lewis Richard    male  29.0   
63            64         0       3      Skoog, Master. Harald    male   4.0   

     sibsp  parch    ticket      fare cabin embarked  family_size  is_alone  
708      0      0    113781  151.5500   NaN        S            1         1  
847      0      0    349213    7.8958   NaN        C            1         1  
258      0      0  PC 17755  512.3292   NaN        C            1         1  
477      1      0      3460    7.0458   NaN        S            2         0  
63       3      2    347088   27.9000   NaN        S            6         0  


#### Note:
* Convert logic → numeric
* Binary features are ML-friendly


## Feature from Text

### **Title from Name**

**Why?**
- Titles encode **gender, age, and social status**.


In [96]:
print(df['name'].sample(7))

567    Palsson, Mrs. Nils (Alma Cornelia Berglund)
345                  Brown, Miss. Amelia "Mildred"
563                              Simmons, Mr. John
449                 Peuchen, Major. Arthur Godfrey
20                            Fynney, Mr. Joseph J
595                    Van Impe, Mr. Jean Baptiste
108                                Rekic, Mr. Tido
Name: name, dtype: object


In [74]:
df['title'] = df['name'].str.extract(r',\s*([^\.]+)\.')
df['title'] = df['title'].str.strip()

# Check:
print(df['title'].value_counts())

title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64


---

### **Group Rare Titles**


In [75]:
rare_titles = [
    'Dr', 'Rev', 'Major', 'Col', 'Capt', 'Don',
    'Sir', 'Lady', 'the Countess', 'Jonkheer'
]

df.loc[df['title'].isin(rare_titles), 'title'] = 'Rare'

Why ?

> Reduce noise by grouping rare categories.

---


In [76]:
print(df.sample(5))

     passengerId  survived  pclass                        name     sex   age  \
135          136         0       2          Richard, Mr. Emile    male  23.0   
143          144         0       3         Burke, Mr. Jeremiah    male  19.0   
725          726         0       3         Oreskovic, Mr. Luka    male  20.0   
639          640         0       3  Thorneycroft, Mr. Percival    male  28.0   
11            12         1       1    Bonnell, Miss. Elizabeth  female  58.0   

     sibsp  parch         ticket     fare cabin embarked  family_size  \
135      0      0  SC/PARIS 2133  15.0458   NaN        C            1   
143      0      0         365222   6.7500   NaN        Q            1   
725      0      0         315094   8.6625   NaN        S            1   
639      1      0         376564  16.1000   NaN        S            2   
11       0      0         113783  26.5500  C103        S            1   

     is_alone title  
135         1    Mr  
143         1    Mr  
725         1 

### Create surnames

In [77]:
df['Surname'] = df['name'].str.split(',').str[0]
print(df['Surname'].nunique())

667


In [78]:
print(df.sample(10))

     passengerId  survived  pclass  \
43            44         1       2   
376          377         1       3   
202          203         0       3   
99           100         0       2   
161          162         1       2   
810          811         0       3   
719          720         0       3   
510          511         1       3   
311          312         1       1   
180          181         0       3   

                                                  name     sex   age  sibsp  \
43            Laroche, Miss. Simonne Marie Anne Andree  female   3.0      1   
376                    Landergren, Miss. Aurora Adelia  female  22.0      0   
202                         Johanson, Mr. Jakob Alfred    male  34.0      0   
99                                   Kantor, Mr. Sinai    male  34.0      1   
161  Watt, Mrs. James (Elizabeth "Bessie" Inglis Mi...  female  40.0      0   
810                             Alexander, Mr. William    male  26.0      0   
719                       Jo

## Feature from Numeric Transformation

### **Age Groups (Binning)**

**Why?**
- Models often learn better from age categories.


In [79]:
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 12, 18, 60, 100],
    labels=['Child', 'Teen', 'Adult', 'Senior'],
    include_lowest=True
)


#### NOTE:

* Continuous → categorical
* Domain-based bins

---

In [80]:
print(df.sample(5))

     passengerId  survived  pclass                           name     sex  \
5              6         0       3               Moran, Mr. James    male   
590          591         0       3           Rintamaki, Mr. Matti    male   
124          125         0       1    White, Mr. Percival Wayland    male   
642          643         0       3  Skoog, Miss. Margit Elizabeth  female   
853          854         1       1      Lines, Miss. Mary Conover  female   

      age  sibsp  parch             ticket     fare cabin embarked  \
5    28.0      0      0             330877   8.4583   NaN        Q   
590  35.0      0      0  STON/O 2. 3101273   7.1250   NaN        S   
124  54.0      0      1              35281  77.2875   D26        S   
642   2.0      3      2             347088  27.9000   NaN        S   
853  16.0      0      1           PC 17592  39.4000   D28        S   

     family_size  is_alone title    Surname age_group  
5              1         1    Mr      Moran     Adult  
590 

## Feature from Boolean Logic

### **Child Indicator**

In [81]:
df['is_child'] = (df['age'] < 12).astype(int)

### NOTE:

* Simple rule-based feature
* Often very predictive
* 0 means Not child and 1 means person is a child


In [82]:
print(df.sample(5))

     passengerId  survived  pclass                           name     sex  \
563          564         0       3              Simmons, Mr. John    male   
658          659         0       2   Eitemiller, Mr. George Floyd    male   
517          518         0       3              Ryan, Mr. Patrick    male   
249          250         0       2  Carter, Rev. Ernest Courtenay    male   
177          178         0       1     Isham, Miss. Ann Elizabeth  female   

      age  sibsp  parch           ticket     fare cabin embarked  family_size  \
563  28.0      0      0  SOTON/OQ 392082   8.0500   NaN        S            1   
658  23.0      0      0            29751  13.0000   NaN        S            1   
517  28.0      0      0           371110  24.1500   NaN        Q            1   
249  54.0      1      0           244252  26.0000   NaN        S            2   
177  50.0      0      0         PC 17595  28.7125   C49        C            1   

     is_alone title     Surname age_group  is_chil

---

## Feature from Existing Categorical

### **Cabin Presence**
Instead of using raw `cabin` (too many missing), create another column


In [83]:
df['has_cabin'] = df['cabin'].notna().astype(int)

#### NOTE

> Missing itself can be information.

- 0 means that passenger does NOT have cabin and 1 means that passenger has cabin 

In [84]:
print(df.sample(5))

     passengerId  survived  pclass  \
614          615         0       3   
883          884         0       2   
582          583         0       2   
583          584         0       1   
427          428         1       2   

                                                  name     sex   age  sibsp  \
614                    Brocklebank, Mr. William Alfred    male  35.0      0   
883                      Banfield, Mr. Frederick James    male  28.0      0   
582                         Downton, Mr. William James    male  54.0      0   
583                                Ross, Mr. John Hugo    male  36.0      0   
427  Phillips, Miss. Kate Florence ("Mrs Kate Louis...  female  19.0      0   

     parch            ticket    fare cabin embarked  family_size  is_alone  \
614      0            364512   8.050   NaN        S            1         1   
883      0  C.A./SOTON 34068  10.500   NaN        S            1         1   
582      0             28403  26.000   NaN        S           

---

## Feature from Fare

### **Fare per Person**


In [85]:
df['fare_per_person'] = df['fare'] / df['family_size']

#### NOTE:

* Normalizes fare
* Removes bias from group tickets

In [86]:
print(df.sample(5))

     passengerId  survived  pclass  \
610          611         0       3   
644          645         1       3   
658          659         0       2   
772          773         0       2   
2              3         1       3   

                                                  name     sex    age  sibsp  \
610  Andersson, Mrs. Anders Johan (Alfrida Konstant...  female  39.00      1   
644                             Baclini, Miss. Eugenie  female   0.75      2   
658                       Eitemiller, Mr. George Floyd    male  23.00      0   
772                                  Mack, Mrs. (Mary)  female  57.00      0   
2                               Heikkinen, Miss. Laina  female  26.00      0   

     parch            ticket     fare cabin embarked  family_size  is_alone  \
610      5            347082  31.2750   NaN        S            7         0   
644      1              2666  19.2583   NaN        C            4         0   
658      0             29751  13.0000   NaN        S 

---

## Feature from Embarked

###  **One-Hot Encoding**


In [87]:
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

#### Creates:

```
embarked_Q, embarked_S
```
- One-hot encoding needs k−1 columns for k categories

#### NOTE:

* Convert categories → numbers
* Avoid dummy trap



In [88]:
print(df.sample(5))

     passengerId  survived  pclass  \
165          166         1       3   
93            94         0       3   
547          548         1       2   
833          834         0       3   
289          290         1       3   

                                                name     sex   age  sibsp  \
165  Goldsmith, Master. Frank John William "Frankie"    male   9.0      0   
93                           Dean, Mr. Bertram Frank    male  26.0      1   
547                       Padro y Manent, Mr. Julian    male  28.0      0   
833                           Augustsson, Mr. Albert    male  23.0      0   
289                             Connolly, Miss. Kate  female  22.0      0   

     parch         ticket     fare  ... family_size  is_alone   title  \
165      2         363291  20.5250  ...           3         0  Master   
93       2      C.A. 2315  20.5750  ...           4         0      Mr   
547      0  SC/PARIS 2146  13.8625  ...           1         1      Mr   
833      0      

---

## Feature Cleanup

In [89]:
df['sex'] = df['sex'].map({'male': 0, 'female': 1})

#### NOTE:

* Explicit mapping
* More readable than auto-encoding


In [90]:
print(df.sample(5))

     passengerId  survived  pclass  \
31            32         1       1   
766          767         0       1   
349          350         0       3   
886          887         0       2   
799          800         0       3   

                                                  name  sex   age  sibsp  \
31      Spencer, Mrs. William Augustus (Marie Eugenie)    1  28.0      1   
766                          Brewe, Dr. Arthur Jackson    0  28.0      0   
349                                   Dimic, Mr. Jovan    0  42.0      0   
886                              Montvila, Rev. Juozas    0  27.0      0   
799  Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...    1  30.0      1   

     parch    ticket      fare  ... family_size  is_alone  title   Surname  \
31       0  PC 17569  146.5208  ...           2         0    Mrs   Spencer   
766      0    112379   39.6000  ...           1         1   Rare     Brewe   
349      0    315088    8.6625  ...           1         1     Mr     Dimic   
88

# STOP



---

#  Final Feature Engineering Summary

| Feature         | Type        | Concept Taught   |
| --------------- | ----------- | ---------------- |
| family_size     | Numeric     | Domain knowledge |
| is_alone        | Binary      | Logical features |
| title           | Categorical | Text extraction  |
| Rare title      | Categorical | Noise reduction  |
| age_group       | Categorical | Binning          |
| is_child        | Binary      | Rule-based       |
| has_cabin       | Binary      | Missing as info  |
| fare_per_person | Numeric     | Normalization    |
| embarked_*      | Numeric     | Encoding         |



In [91]:
# 1) Extract title from names

# df['title'] = df['name'].str.extract(r', (\w+)\.')  # extract text between ',' and '.'
# print(df['title'].value_counts())

title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Mlle          2
Col           2
Don           1
Mme           1
Ms            1
Lady          1
Sir           1
Capt          1
Jonkheer      1
Name: count, dtype: int64


In [92]:
# 2) Create rare title

rare_titles = ['Rev', 'Dr', 'Major', 'Col', 'Sir', 'Lady', 'Countess', 'Capt', 'Don', 'Jonkheer']
df['title'] = df['title'].replace(rare_titles, 'RareTitle')


In [93]:
# 3) Some anomalies in Sex can be cross-checked using Title: No anamolies
df[df['title'].isin(['Master', 'Mr']) & (df['sex'] != 'male')]


Unnamed: 0,passengerId,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,...,family_size,is_alone,title,Surname,age_group,is_child,has_cabin,fare_per_person,embarked_Q,embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,...,2,0,Mr,Braund,Adult,0,0,3.6250,False,True
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,...,1,1,Mr,Allen,Adult,0,0,8.0500,False,True
5,6,0,3,"Moran, Mr. James",0,28.0,0,0,330877,8.4583,...,1,1,Mr,Moran,Adult,0,0,8.4583,True,False
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,...,1,1,Mr,McCarthy,Adult,0,1,51.8625,False,True
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.0750,...,5,0,Master,Palsson,Child,1,0,4.2150,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,882,0,3,"Markun, Mr. Johann",0,33.0,0,0,349257,7.8958,...,1,1,Mr,Markun,Adult,0,0,7.8958,False,True
883,884,0,2,"Banfield, Mr. Frederick James",0,28.0,0,0,C.A./SOTON 34068,10.5000,...,1,1,Mr,Banfield,Adult,0,0,10.5000,False,True
884,885,0,3,"Sutehall, Mr. Henry Jr",0,25.0,0,0,SOTON/OQ 392076,7.0500,...,1,1,Mr,Sutehall,Adult,0,0,7.0500,False,True
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,...,1,1,Mr,Behr,Adult,0,1,30.0000,False,False


In [94]:
# Lets create column family size

# df['familysize'] = df['sibsp'] + df['parch'] + 1
# df['is_alone'] = 1  # default
# df.loc[df['familysize'] > 1, 'is_alone'] = 0