In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Titanic-Dataset.csv')

In [3]:
"""1. Inspect & Explore
Get top/bottom 5 rows (.head(), .tail())

Get shape, columns, and data types

Check missing values (.isna().sum())

2. Filtering & Selecting
Select only Name and Age

Filter passengers older than 50

Find all female passengers in Pclass 1 who survived

3. Aggregations
Average age by class (groupby('Pclass')['Age'].mean())

Count survivors by gender

Highest fare per class

4. Data Cleaning
Fill missing Age with median

Fill missing Embarked with mode

Drop Cabin (too many missing values)

5. Feature Engineering
Create FamilySize = SibSp + Parch + 1

Create Title by extracting it from Name (e.g., Mr., Mrs., Miss)

6. Sorting
Sort by Fare (descending)

Sort by Age (ascending)

7. Encoding & Transformation
Convert Sex to numeric (male=0, female=1)

One-hot encode Embarked

8. Advanced Grouping
Pivot: Survival rate by Sex & Pclass

Groupby multiple columns (groupby(['Sex', 'Pclass'])['Survived'].mean())

9. Export
Save the cleaned dataset as titanic_cleaned.csv"""

"1. Inspect & Explore\nGet top/bottom 5 rows (.head(), .tail())\n\nGet shape, columns, and data types\n\nCheck missing values (.isna().sum())\n\n2. Filtering & Selecting\nSelect only Name and Age\n\nFilter passengers older than 50\n\nFind all female passengers in Pclass 1 who survived\n\n3. Aggregations\nAverage age by class (groupby('Pclass')['Age'].mean())\n\nCount survivors by gender\n\nHighest fare per class\n\n4. Data Cleaning\nFill missing Age with median\n\nFill missing Embarked with mode\n\nDrop Cabin (too many missing values)\n\n5. Feature Engineering\nCreate FamilySize = SibSp + Parch + 1\n\nCreate Title by extracting it from Name (e.g., Mr., Mrs., Miss)\n\n6. Sorting\nSort by Fare (descending)\n\nSort by Age (ascending)\n\n7. Encoding & Transformation\nConvert Sex to numeric (male=0, female=1)\n\nOne-hot encode Embarked\n\n8. Advanced Grouping\nPivot: Survival rate by Sex & Pclass\n\nGroupby multiple columns (groupby(['Sex', 'Pclass'])['Survived'].mean())\n\n9. Export\nSave 

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [6]:
df.shape

(891, 12)

In [7]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [9]:
df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [10]:
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [11]:
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [12]:
df.shape

(891, 12)

In [13]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [14]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [15]:
"Select only the columns 'Name', 'Sex', and 'Age'."

df[['Name', 'Sex', 'Age']]

Unnamed: 0,Name,Sex,Age
0,"Braund, Mr. Owen Harris",male,22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0
2,"Heikkinen, Miss. Laina",female,26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0
4,"Allen, Mr. William Henry",male,35.0
...,...,...,...
886,"Montvila, Rev. Juozas",male,27.0
887,"Graham, Miss. Margaret Edith",female,19.0
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,
889,"Behr, Mr. Karl Howell",male,26.0


In [16]:
df[df["Age"]>50]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5000,,S
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C
...,...,...,...,...,...,...,...,...,...,...,...,...
820,821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5000,B69,S
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0000,B28,
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.5500,E17,S


In [17]:
df[(df['Pclass']==1) & (df["Sex"] == 'female') & (df["Survived"] == 1)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
...,...,...,...,...,...,...,...,...,...,...,...,...
856,857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S
862,863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Ba...",female,48.0,0,0,17466,25.9292,D17,S
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C


Find all female passengers in Pclass 1 who survived.

In [18]:
df.groupby('Pclass')['Age'].mean()

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [19]:
'Count the number of survivors by gender.'

df.groupby('Sex')['Survived'].sum()

Sex
female    233
male      109
Name: Survived, dtype: int64

In [20]:
'Find the maximum and minimum fare paid by any passenger.'

df['Fare'].agg(['min', 'max'])

min      0.0000
max    512.3292
Name: Fare, dtype: float64

In [21]:
df['Fare'].min()

np.float64(0.0)

In [22]:
df.agg({
    'Fare': ['min', 'max', 'mean'],
    'Age' : ['min', 'max', 'mean'] 
})

Unnamed: 0,Fare,Age
min,0.0,0.42
max,512.3292,80.0
mean,32.204208,29.699118


In [23]:
'Fill missing age values with the median age.'

df['Age'].fillna(df['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [24]:
'Fill missing Embarked values with the most common value (mode).'

df['Embarked'].fillna(df['Embarked'].mode(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode(), inplace=True)


In [25]:
'Drop the Cabin column due to too many missing values.'

df.drop('Cabin', axis=1, inplace=True)

In [26]:
"Create a new column 'FamilySize' as SibSp + Parch + 1."

df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [27]:
"""Extract titles (Mr, Mrs, Miss, etc.) from the Name column into a new column 'Title'."""

df["Name"].str.extract('([A-Za-z]+)\.')
df.head()

  df["Name"].str.extract('([A-Za-z]+)\.')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1


In [28]:
df.sort_values(by = 'Fare', ascending=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,C,1
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,C,1
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,C,2
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0000,S,6
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0000,S,6
...,...,...,...,...,...,...,...,...,...,...,...,...
633,634,0,1,"Parr, Mr. William Henry Marsh",male,28.0,0,0,112052,0.0000,S,1
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,28.0,0,0,239853,0.0000,S,1
822,823,0,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0000,S,1
732,733,0,2,"Knight, Mr. Robert J",male,28.0,0,0,239855,0.0000,S,1


In [29]:
df['Sex'] = df['Sex'].map({'male':1, 'female': 2})

In [30]:
df = pd.get_dummies(df, columns=['Embarked'])

In [31]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'FamilySize', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [32]:
pd.pivot_table(df, values='Survived', index='Sex', columns='Pclass', aggfunc='mean', margins=True, margins_name='Total')

Pclass,1,2,3,Total
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.368852,0.157407,0.135447,0.188908
2,0.968085,0.921053,0.5,0.742038
Total,0.62963,0.472826,0.242363,0.383838


In [33]:
df.groupby(['Sex', 'Pclass'])['Survived'].mean()

Sex  Pclass
1    1         0.368852
     2         0.157407
     3         0.135447
2    1         0.968085
     2         0.921053
     3         0.500000
Name: Survived, dtype: float64

In [34]:
df.to_csv('Titanic_cleaned.csv')