In [2]:
# Importing the necessary libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

## Load the dataset into Python environment

In [4]:
df = pd.read_csv('titanic_dataset.csv')

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Make ‘PassengerId’ as the index column


In [6]:
df.set_index('PassengerId', inplace=True)

In [7]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Check the basic details of the dataset

In [9]:
print(df.head()) # Print the first few rows of the dataset

             Survived  Pclass  \
PassengerId                     
1                   0       3   
2                   1       1   
3                   1       3   
4                   1       1   
5                   0       3   

                                                          Name     Sex   Age  \
PassengerId                                                                    
1                                      Braund, Mr. Owen Harris    male  22.0   
2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
3                                       Heikkinen, Miss. Laina  female  26.0   
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
5                                     Allen, Mr. William Henry    male  35.0   

             SibSp  Parch            Ticket     Fare Cabin Embarked  
PassengerId                                                          
1                1      0         A/5 21171   7.2500   NaN        S

In [11]:
print(df.info()) # Display information about the dataset (e.g., column names,␣↪data types)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None


In [12]:
df.shape

(891, 11)

## Fill in all the missing values present in all the columns in the dataset

In [13]:
df.fillna(df.mean(), inplace=True) # Fill missing values with the column means

  df.fillna(df.mean(), inplace=True) # Fill missing values with the column means


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


## Check and handle outliers in at least 3 columns in the dataset

In [15]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [16]:
outlier_cols = ['Age', 'Fare', 'SibSp', 'Parch']

In [19]:
for col in outlier_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df[col] = df[col].clip(lower_bound, upper_bound) # Clip outliers to the␣↪upper and lower bounds

In [20]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.376817,0.426487,0.0,24.046813
std,0.486592,0.836071,12.062035,0.708246,0.0,20.481625
min,0.0,1.0,2.5,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,54.5,2.5,0.0,65.6344


## Do min-max scaling on the feature set (Take ‘Survived’ as the target)

In [21]:
scaler = MinMaxScaler()
features = df[['Age', 'Fare', 'SibSp', 'Parch']] # Feature selection without␣↪the target column
scaled_features = scaler.fit_transform(features) # Perform min-max scaling on␣ ↪the features

In [25]:
scaled_dataset = pd.DataFrame(scaled_features, columns=features.columns)
scaled_dataset[['Survived','Pclass','Name','Sex','Ticket','Embarked']] = df[['Survived','Pclass','Name','Sex','Ticket','Embarked']]

In [26]:
scaled_dataset.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,Pclass,Name,Sex,Ticket,Embarked
0,0.375,0.11046,0.4,0.0,,,,,,
1,0.682692,1.0,0.4,0.0,0.0,3.0,"Braund, Mr. Owen Harris",male,A/5 21171,S
2,0.451923,0.120745,0.0,0.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C
3,0.625,0.809027,0.4,0.0,1.0,3.0,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,S
4,0.625,0.122649,0.0,0.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,S
