# Load the dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
data = pd.read_csv("C:\\Users\\user\\Downloads\\titanic_dataset.csv")

# ‘PassengerId’ as the index column

In [2]:
data = data.set_index('PassengerId')

# basic details of the dataset

In [3]:
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# Fill in all the missing values present in all the columns in the dataset

In [4]:
#By using Dropping rows/columns with missing values
data = data.dropna()  # Drop rows with any missing values

# Check and handle outliers in at least 3 columns

In [5]:
from scipy import stats
# Calculate z-scores for 'Fare'
z_scores = stats.zscore(data['Fare'])
# Identify outliers based on a threshold (e.g., +/- 3 standard deviations)
outliers_idx = (z_scores < -3) | (z_scores > 3)
outliers = data[outliers_idx]
print(outliers)
# Handle outliers (e.g., capping, winsorizing, or removing)

             Survived  Pclass                                Name   Sex   Age  \
PassengerId                                                                     
680                 1       1  Cardeza, Mr. Thomas Drake Martinez  male  36.0   
738                 1       1              Lesurer, Mr. Gustave J  male  35.0   

             SibSp  Parch    Ticket      Fare        Cabin Embarked  
PassengerId                                                          
680              0      1  PC 17755  512.3292  B51 B53 B55        C  
738              0      0  PC 17755  512.3292         B101        C  


In [6]:
pip install numpy==1.24.3

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# min max scaling on the feature set

In [7]:
target = data['Survived']
numeric_columns = data.select_dtypes(exclude='object').drop('Survived', axis=1)
non_numeric_columns = data.select_dtypes(include='object')

#Min-Max Scaling on the numeric feature set
scaler = MinMaxScaler()
scaled_numeric_columns = pd.DataFrame(scaler.fit_transform(numeric_columns), columns=numeric_columns.columns, index=numeric_columns.index)

# Concatenate the scaled numeric features with non-numeric columns and the target
titanic_preprocessed = pd.concat([scaled_numeric_columns, non_numeric_columns, target], axis=1)

# Now, 'titanic_preprocessed' contains the preprocessed dataset

In [8]:
print(numeric_columns)

             Pclass   Age  SibSp  Parch     Fare
PassengerId                                     
2                 1  38.0      1      0  71.2833
4                 1  35.0      1      0  53.1000
7                 1  54.0      0      0  51.8625
11                3   4.0      1      1  16.7000
12                1  58.0      0      0  26.5500
...             ...   ...    ...    ...      ...
872               1  47.0      1      1  52.5542
873               1  33.0      0      0   5.0000
880               1  56.0      0      1  83.1583
888               1  19.0      0      0  30.0000
890               1  26.0      0      0  30.0000

[183 rows x 5 columns]


In [9]:
print(non_numeric_columns)

                                                          Name     Sex  \
PassengerId                                                              
2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
7                                      McCarthy, Mr. Timothy J    male   
11                             Sandstrom, Miss. Marguerite Rut  female   
12                                    Bonnell, Miss. Elizabeth  female   
...                                                        ...     ...   
872           Beckwith, Mrs. Richard Leonard (Sallie Monypeny)  female   
873                                   Carlsson, Mr. Frans Olof    male   
880              Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)  female   
888                               Graham, Miss. Margaret Edith  female   
890                                      Behr, Mr. Karl Howell    male   

               Ticket        Cabin Em

In [10]:
print(scaled_numeric_columns)

             Pclass       Age     SibSp  Parch      Fare
PassengerId                                             
2               0.0  0.468892  0.333333   0.00  0.139136
4               0.0  0.430956  0.333333   0.00  0.103644
7               0.0  0.671219  0.000000   0.00  0.101229
11              1.0  0.038948  0.333333   0.25  0.032596
12              0.0  0.721801  0.000000   0.00  0.051822
...             ...       ...       ...    ...       ...
872             0.0  0.582701  0.333333   0.25  0.102579
873             0.0  0.405665  0.000000   0.00  0.009759
880             0.0  0.696510  0.000000   0.25  0.162314
888             0.0  0.228629  0.000000   0.00  0.058556
890             0.0  0.317147  0.000000   0.00  0.058556

[183 rows x 5 columns]


In [11]:
print(titanic_preprocessed)

             Pclass       Age     SibSp  Parch      Fare  \
PassengerId                                                
2               0.0  0.468892  0.333333   0.00  0.139136   
4               0.0  0.430956  0.333333   0.00  0.103644   
7               0.0  0.671219  0.000000   0.00  0.101229   
11              1.0  0.038948  0.333333   0.25  0.032596   
12              0.0  0.721801  0.000000   0.00  0.051822   
...             ...       ...       ...    ...       ...   
872             0.0  0.582701  0.333333   0.25  0.102579   
873             0.0  0.405665  0.000000   0.00  0.009759   
880             0.0  0.696510  0.000000   0.25  0.162314   
888             0.0  0.228629  0.000000   0.00  0.058556   
890             0.0  0.317147  0.000000   0.00  0.058556   

                                                          Name     Sex  \
PassengerId                                                              
2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female 