In [1]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
# Load the Titanic dataset
titanic = sns.load_dataset('titanic')

In [3]:
# Handling Missing Values
# Impute missing values in 'age' with the mean
imputer = SimpleImputer(strategy='mean')
titanic['age'] = imputer.fit_transform(titanic[['age']])

In [4]:
# Assume 'deck' has too many missing values and drop it
titanic.drop(columns=['deck'], inplace=True)

In [5]:
# Outlier Detection and Removal
# Detect and remove outliers in 'fare' based on the Interquartile Range (IQR)
Q1 = titanic['fare'].quantile(0.25)
Q3 = titanic['fare'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
titanic = titanic[(titanic['fare'] >= lower_bound) & (titanic['fare'] <= upper_bound)]


In [None]:
# Normalization
# Normalize 'fare' to have values between 0 and 1
scaler_min_max = MinMaxScaler()
titanic['fare_normalized'] = scaler_min_max.fit_transform(titanic[['fare']])

In [6]:
# Standardization
# Standardize 'age' to have a mean of 0 and a standard deviation of 1
scaler_std = StandardScaler()
titanic['age_standardized'] = scaler_std.fit_transform(titanic[['age']])

In [7]:
# Binning
# Transform 'age' into three discrete categories
titanic['age_binned'] = pd.cut(titanic['age'], bins=[0, 18, 60, 100], labels=["Child", "Adult", "Senior"])


In [15]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,embark_town,alive,alone,age_standardized,age_binned,family_size,sex_encoded,embarked_C,embarked_Q,embarked_S
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,Southampton,no,False,-0.556219,Adult,2,1,0.0,0.0,1.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,Southampton,yes,True,-0.243027,Adult,1,0,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,Southampton,yes,False,0.461654,Adult,2,0,0.0,0.0,1.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,Southampton,no,True,0.461654,Adult,1,1,0.0,1.0,0.0
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,...,Queenstown,no,True,0.046606,Adult,1,1,0.0,0.0,1.0


In [8]:
# Feature Engineering
# Create a new feature 'family_size' from 'sibsp' and 'parch'
titanic['family_size'] = titanic['sibsp'] + titanic['parch'] + 1


In [12]:
# Feature Selection
# Select the top 3 features that have the highest correlation with 'survived'
X = titanic[['pclass', 'age', 'sibsp', 'parch', 'fare']]
y = titanic['survived']
selector = SelectKBest(score_func=chi2, k=3)
X_selected = selector.fit_transform(X, y)

In [14]:
# Encoding Categorical Variables
# Convert 'sex' into a numerical format using Label Encoding
label_encoder = LabelEncoder()
titanic['sex_encoded'] = label_encoder.fit_transform(titanic['sex'])

# Convert 'embarked' into binary columns using One-Hot Encoding
one_hot_encoder = OneHotEncoder()
encoded_embarked = one_hot_encoder.fit_transform(titanic[['embarked']]).toarray()
embarked_columns = one_hot_encoder.get_feature_names_out(['embarked'])
titanic = titanic.join(pd.DataFrame(encoded_embarked, columns=embarked_columns))


In [17]:
# Data Splitting
# Split the data into training and testing sets
X = titanic[['pclass', 'sex_encoded', 'age_standardized', 'sibsp', 'parch', 'fare', 'family_size']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, the dataset is ready for model training

In [18]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,embark_town,alive,alone,age_standardized,age_binned,family_size,sex_encoded,embarked_C,embarked_Q,embarked_S
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,Southampton,no,False,-0.556219,Adult,2,1,0.0,0.0,1.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,Southampton,yes,True,-0.243027,Adult,1,0,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,Southampton,yes,False,0.461654,Adult,2,0,0.0,0.0,1.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,Southampton,no,True,0.461654,Adult,1,1,0.0,1.0,0.0
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,...,Queenstown,no,True,0.046606,Adult,1,1,0.0,0.0,1.0
