In [1]:
import pandas as pd

# Read the train and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))


### Handle Missing Values

In [13]:
# Every fill value is computed from the training split and then reused for the
# test split, so the test data never contributes to the imputation statistics.
age_median = train_df['Age'].median()
embarked_mode = train_df['Embarked'].mode()[0]  # mode() returns a Series; take its first entry
fare_median = train_df['Fare'].median()

for frame in (train_df, test_df):
    frame['Age'] = frame['Age'].fillna(age_median)
    frame['Embarked'] = frame['Embarked'].fillna(embarked_mode)
    # Cabin is removed from both frames; errors='ignore' keeps this cell
    # re-runnable once the column is already gone.
    frame.drop(columns='Cabin', inplace=True, errors='ignore')

# Fare is only imputed in the test split.
test_df['Fare'] = test_df['Fare'].fillna(fare_median)


In [15]:
# Print the column summary of each split to confirm no nulls remain.
for frame in (train_df, test_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name    

### Handle Outliers

In [16]:
# Cap Fare at the 95th percentile of the training split; the same threshold is
# applied to the test split so both are clipped consistently.
fare_cap = train_df['Fare'].quantile(0.95)
for frame in (train_df, test_df):
    frame['Fare'] = frame['Fare'].clip(upper=fare_cap)

### Feature Engineering

In [18]:
# Simplify titles: collapse the honorifics extracted from Name into five groups
# (Mr, Miss, Mrs, Master, Rare).
title_mapping = {
    'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
    'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
    'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
    'Jonkheer': 'Rare', 'Don': 'Rare', 'Mme': 'Mrs', 'Capt': 'Rare',
    'Sir': 'Rare', 'Dona': 'Rare'
}

# Number of rows per split whose title was missing from title_mapping (or whose
# Name had no extractable title). Counted BEFORE the fallback below so the
# printed check still reports real gaps in the mapping.
unmapped_titles = {}

for split_label, frame in (('Train', train_df), ('Test', test_df)):
    # FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # IsAlone is 1 when FamilySize is exactly 1, otherwise 0.
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)

    # Title is the run of letters that follows a space and ends at a period.
    # Raw string avoids escape sequence warnings.
    raw_titles = frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    simplified_titles = raw_titles.map(title_mapping)
    unmapped_titles[split_label] = simplified_titles.isnull().sum()

    # Series.map yields NaN for any title not present in title_mapping. Fall
    # back to 'Rare' so an unseen honorific never reaches later steps as a
    # missing value.
    frame['Title'] = simplified_titles.fillna('Rare')

# Check for unmapped titles
print("Train unmapped titles:", unmapped_titles['Train'])
print("Test unmapped titles:", unmapped_titles['Test'])

Train unmapped titles: 0
Test unmapped titles: 0


In [23]:
# Preview the first five training rows with the engineered columns.
train_df.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1,1,Mr


In [22]:
# Preview the first five test rows with the engineered columns.
# NOTE(review): the saved output shows the same Fare (14.4542) on all five rows,
# and this cell's execution count (22) is lower than the preceding cell's (23).
# The output may come from an earlier kernel state - re-check after
# Restart Kernel -> Run All.
test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,IsAlone,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,14.4542,Q,1,1,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,14.4542,S,2,0,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,14.4542,Q,1,1,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,14.4542,S,1,1,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,14.4542,S,3,0,Mrs


### Encode Categorical Variables

In [24]:
from sklearn.preprocessing import OneHotEncoder

# Columns to expand into indicator columns.
categorical_cols = ['Sex', 'Embarked', 'Title']

# Fit the encoder on the training split only; the test split is transformed
# with the categories learned there.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
train_matrix = encoder.fit_transform(train_df[categorical_cols])
test_matrix = encoder.transform(test_df[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)

# Wrap the encoded arrays in frames that share the source frames' indexes so
# the join below lines rows up correctly.
encoded_train = pd.DataFrame(train_matrix, columns=encoded_names, index=train_df.index)
encoded_test = pd.DataFrame(test_matrix, columns=encoded_names, index=test_df.index)

# Replace the original categorical columns with their encoded counterparts,
# then drop the columns that are not kept as features.
drop_cols = ['PassengerId', 'Name', 'Ticket']
train_df = train_df.drop(columns=categorical_cols).join(encoded_train).drop(columns=drop_cols)
test_df = test_df.drop(columns=categorical_cols).join(encoded_test).drop(columns=drop_cols)

In [25]:
# Summarise the training frame after encoding and column removal
# (column names, non-null counts, dtypes).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   FamilySize  891 non-null    int64  
 7   IsAlone     891 non-null    int32  
 8   Sex_male    891 non-null    float64
 9   Embarked_Q  891 non-null    float64
 10  Embarked_S  891 non-null    float64
 11  Title_Miss  891 non-null    float64
 12  Title_Mr    891 non-null    float64
 13  Title_Mrs   891 non-null    float64
 14  Title_Rare  891 non-null    float64
dtypes: float64(9), int32(1), int64(5)
memory usage: 101.1 KB


In [26]:
# Summarise the test frame after encoding and column removal
# (column names, non-null counts, dtypes).
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Age         418 non-null    float64
 2   SibSp       418 non-null    int64  
 3   Parch       418 non-null    int64  
 4   Fare        418 non-null    float64
 5   FamilySize  418 non-null    int64  
 6   IsAlone     418 non-null    int32  
 7   Sex_male    418 non-null    float64
 8   Embarked_Q  418 non-null    float64
 9   Embarked_S  418 non-null    float64
 10  Title_Miss  418 non-null    float64
 11  Title_Mr    418 non-null    float64
 12  Title_Mrs   418 non-null    float64
 13  Title_Rare  418 non-null    float64
dtypes: float64(9), int32(1), int64(4)
memory usage: 44.2 KB


### Scaling and Normalization

In [27]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise.
numerical_cols = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(train_df[numerical_cols])
for frame in (train_df, test_df):
    frame[numerical_cols] = scaler.transform(frame[numerical_cols])

In [29]:
# Preview the first five training rows after scaling.
train_df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,3,-0.565736,0.432793,-0.473674,-0.700836,0.05916,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1,1,0.663861,0.432793,-0.473674,1.491434,0.05916,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,3,-0.258337,-0.474545,-0.473674,-0.677726,-0.560975,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,1,1,0.433312,0.432793,-0.473674,0.868903,0.05916,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0,3,0.433312,-0.474545,-0.473674,-0.673447,-0.560975,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0


### Check Class Imbalance

In [30]:
# Share of each Survived class in the training split.
survival_share = train_df['Survived'].value_counts(normalize=True)
print(survival_share)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [31]:
# Absolute number of rows per Survived class in the training split.
survival_counts = train_df['Survived'].value_counts()
print(survival_counts)


Survived
0    549
1    342
Name: count, dtype: int64


In [32]:
# Persist both preprocessed splits as CSV, without the index column.
for frame, out_path in ((train_df, 'scaled_train.csv'), (test_df, 'scaled_test.csv')):
    frame.to_csv(out_path, index=False)
