### Preprocessing the train dataset.

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd

# Path of the training CSV on the mounted Drive.
file_path = '/content/drive/My Drive/titanic/data/train.csv'
train = pd.read_csv(file_path, low_memory=False)

# Preview the first 10 rows.
train.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [None]:
# Share of each 'Survived' value, expressed as a percentage of all rows.
survival_ratio = train['Survived'].value_counts(normalize=True).mul(100)

# Show the percentages as the cell output.
survival_ratio

Unnamed: 0_level_0,proportion
Survived,Unnamed: 1_level_1
0,61.616162
1,38.383838


In [None]:
# Print column dtypes and non-null counts to spot columns with missing data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Check the number and percentage of missing values.

In [None]:
import pandas as pd

# Per-column count of missing cells, and the same figure as a percentage of all rows.
missing_values_count = train.isna().sum()
missing_percentage = missing_values_count.div(len(train)).mul(100)

# Put both measures in one table, most-affected columns first.
missing_data = pd.DataFrame({
    'Number of missing values': missing_values_count,
    'Percentage of missing values(%)': missing_percentage,
})
missing_data = missing_data.sort_values(by='Percentage of missing values(%)', ascending=False)

# Print at most the 15 columns with the highest share of missing data.
print(missing_data.head(15))


             Number of missing values  Percentage of missing values(%)
Cabin                             687                        77.104377
Age                               177                        19.865320
Embarked                            2                         0.224467
PassengerId                         0                         0.000000
Survived                            0                         0.000000
Pclass                              0                         0.000000
Name                                0                         0.000000
Sex                                 0                         0.000000
SibSp                               0                         0.000000
Parch                               0                         0.000000
Ticket                              0                         0.000000
Fare                                0                         0.000000


### Drop unnecessary columns.

In [None]:
# Columns to remove from the training frame.
columns_to_drop = ['PassengerId', 'Cabin']

# Drop them and keep the reduced frame under the same name.
train = train.drop(columns_to_drop, axis=1)

# Report how many columns remain.
print("Number of columns after deletion:", len(train.columns))


Number of columns after deletion: 10


### Check basic information.

In [None]:
def data_summary(df):
    """Print a quick overview of ``df``: schema, missing values, and statistics.

    Args:
        df: The pandas DataFrame to summarize.

    Returns:
        None. Everything is written to standard output.
    """
    print("Basic data information:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None" line.
    df.info()
    print("\nNumber of missing values:")
    print(df.isnull().sum().sort_values(ascending=False))
    print("\nStatistical summary:")
    print(df.describe())

# Print the summary for the training frame.
data_summary(train)


Basic data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB
None

Number of missing values:
Age         177
Embarked      2
Survived      0
Pclass        0
Name          0
Sex           0
SibSp         0
Parch         0
Ticket        0
Fare          0
dtype: int64

Statistical summary:
         Survived      Pclass         Age       SibSp       Parch        Fare
count  891.000000  891.000000  714.

### Check the number of unique values in each column.

In [None]:
# Count the distinct values in each column and list the columns from highest to lowest count.
unique_values = train.nunique().sort_values(ascending=False)
print(unique_values)


Name        891
Ticket      681
Fare        248
Age          88
SibSp         7
Parch         7
Pclass        3
Embarked      3
Survived      2
Sex           2
dtype: int64


### Check unique values by column.

In [None]:
# Function to check unique values for a specified column.
def check_unique_values(column_name, df=None, max_preview=10):
    """Print the distinct values of one column, previewing high-cardinality ones.

    Args:
        column_name: Name of the column to inspect.
        df: DataFrame to inspect. Defaults to the notebook-level ``train`` frame.
        max_preview: How many values to show when the column has more than
            50 distinct values.

    Returns:
        None. The report is written to standard output.
    """
    if df is None:
        df = train
    if column_name not in df.columns:
        print(f"{column_name} does not exist in the DataFrame.")
        return
    unique_values = df[column_name].unique()
    unique_count = len(unique_values)
    print(f"{column_name}: {unique_count} unique values")
    if unique_count > 50:
        # The label and the slice both use max_preview so they cannot disagree.
        print(f"First {max_preview} values: {unique_values[:max_preview]}")
    else:
        print(f"All values: {unique_values}")

# Inspect the distinct values of the 'Fare' column.
check_unique_values('Fare')


Fare: 248 unique values
First 10 values: [  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55    31.275    7.8542  16.      29.125
  13.      18.       7.225   26.       8.0292  35.5     31.3875 263.
   7.8792   7.8958  27.7208 146.5208   7.75    10.5     82.1708  52.
   7.2292  11.2417   9.475   21.      41.5792  15.5     21.6792  17.8
  39.6875   7.8     76.7292  61.9792  27.75    46.9     80.      83.475
  27.9     15.2458   8.1583   8.6625  73.5     14.4542  56.4958   7.65
  29.      12.475    9.       9.5      7.7875  47.1     15.85    34.375
  61.175   20.575   34.6542  63.3583  23.      77.2875   8.6542   7.775
  24.15     9.825   14.4583 247.5208   7.1417  22.3583   6.975    7.05
  14.5     15.0458  26.2833   9.2167  79.2      6.75    11.5     36.75
   7.7958  12.525   66.6      7.3125  61.3792   7.7333  69.55    16.1
  15.75    20.525   55.      25.925 ]


### Extract the title from the 'Name' column

In [None]:
import re

# Capture the honorific: the run of letters that follows a space and ends with a period
# (e.g. "Mr" in "Braund, Mr. Owen Harris"). The pattern is a raw string so that "\." is
# passed to the regex engine as written instead of being an invalid Python string escape.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)


### Map the general categories for the 'Title' column.

In [None]:
# Titles grouped by the integer code they are mapped to.
titles_by_code = {
    # Male titles, including professional, clerical, noble and military ones.
    0: ['Mr', 'Dr', 'Sir', 'Don', 'Jonkheer', 'Rev', 'Major', 'Col', 'Capt'],
    # Unmarried female titles, plus 'Lady'.
    1: ['Miss', 'Lady', 'Mlle'],
    # Married female titles ('Ms' is aligned with 'Mrs').
    2: ['Mrs', 'Ms', 'Mme', 'Countess'],
    # Young male (boy).
    3: ['Master'],
}

# Flatten the groups into the title -> code lookup used for mapping.
title_mapping = {
    title: code
    for code, titles in titles_by_code.items()
    for title in titles
}

# Translate each extracted title into its code.
train['Mapped_Title'] = train['Title'].map(title_mapping)

# Show the title next to its code to verify the mapping.
print(train[['Title', 'Mapped_Title']])



    Title  Mapped_Title
0      Mr             0
1     Mrs             2
2    Miss             1
3     Mrs             2
4      Mr             0
..    ...           ...
886   Rev             0
887  Miss             1
888  Miss             1
889    Mr             0
890    Mr             0

[891 rows x 2 columns]


In [None]:
# Fill missing ages with the median age of the rows sharing the same Mapped_Title code.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# train["Age"]: that chained form acts on an intermediate object, raises a pandas
# FutureWarning, and no longer updates `train` in pandas 3.0.
train["Age"] = train["Age"].fillna(train.groupby("Mapped_Title")["Age"].transform("median"))
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Name          891 non-null    object 
 3   Sex           891 non-null    object 
 4   Age           891 non-null    float64
 5   SibSp         891 non-null    int64  
 6   Parch         891 non-null    int64  
 7   Ticket        891 non-null    object 
 8   Fare          891 non-null    float64
 9   Embarked      889 non-null    object 
 10  Title         891 non-null    object 
 11  Mapped_Title  891 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(train.groupby("Mapped_Title")["Age"].transform("median"), inplace=True)


### Combine 'SibSp' and 'Parch' to create a new feature.

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)

### Fill missing values in 'Embarked' with 'missing', then convert 'Sex' and 'Embarked' to categorical types.

In [None]:
# Replace missing 'Embarked' entries with the literal label 'missing', then store
# both 'Sex' and 'Embarked' as pandas categoricals.
train = (
    train
    .assign(Embarked=train['Embarked'].fillna('missing'))
    .astype({'Sex': 'category', 'Embarked': 'category'})
)


### Extract the alphabetical and numerical parts of the 'Ticket' and treat them as separate features

In [None]:
import re

# Extract the prefix (non-numeric part) of the 'Ticket' column.
def extract_ticket_prefix(ticket):
    """Return the leading run of letters, dots and slashes of a ticket string.

    Matching stops at the first character outside ``[a-zA-Z./]`` (for example a
    digit or a space). A ticket that does not start with such a character
    yields ``'UNKNOWN'``.
    """
    leading = re.match(r"([a-zA-Z./]+)", ticket)
    if leading is None:
        return 'UNKNOWN'
    return leading.group(0)

# Extract the numeric part of the 'Ticket' column.
def extract_ticket_number(ticket):
    """Return the last run of digits in a ticket string as an int, or 0 if there is none."""
    digit_runs = re.findall(r"\d+", ticket)
    if not digit_runs:
        return 0
    return int(digit_runs[-1])

# Derive both ticket features from the raw 'Ticket' string.
train = train.assign(
    TicketPrefix=train['Ticket'].map(extract_ticket_prefix),
    TicketNumber=train['Ticket'].map(extract_ticket_number),
)


### Calculate the length (number of digits) of the ticket number.

In [None]:
# Number of characters in the decimal form of each ticket number.
train['TicketNumberLength'] = train['TicketNumber'].astype(str).str.len()

# Show how many tickets fall into each digit count.
print(train['TicketNumberLength'].value_counts())



TicketNumberLength
6    423
5    246
4    165
7     44
3      7
1      6
Name: count, dtype: int64


In [None]:
# Convert the number of digits into categorical labels (single-, double-, triple-, quadruple-,
# quintuple-, sextuple-, septuple-digit)
def categorize_ticket_length(length):
    """Map a ticket number's digit count to a descriptive label.

    Args:
        length: Number of digits in the ticket number.

    Returns:
        ``'single-digit'`` for lengths of 1 or less, the matching label for
        lengths 2 through 6, and ``'septuple-digit'`` for anything longer.
    """
    if length <= 1:
        return 'single-digit'
    named_lengths = {
        # Two-digit numbers get their own label instead of falling through to
        # 'septuple-digit' as they previously did.
        2: 'double-digit',
        3: 'triple-digit',
        4: 'quadruple-digit',
        5: 'quintuple-digit',
        6: 'sextuple-digit',
    }
    # Every remaining length (7 or more) keeps the original catch-all label.
    return named_lengths.get(length, 'septuple-digit')

# Label every ticket by its digit count and store the labels as a categorical column.
train['TicketNumberLengthGroup'] = (
    train['TicketNumberLength'].map(categorize_ticket_length).astype('category')
)


### Extract the first 2 digits based on the number of digits

In [None]:
import pandas as pd

# Function to extract the first 2 digits considering the number of digits
def extract_ticket_prefix_with_length(ticket_number):
    """Build a ``"<digit count>_<leading digits>"`` key for a ticket number.

    The key combines the length of the number's string form with its first
    two characters, e.g. ``21171`` becomes ``"5_21"``. A one-character number
    contributes that single character, e.g. ``3`` becomes ``"1_3"``.
    """
    digits = str(ticket_number)
    # Slicing past the end is safe, so a one-character string is taken whole.
    return f"{len(digits)}_{digits[:2]}"

# Recompute the digit count of each ticket number.
train['TicketNumberLength'] = train['TicketNumber'].astype(str).str.len()

# Build the "<digit count>_<first two digits>" key and store it as a categorical column.
train['TicketNumberPrefix'] = (
    train['TicketNumber'].map(extract_ticket_prefix_with_length).astype('category')
)

# Show how many tickets share each key.
print("List of categories:")
print(train['TicketNumberPrefix'].value_counts())



List of categories:
TicketNumberPrefix
6_34    128
5_17     75
6_11     72
4_26     59
7_31     44
       ... 
4_62      1
4_65      1
5_10      1
4_92      1
4_32      1
Name: count, Length: 85, dtype: int64


In [None]:
# Function to check unique values for a specified column.
def check_unique_values(column_name, df=None, max_preview=10):
    """Print the distinct values of one column, previewing high-cardinality ones.

    Args:
        column_name: Name of the column to inspect.
        df: DataFrame to inspect. Defaults to the notebook-level ``train`` frame.
        max_preview: How many values to show when the column has more than
            50 distinct values.

    Returns:
        None. The report is written to standard output.
    """
    if df is None:
        df = train
    if column_name not in df.columns:
        print(f"{column_name} does not exist in the DataFrame.")
        return
    unique_values = df[column_name].unique()
    unique_count = len(unique_values)
    print(f"{column_name}: {unique_count} unique values")
    if unique_count > 50:
        # The label and the slice both use max_preview so they cannot disagree.
        print(f"First {max_preview} values: {unique_values[:max_preview]}")
    else:
        print(f"All values: {unique_values}")

# Inspect the distinct values of the 'TicketNumberPrefix' column.
check_unique_values('TicketNumberPrefix')

TicketNumberPrefix: 85 unique values
First 10 values: ['5_21', '5_17', '7_31', '6_11', '6_37', ..., '4_84', '4_65', '1_3', '4_22', '4_55']
Length: 85
Categories (85, object): ['1_0', '1_3', '3_54', '3_69', ..., '6_37', '6_38', '6_39', '7_31']


In [None]:
# Drop the raw text columns and the helper digit-count column
train = train.drop(columns=['Name', 'Title', 'Ticket', 'TicketNumberLength'])

# Convert TicketPrefix to categorical type
train['TicketPrefix'] = train['TicketPrefix'].astype('category')

# Verify the results. info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Survived                 891 non-null    int64   
 1   Pclass                   891 non-null    int64   
 2   Sex                      891 non-null    category
 3   Age                      891 non-null    float64 
 4   SibSp                    891 non-null    int64   
 5   Parch                    891 non-null    int64   
 6   Fare                     891 non-null    float64 
 7   Embarked                 891 non-null    category
 8   Mapped_Title             891 non-null    int64   
 9   FamilySize               891 non-null    int64   
 10  TicketPrefix             891 non-null    category
 11  TicketNumber             891 non-null    int64   
 12  TicketNumberLengthGroup  891 non-null    category
 13  TicketNumberPrefix       891 non-null    category
dtypes: categor

### Convert 'male' to 0 and 'female' to 1 in the 'Sex' column

In [None]:
# Encode 'Sex' as 0 (male) / 1 (female) in a new 'Mapped_Sex' column, then remove the original.
train = train.assign(
    Mapped_Sex=train['Sex'].map({'male': 0, 'female': 1})
).drop(columns=['Sex'])

# Preview the result.
train.head()



Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Mapped_Title,FamilySize,TicketPrefix,TicketNumber,TicketNumberLengthGroup,TicketNumberPrefix,Mapped_Sex
0,0,3,22.0,1,0,7.25,S,0,2,A/,21171,quintuple-digit,5_21,0
1,1,1,38.0,1,0,71.2833,C,2,2,PC,17599,quintuple-digit,5_17,1
2,1,3,26.0,0,0,7.925,S,1,1,STON/O,3101282,septuple-digit,7_31,1
3,1,1,35.0,1,0,53.1,S,2,2,UNKNOWN,113803,sextuple-digit,6_11,1
4,0,3,35.0,0,0,8.05,S,0,1,UNKNOWN,373450,sextuple-digit,6_37,0


### Create the age_pclass_group feature

In [None]:
# Age bucket edges and their labels. pd.cut's default right=True makes every bucket
# include its upper edge and exclude its lower one, e.g. (20, 30].
bins = [0, 20, 30, 40, 50, 60, 70, 80]
labels = ['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70+']
train['AgeGroup'] = pd.cut(train['Age'], bins, labels=labels)

# Join the age bucket and the passenger class into one string key such as "20-30_Pclass_3".
age_group_text = train['AgeGroup'].astype(str)
pclass_text = train['Pclass'].astype(str)
train['Age_Pclass_Group'] = age_group_text.str.cat(pclass_text, sep='_Pclass_')

# Show the inputs next to the derived columns to verify.
print(train.loc[:, ['Age', 'Pclass', 'AgeGroup', 'Age_Pclass_Group']].head())


    Age  Pclass AgeGroup Age_Pclass_Group
0  22.0       3    20-30   20-30_Pclass_3
1  38.0       1    30-40   30-40_Pclass_1
2  26.0       3    20-30   20-30_Pclass_3
3  35.0       1    30-40   30-40_Pclass_1
4  35.0       3    30-40   30-40_Pclass_3


### Create the 'FareGroup', 'FamilySizeGroup', and 'Age_Pclass' features

In [None]:
# Set bins for Fare. The last edge is open-ended (infinity) instead of being derived from
# this frame's maximum fare: the edges then stay identical for every dataset they are
# applied to, and remain strictly increasing even if no fare exceeds 200.
fare_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, float('inf')]

fare_labels = [
    '0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70',
    '70-80', '80-90', '90-100', '100-200', '200+'
]

# Bin Fare into groups (right=False: each bin includes its left edge and excludes its right edge)
train['FareGroup'] = pd.cut(train['Fare'], bins=fare_bins, labels=fare_labels, right=False)

# Categorize FamilySize: 1 -> 'Alone', 2-4 -> 'Small', 5-15 -> 'Large'
# (default right=True: each bin includes its right edge)
train['FamilySizeGroup'] = pd.cut(
    train['FamilySize'], bins=[0, 1, 4, 15], labels=['Alone', 'Small', 'Large']
)

# Create Age * Pclass feature
train['Age_Pclass'] = train['Age'] * train['Pclass']

# Verify results for FareGroup
print(train['FareGroup'].value_counts())
print(train['FareGroup'].isna().sum())  # Check for missing values

# Verify results for FamilySizeGroup
print(train['FamilySizeGroup'].value_counts())
print(train['FamilySizeGroup'].isna().sum())  # Check for missing values



FareGroup
0-10       336
10-20      179
20-30      136
30-40       64
50-60       39
100-200     33
70-80       29
200+        20
60-70       17
40-50       15
80-90       15
90-100       8
Name: count, dtype: int64
0
FamilySizeGroup
Alone    537
Small    292
Large     62
Name: count, dtype: int64
0


### Count the passengers in each 'FamilySizeGroup' and 'Pclass' combination

In [None]:
# Row count for every (FamilySizeGroup, Pclass) pair, keyed by that pair.
family_size_pclass_counts = {
    (group, pclass): int(
        ((train['FamilySizeGroup'] == group) & (train['Pclass'] == pclass)).sum()
    )
    for group in ['Alone', 'Small', 'Large']
    for pclass in [1, 2, 3]
}

# Print one line per pair.
for key, count in family_size_pclass_counts.items():
    group, pclass = key
    print(f"Total count for {group} & Pclass={pclass}: {count}")


Total count for Alone & Pclass=1: 109
Total count for Alone & Pclass=2: 104
Total count for Alone & Pclass=3: 324
Total count for Small & Pclass=1: 101
Total count for Small & Pclass=2: 78
Total count for Small & Pclass=3: 113
Total count for Large & Pclass=1: 6
Total count for Large & Pclass=2: 2
Total count for Large & Pclass=3: 54


### Create the 'family_size_pclass_total' column

In [None]:
import pandas as pd

# Count the rows in each (FamilySizeGroup, Pclass) combination directly from the data,
# so the totals cannot drift from a hand-copied table when the input changes.
family_size_pclass_counts = (
    train.groupby(['FamilySizeGroup', 'Pclass'], observed=True).size().to_dict()
)

# Create the 'family_size_pclass_total' column: each row receives the size of its own
# combination; a combination absent from the table (e.g. a missing group) receives 0.
train['family_size_pclass_total'] = [
    family_size_pclass_counts.get(key, 0)
    for key in zip(train['FamilySizeGroup'], train['Pclass'])
]

# Verify the results
print(train[['FamilySizeGroup', 'Pclass', 'family_size_pclass_total']].head())



  FamilySizeGroup  Pclass  family_size_pclass_total
0           Small       3                       113
1           Small       1                       101
2           Alone       3                       324
3           Small       1                       101
4           Alone       3                       324


### Create family_size_pclass_survived feature

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Check that the columns needed for the survival-rate feature exist.
required_columns = ['FamilySizeGroup', 'Pclass', 'Survived']
available_columns = set(train.columns)
missing_columns = [name for name in required_columns if name not in available_columns]
if len(missing_columns) > 0:
    raise ValueError(f"Missing columns in the dataset: {missing_columns}")

# Function to create the 'family_size_pclass_survived' feature
def add_family_size_pclass_survived_feature(df, group_cols, target_col):
    """Compute the mean of ``target_col`` for each combination of ``group_cols``.

    Despite its name, this function does not add a column to ``df``; it returns
    a lookup table for the caller to merge in. It performs no leakage
    protection of its own: the statistics reflect whatever rows ``df`` holds,
    so the caller decides which subset they are computed from.

    Args:
        df: DataFrame containing ``group_cols`` and ``target_col``.
        group_cols: Column name(s) to group by.
        target_col: Column whose per-group mean is computed.

    Returns:
        A DataFrame with one row per observed group: the grouping columns plus
        a ``'family_size_pclass_survived'`` column holding the group mean.
    """
    group_means = df.groupby(group_cols, observed=True)[target_col].mean()
    # reset_index(name=...) turns the group keys back into columns and names the value column.
    return group_means.reset_index(name='family_size_pclass_survived')

# Split the data into training and held-out subsets (fixed seed for a repeatable split)
train_data, test_data = train_test_split(train, test_size=0.2, random_state=42)

# Calculate 'family_size_pclass_survived' from the training subset only, so the
# held-out rows' labels never contribute to the statistic.
# NOTE(review): each training row's own label is still part of its group's mean.
survival_stats = add_family_size_pclass_survived_feature(
    train_data, group_cols=['FamilySizeGroup', 'Pclass'], target_col='Survived'
)

# Merge the feature into the training dataset
train_data = train_data.merge(survival_stats, on=['FamilySizeGroup', 'Pclass'], how='left')

# Merge the feature into the held-out dataset
test_data = test_data.merge(survival_stats, on=['FamilySizeGroup', 'Pclass'], how='left')

# Groups that do not occur in the training subset have no rate after the merge.
# Fall back to the training subset's overall survival rate instead of 0.0, which
# would assert that nobody in an unseen group survived.
overall_survival_rate = train_data['Survived'].mean()
train_data['family_size_pclass_survived'] = (
    train_data['family_size_pclass_survived'].fillna(overall_survival_rate)
)
test_data['family_size_pclass_survived'] = (
    test_data['family_size_pclass_survived'].fillna(overall_survival_rate)
)

# Verify the results
print(train_data[['FamilySizeGroup', 'Pclass', 'family_size_pclass_survived']].head())
print(test_data[['FamilySizeGroup', 'Pclass', 'family_size_pclass_survived']].head())


  FamilySizeGroup  Pclass  family_size_pclass_survived
0           Alone       1                     0.536585
1           Alone       2                     0.364706
2           Alone       3                     0.213740
3           Small       3                     0.402174
4           Large       3                     0.068182
  FamilySizeGroup  Pclass  family_size_pclass_survived
0           Small       3                     0.402174
1           Alone       2                     0.364706
2           Alone       3                     0.213740
3           Small       2                     0.625000
4           Small       3                     0.402174


In [None]:
# Destination files for the two subsets.
train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226.csv'
test_data_path = '/content/drive/My Drive/titanic/data/internal_valid_data_1226.csv'

# Write each subset to its CSV file without the index column.
for frame, destination in ((train_data, train_data_path), (test_data, test_data_path)):
    frame.to_csv(destination, index=False)

print(f"Train data saved to: {train_data_path}")
print(f"Validation data saved to: {test_data_path}")



Train data saved to: /content/drive/My Drive/titanic/data/internal_train_data_1226.csv
Validation data saved to: /content/drive/My Drive/titanic/data/internal_valid_data_1226.csv


### Apply label encoding to columns with the object data type.

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle  # For saving the encoders

# Input files: the two subsets written by the earlier save step.
train_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226.csv'
test_path = '/content/drive/My Drive/titanic/data/internal_valid_data_1226.csv'

# Reload both subsets from disk.
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Columns to apply label encoding
columns_to_encode = [
    'Embarked', 'TicketPrefix', 'TicketNumberLengthGroup', 'TicketNumberPrefix',
    'AgeGroup', 'Age_Pclass_Group', 'FareGroup', 'FamilySizeGroup'
]

# One fitted encoder per column, keyed by column name.
label_encoders = {}
for column in columns_to_encode:
    # Fit on the string labels of both subsets so transform() never meets an unseen label.
    all_labels = pd.concat([train[column], test[column]], axis=0).astype(str)
    label_encoders[column] = LabelEncoder().fit(all_labels)

    # Replace the labels with their integer codes in both subsets.
    for frame in (train, test):
        frame[column] = label_encoders[column].transform(frame[column].astype(str))

# Persist every encoder in a single pickle file.
encoders_path = '/content/drive/My Drive/titanic/encoders/label_encoders_0126.pkl'
with open(encoders_path, 'wb') as f:
    pickle.dump(label_encoders, f)
print(f"All encoders saved to: {encoders_path}")

# Write the encoded subsets next to the unencoded ones.
train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv'
test_data_path = '/content/drive/My Drive/titanic/data/internal_valid_data_1226_encoded.csv'

for frame, destination in ((train, train_data_path), (test, test_data_path)):
    frame.to_csv(destination, index=False)

print(f"Train data saved to: {train_data_path}")
print(f"Test data saved to: {test_data_path}")


All encoders saved to: /content/drive/My Drive/titanic/encoders/label_encoders_0126.pkl
Train data saved to: /content/drive/My Drive/titanic/data/internal_train_data_1226_encoded.csv
Test data saved to: /content/drive/My Drive/titanic/data/internal_valid_data_1226_encoded.csv
