In [6]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import re

file_path = '/content/drive/My Drive/titanic/data/test.csv'
test = pd.read_csv(file_path, low_memory=False)

test.head(10)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


### Check the number and percentage of missing values.

In [9]:
import pandas as pd

# Calculate the number and percentage of missing values.
missing_values_count = test.isnull().sum()
missing_percentage = (missing_values_count / len(test)) * 100

# Create a DataFrame.
missing_data = pd.DataFrame({
    'Number of missing values': missing_values_count,
    'Percentage of missing values(%)': missing_percentage
}).sort_values(by='Percentage of missing values(%)', ascending=False)

# Display the top 15 rows.
print(missing_data.head(15))

             Number of missing values  Percentage of missing values(%)
Cabin                             327                        78.229665
Age                                86                        20.574163
Fare                                1                         0.239234
PassengerId                         0                         0.000000
Pclass                              0                         0.000000
Name                                0                         0.000000
Sex                                 0                         0.000000
SibSp                               0                         0.000000
Parch                               0                         0.000000
Ticket                              0                         0.000000
Embarked                            0                         0.000000


### Drop unnecessary columns.

In [10]:
# List of columns to drop.
columns_to_drop = ['Cabin']

# Remove from the DataFrame.
test = test.drop(columns=columns_to_drop)

#  Verify the results after deletion.
print("Number of columns after deletion:", test.shape[1])


Number of columns after deletion: 10


### Check basic information


In [11]:
def data_summary(df):
    print("Basic data information:")
    print(test.info())
    print("\nNumber of missing values:")
    print(test.isnull().sum().sort_values(ascending=False))
    print("\nStatistical summary:")
    print(test.describe())

# Summary
data_summary(test)


Basic data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 32.8+ KB
None

Number of missing values:
Age            86
Fare            1
PassengerId     0
Pclass          0
Name            0
Sex             0
SibSp           0
Parch           0
Ticket          0
Embarked        0
dtype: int64

Statistical summary:
       PassengerId      Pclass         Age       SibSp   

###Check the unique values in each column

In [12]:
# Check the number of unique values in each column
unique_values = test.nunique().sort_values(ascending=False)
print(unique_values)


PassengerId    418
Name           418
Ticket         363
Fare           169
Age             79
Parch            8
SibSp            7
Pclass           3
Embarked         3
Sex              2
dtype: int64


### Check unique values by column.

In [13]:
# Function to check unique values for a specified column.
def check_unique_values(column_name):
    if column_name not in test.columns:
        print(f"{column_name} does not exist in the DataFrame.")
        return
    unique_values = test[column_name].unique()
    unique_count = len(unique_values)
    print(f"{column_name}: {unique_count} unique values")
    if unique_count > 50:
        print(f"First 10 values: {unique_values[:100]}")
    else:
        print(f"All values: {unique_values}")

# Check the unique values of a column
check_unique_values('Name')


Name: 418 unique values
First 10 values: ['Kelly, Mr. James' 'Wilkes, Mrs. James (Ellen Needs)'
 'Myles, Mr. Thomas Francis' 'Wirz, Mr. Albert'
 'Hirvonen, Mrs. Alexander (Helga E Lindqvist)'
 'Svensson, Mr. Johan Cervin' 'Connolly, Miss. Kate'
 'Caldwell, Mr. Albert Francis'
 'Abrahim, Mrs. Joseph (Sophie Halaut Easu)' 'Davies, Mr. John Samuel'
 'Ilieff, Mr. Ylio' 'Jones, Mr. Charles Cresson'
 'Snyder, Mrs. John Pillsbury (Nelle Stevenson)' 'Howard, Mr. Benjamin'
 'Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)'
 'del Carlo, Mrs. Sebastiano (Argenia Genovesi)' 'Keane, Mr. Daniel'
 'Assaf, Mr. Gerios' 'Ilmakangas, Miss. Ida Livija'
 'Assaf Khalil, Mrs. Mariana (Miriam")"' 'Rothschild, Mr. Martin'
 'Olsen, Master. Artur Karl' 'Flegenheim, Mrs. Alfred (Antoinette)'
 'Williams, Mr. Richard Norris II'
 'Ryerson, Mrs. Arthur Larned (Emily Maria Borie)'
 'Robins, Mr. Alexander A' 'Ostby, Miss. Helene Ragnhild'
 'Daher, Mr. Shedid' 'Brady, Mr. John Bertram' 'Samaan, Mr. Elias'
 'Louc

### Extract the title from the 'Name' column

In [14]:
import re

test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


### Map the general categories for the 'Title' column.

In [15]:
# mapping dictionary
title_mapping = {
    'Mr': 0,         # male
    'Mrs': 2,        # Married female
    'Miss': 1,       # Unmarried female
    'Master': 3,     # Young male (boy)
    'Ms': 2,         # Married female (align Ms with Mrs)
    'Col': 0,        # Male (military colonel)
    'Rev': 0,        # Male (clergy)
    'Dr': 0,         # Male
    'Dona': 2        # Honorifics used for women in Spanish- and Portuguese-speaking regions.
}

# Map the Title column
test['Mapped_Title'] = test['Title'].map(title_mapping)

# Verify
print(test[['Title', 'Mapped_Title']])



      Title  Mapped_Title
0        Mr             0
1       Mrs             2
2        Mr             0
3        Mr             0
4       Mrs             2
..      ...           ...
413      Mr             0
414    Dona             2
415      Mr             0
416      Mr             0
417  Master             3

[418 rows x 2 columns]


### fill missing age with median age for each mapped_title

In [16]:
test["Age"].fillna(test.groupby("Mapped_Title")["Age"].transform("median"), inplace=True)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   418 non-null    int64  
 1   Pclass        418 non-null    int64  
 2   Name          418 non-null    object 
 3   Sex           418 non-null    object 
 4   Age           418 non-null    float64
 5   SibSp         418 non-null    int64  
 6   Parch         418 non-null    int64  
 7   Ticket        418 non-null    object 
 8   Fare          417 non-null    float64
 9   Embarked      418 non-null    object 
 10  Title         418 non-null    object 
 11  Mapped_Title  418 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["Age"].fillna(test.groupby("Mapped_Title")["Age"].transform("median"), inplace=True)


### Fill missing values in the Fare column

In [17]:
# Identify rows with missing values
missing_fare_row = test[test['Fare'].isna()]
print("Rows with missing values:")
print(missing_fare_row)

# Calculate the mean fare based on conditions (Embarked is 'S' and Pclass is 3)
mean_fare = test[(test['Embarked'] == 'S') & (test['Pclass'] == 3)]['Fare'].mean()

print(f"Mean fare for Embarked='S' and Pclass=3: {mean_fare:.2f}")

# Fill the missing value
test.loc[test['Fare'].isna(), 'Fare'] = mean_fare

# Verify after imputation
test.info()



Rows with missing values:
     PassengerId  Pclass                Name   Sex   Age  SibSp  Parch Ticket  \
152         1044       3  Storey, Mr. Thomas  male  60.5      0      0   3701   

     Fare Embarked Title  Mapped_Title  
152   NaN        S    Mr             0  
Mean fare for Embarked='S' and Pclass=3: 13.91
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   418 non-null    int64  
 1   Pclass        418 non-null    int64  
 2   Name          418 non-null    object 
 3   Sex           418 non-null    object 
 4   Age           418 non-null    float64
 5   SibSp         418 non-null    int64  
 6   Parch         418 non-null    int64  
 7   Ticket        418 non-null    object 
 8   Fare          418 non-null    float64
 9   Embarked      418 non-null    object 
 10  Title         418 non-null    object 
 11  Mapped_Title 

### Combine 'SibSp' and 'Parch' to create a new feature.

In [18]:
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

### Fill missing values in 'Embarked' with 'missing' and convert it to a categorical type.

In [19]:
# Fill missing values in 'Embarked' with 'missing'.
test = test.assign(Embarked=test['Embarked'].fillna('missing'))


# Convert 'Sex' and 'Embarked' to categorical types.
test['Sex'] = test['Sex'].astype('category')
test['Embarked'] = test['Embarked'].astype('category')


### Extract the alphabetical and numerical parts of the 'Ticket' and treat them as separate features

In [20]:
import re

# Extract the prefix (non-numeric part) of the 'Ticket' column.
def extract_ticket_prefix(ticket):
    match = re.match(r"([a-zA-Z./]+)", ticket)
    return match.group(0) if match else 'UNKNOWN'

# Extract the numeric part of the 'Ticket' column.
def extract_ticket_number(ticket):
    numbers = re.findall(r"\d+", ticket)
    return int(numbers[-1]) if numbers else 0

test['TicketPrefix'] = test['Ticket'].apply(extract_ticket_prefix)
test['TicketNumber'] = test['Ticket'].apply(extract_ticket_number)


### Calculate the length (number of digits) of the ticket number.

In [21]:
# Function to calculate the number of digits
test['TicketNumberLength'] = test['TicketNumber'].apply(lambda x: len(str(x)))

# Check the distribution of the number of digits
print(test['TicketNumberLength'].value_counts())


TicketNumberLength
6    183
5    131
4     80
7     16
3      6
1      2
Name: count, dtype: int64


In [22]:
# Convert the number of digits into categorical variables (e.g., single-digit, triple-digit, quadruple-digit, quintuple-digit, sextuple-digit, septuple-digit)
def categorize_ticket_length(length):
    if length <= 1:
        return 'single-digit'
    elif length == 3:
        return 'triple-digit'
    elif length == 4:
        return 'quadruple-digit'
    elif length == 5:
        return 'quintuple-digit'
    elif length == 6:
        return 'sextuple-digit'
    else:
        return 'septuple-digit'

# Treat the number of digits directly as a categorical variable
test['TicketNumberLengthGroup'] = test['TicketNumberLength'].apply(categorize_ticket_length)
test['TicketNumberLengthGroup'] = test['TicketNumberLength'].astype('category')



### Extract the first 2 digits based on the number of digits

In [23]:
import pandas as pd

# Function to extract the first 2 digits considering the number of digits
def extract_ticket_prefix_with_length(ticket_number):
    ticket_str = str(ticket_number)
    length = len(ticket_str)

    # If the number has 2 or more digits, extract the first 2 digits
    if length >= 2:
        return f"{length}_{ticket_str[:2]}"
    # If it's a single-digit number, keep it as is
    else:
        return f"{length}_{ticket_str}"

# Create a column to count the number of digits
test['TicketNumberLength'] = test['TicketNumber'].apply(lambda x: len(str(x)))

# Extract the first 2 digits based on the number of digits
test['TicketNumberPrefix'] = test['TicketNumber'].apply(extract_ticket_prefix_with_length)

# Convert to categorical type
test['TicketNumberPrefix'] = test['TicketNumberPrefix'].astype('category')

# Verify the results
print("カテゴリの一覧:")
print(test['TicketNumberPrefix'].value_counts())


カテゴリの一覧:
TicketNumberPrefix
6_34    46
5_17    38
4_26    34
6_11    29
6_36    17
        ..
4_72     1
5_23     1
4_79     1
4_92     1
5_15     1
Name: count, Length: 75, dtype: int64


In [24]:
# Drop unnecessary columns
test = test.drop(columns=['Name', 'Title', 'Ticket', 'TicketNumberLength'])

# Convert TicketPrefix to categorical type
test['TicketPrefix'] = test['TicketPrefix'].astype('category')

# Verify the results
print(test.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   PassengerId              418 non-null    int64   
 1   Pclass                   418 non-null    int64   
 2   Sex                      418 non-null    category
 3   Age                      418 non-null    float64 
 4   SibSp                    418 non-null    int64   
 5   Parch                    418 non-null    int64   
 6   Fare                     418 non-null    float64 
 7   Embarked                 418 non-null    category
 8   Mapped_Title             418 non-null    int64   
 9   FamilySize               418 non-null    int64   
 10  TicketPrefix             418 non-null    category
 11  TicketNumber             418 non-null    int64   
 12  TicketNumberLengthGroup  418 non-null    category
 13  TicketNumberPrefix       418 non-null    category
dtypes: categor

### Convert 'male' to 0 and 'female' to 1 in the 'Sex' column

In [25]:
# Map the Sex column
test['Mapped_Sex'] = test['Sex'].map({'male': 0, 'female': 1})

test = test.drop(columns=['Sex'])

test.head()


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,Mapped_Title,FamilySize,TicketPrefix,TicketNumber,TicketNumberLengthGroup,TicketNumberPrefix,Mapped_Sex
0,892,3,34.5,0,0,7.8292,Q,0,1,UNKNOWN,330911,6,6_33,0
1,893,3,47.0,1,0,7.0,S,2,2,UNKNOWN,363272,6,6_36,1
2,894,2,62.0,0,0,9.6875,Q,0,1,UNKNOWN,240276,6,6_24,0
3,895,3,27.0,0,0,8.6625,S,0,1,UNKNOWN,315154,6,6_31,0
4,896,3,22.0,1,1,12.2875,S,2,3,UNKNOWN,3101298,7,7_31,1


### Create the age_pclass_group feature

In [26]:
# Group ages into ranges (skip this step if already done)
bins = [0, 20, 30, 40, 50, 60, 70, 80]
labels = ['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70+']
test['AgeGroup'] = pd.cut(test['Age'], bins=bins, labels=labels)

# Combine AgeGroup and Pclass to create a new feature
test['Age_Pclass_Group'] = test['AgeGroup'].astype(str) + '_Pclass_' + test['Pclass'].astype(str)

# Verify
print(test[['Age', 'Pclass', 'AgeGroup', 'Age_Pclass_Group']].head())

    Age  Pclass AgeGroup Age_Pclass_Group
0  34.5       3    30-40   30-40_Pclass_3
1  47.0       3    40-50   40-50_Pclass_3
2  62.0       2    60-70   60-70_Pclass_2
3  27.0       3    20-30   20-30_Pclass_3
4  22.0       3    20-30   20-30_Pclass_3


### Create 'FareGroup' feature

In [27]:
# Set bins for Fare
fare_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, test['Fare'].max() + 1]

fare_labels = [
    '0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70',
    '70-80', '80-90', '90-100', '100-200', '200+'
]

# Bin Fare into groups (set right=False to exclude the right edge)
test['FareGroup'] = pd.cut(test['Fare'], bins=fare_bins, labels=fare_labels, right=False)

# Categorize FamilySize
test['FamilySizeGroup'] = pd.cut(
    test['FamilySize'], bins=[0, 1, 4, 15], labels=['Alone', 'Small', 'Large']
)

# Create Age * Pclass feature
test['Age_Pclass'] = test['Age'] * test['Pclass']


print(test['FareGroup'].value_counts())
print(test['FareGroup'].isna().sum())  # 欠損値がないか確認


print(test['FamilySizeGroup'].value_counts())
print(test['FamilySizeGroup'].isna().sum())  # 欠損値がないか確認




FareGroup
0-10       155
10-20       83
20-30       70
30-40       22
200+        18
50-60       17
60-70       13
100-200     13
70-80       10
40-50        7
80-90        7
90-100       3
Name: count, dtype: int64
0
FamilySizeGroup
Alone    253
Small    145
Large     20
Name: count, dtype: int64
0


### Calculate 'FamilySize_pclass_total'

In [28]:
# Initialize a dictionary to store counts
family_size_pclass_counts = {}

# Calculate the total number of Pclass for each FamilySizeGroup and Pclass combination
for group in ['Alone', 'Small', 'Large']:
    for pclass in [1, 2, 3]:
        count = test[(test['FamilySizeGroup'] == group) & (test['Pclass'] == pclass)].shape[0]
        family_size_pclass_counts[(group, pclass)] = count

# Display the results
for (group, pclass), count in family_size_pclass_counts.items():
    print(f"{group} & Pclass={pclass} の総数: {count}")


Alone & Pclass=1 の総数: 51
Alone & Pclass=2 の総数: 54
Alone & Pclass=3 の総数: 148
Small & Pclass=1 の総数: 51
Small & Pclass=2 の総数: 39
Small & Pclass=3 の総数: 55
Large & Pclass=1 の総数: 5
Large & Pclass=2 の総数: 0
Large & Pclass=3 の総数: 15


### Create the 'FamilySize_pclass_total' column

In [29]:
import pandas as pd

# Dictionary containing the total counts for each FamilySizeGroup and Pclass combination
family_size_pclass_counts = {
    ('Alone', 1): 51,
    ('Alone', 2): 54,
    ('Alone', 3): 148,
    ('Small', 1): 51,
    ('Small', 2): 39,
    ('Small', 3): 55,
    ('Large', 1): 5,
    ('Large', 2): 0,
    ('Large', 3): 15,
}

# Create the 'family_size_pclass_total' column
test['family_size_pclass_total'] = test.apply(
    lambda row: family_size_pclass_counts.get((row['FamilySizeGroup'], row['Pclass']), 0),
    axis=1
)


print(test[['FamilySizeGroup', 'Pclass', 'family_size_pclass_total']].head())

  FamilySizeGroup  Pclass  family_size_pclass_total
0           Alone       3                       148
1           Small       3                        55
2           Alone       2                        54
3           Alone       3                       148
4           Small       3                        55


### Create family_size_pclass_survived feature

In [30]:
import pandas as pd

# trainデータセットの読み込み
internal_train_data_path = '/content/drive/My Drive/titanic/data/internal_train_data_1226.csv'
internal_train_data = pd.read_csv(internal_train_data_path, low_memory=False)



required_columns = ['FamilySizeGroup', 'Pclass', 'Survived']
missing_columns = [col for col in required_columns if col not in internal_train_data.columns]
if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {missing_columns}")

# Calculate survival rates from the internal training dataset
survival_stats = internal_train_data.groupby(['FamilySizeGroup', 'Pclass'])['Survived'].mean().reset_index()
survival_stats.rename(columns={'Survived': 'family_size_pclass_survived'}, inplace=True)

# Merge survival rates into the actual test dataset
test = test.merge(survival_stats, on=['FamilySizeGroup', 'Pclass'], how='left')

# Fill missing values
test['family_size_pclass_survived'] = test['family_size_pclass_survived'].fillna(0.0)

# Verify the results
print(test[['FamilySizeGroup', 'Pclass', 'family_size_pclass_survived']].head())


  FamilySizeGroup  Pclass  family_size_pclass_survived
0           Alone       3                     0.213740
1           Small       3                     0.402174
2           Alone       2                     0.364706
3           Alone       3                     0.213740
4           Small       3                     0.402174


In [31]:
file_path = '/content/drive/My Drive/titanic/data/test_0126.csv'
test.to_csv(file_path, index=False)

print(f"データセットを {file_path} に保存しました。")

test.info()

データセットを /content/drive/My Drive/titanic/data/test_0126.csv に保存しました。
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   PassengerId                  418 non-null    int64   
 1   Pclass                       418 non-null    int64   
 2   Age                          418 non-null    float64 
 3   SibSp                        418 non-null    int64   
 4   Parch                        418 non-null    int64   
 5   Fare                         418 non-null    float64 
 6   Embarked                     418 non-null    category
 7   Mapped_Title                 418 non-null    int64   
 8   FamilySize                   418 non-null    int64   
 9   TicketPrefix                 418 non-null    category
 10  TicketNumber                 418 non-null    int64   
 11  TicketNumberLengthGroup      418 non-null    category
 

### Apply label encoding to columns with the object data type.

In [33]:
import pandas as pd
import pickle  # For loading the encoders

# File paths
test_path = '/content/drive/My Drive/titanic/data/test_0126.csv'

# Load the test dataset
test = pd.read_csv(test_path)

# Columns to apply label encoding
columns_to_encode = [
    'Embarked', 'TicketPrefix', 'TicketNumberLengthGroup', 'TicketNumberPrefix',
    'AgeGroup', 'Age_Pclass_Group', 'FareGroup', 'FamilySizeGroup'
]

# Load saved label encoders
label_encoders_path = '/content/drive/My Drive/titanic/encoders/label_encoders_0126.pkl'
with open(label_encoders_path, 'rb') as f:
    label_encoders = pickle.load(f)

# Apply label encoding and handle unseen labels
for column in columns_to_encode:
    if column in label_encoders:
        le = label_encoders[column]
        # Transform with -1 for unseen labels
        test[column] = test[column].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
    else:
        raise ValueError(f"Label encoder for column '{column}' is missing.")

# Save the encoded dataset
test_data_path = '/content/drive/My Drive/titanic/data/test_1226_encoded.csv'
test.to_csv(test_data_path, index=False)

print(f"Test data saved to: {test_data_path}")


Test data saved to: /content/drive/My Drive/titanic/data/test_1226_encoded.csv
