In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv")
test1 = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\test.csv")

In [3]:
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

In [4]:

train.drop('PassengerId', axis=1, inplace=True)
test = test1.drop('PassengerId', axis=1)

In [5]:
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

In [6]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    int64  
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    int64  
 3   Age       332 non-nu

In [7]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(train[['Fare']])
test['Fare'] = imputer.transform(test[['Fare']])


In [8]:
train['Fare'] = np.log1p(train['Fare'])
test['Fare'] = np.log1p(test['Fare'])

In [9]:
import re
def add_title_column(df):
    """
    Cleans the 'Name' column and adds a 'Title' column directly 
    to the input DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to modify in-place.
    """
    # 1. Clean the Name column (remove text in parentheses and quotes)
    # This series is temporary and will be used to build the 'Title'
    cleaned_names = df['Name'].apply(lambda x: re.sub(r'\([^)]*\)', '', x).strip())
    cleaned_names = cleaned_names.str.replace(r'"[^"]*"', '', regex=True).str.strip()

    # 2. Extract the part of the name after the comma
    name_part = cleaned_names.str.split(',').str.get(1)

    # 3. Extract the Title from the remaining part of the name
    extracted_title = name_part.str.split('.').str.get(0).str.strip()
    
    # 4. Standardize the common titles
    title_mapping = {
        'Mlle': 'Miss',
        'Ms': 'Miss',
        'Mme': 'Mrs'
    }
    extracted_title = extracted_title.replace(title_mapping)

    # 5. Define a list of common titles
    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']

    # 6. Create the 'Title' column directly on the DataFrame
    # Categorize any title not in common_titles as 'Rare'
    df['Titel'] = extracted_title.apply(lambda x: x if x in common_titles else 'Rare')
    df.drop('Name', axis=1, inplace=True)

In [10]:

add_title_column(train)
add_title_column(test)



In [11]:
test['Pclass'].value_counts()

Pclass
3    218
1    107
2     93
Name: count, dtype: int64

In [12]:
train['Age'] = train['Age'].fillna(train.groupby(['Pclass', 'Titel'])['Age'].transform('mean'))

In [13]:
age_means_mapping = train.groupby(['Pclass', 'Titel'])['Age'].mean()
impute_values = test.set_index(['Pclass', 'Titel']).index.map(age_means_mapping)
test['Age'] = test['Age'].fillna(pd.Series(impute_values, index=test.index))
test['Age'] = test['Age'].fillna(train['Age'].mean())


In [14]:
train['Age*Class'] = train['Age'] * train['Pclass']
train['Age*Fare'] = train['Age'] * train['Fare']    
test['Age*Class'] = test['Age'] * test['Pclass']
test['Age*Fare'] = test['Age'] * test['Fare']    

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
column_to_encode = 'Titel' 
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# 2. Fit and transform the data
# The double brackets around the column name create a DataFrame slice, which is the expected input format.
one_hot_encoded = ohe.fit_transform(train[[column_to_encode]])

# 3. Create a new DataFrame with the one-hot encoded columns
# The `get_feature_names_out()` method provides meaningful names for the new columns.
one_hot_df = pd.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out([column_to_encode]))

# 4. Concatenate the new DataFrame with the original DataFrame
# We use `train.index` to ensure the rows align correctly.
train = pd.concat([train, one_hot_df], axis=1)

# 5. Drop the original column
train = train.drop(column_to_encode, axis=1)

In [17]:
one_hot_encoded = ohe.transform(test[[column_to_encode]])

# 3. Create a new DataFrame with the one-hot encoded columns
# The `get_feature_names_out()` method provides meaningful names for the new columns.
one_hot_df = pd.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out([column_to_encode]))

# 4. Concatenate the new DataFrame with the original DataFrame
# We use `test.index` to ensure the rows align correctly.
test = pd.concat([test, one_hot_df], axis=1)

# 5. Drop the original column
test = test.drop(column_to_encode, axis=1)

In [18]:
test['Pclass'].value_counts()
train['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [19]:
train['Cabin'] = train['Cabin'].astype(str).str[0].to_frame(name='Cabin')
test['Cabin'] = test['Cabin'].astype(str).str[0].to_frame(name='Cabin')

In [20]:
train = train.drop('Ticket', axis=1)
test = test.drop('Ticket', axis=1)

In [21]:
import category_encoders as ce

In [22]:
# hashing_encoder = ce.HashingEncoder(n_components=16, cols=['Ticket'])

In [23]:
# df_hashed = hashing_encoder.fit_transform(train['Ticket'])

In [24]:
# train = train.drop('Ticket', axis=1)
# train = pd.concat([train, df_hashed], axis=1)

In [25]:
# df_hashed = hashing_encoder.fit_transform(test['Ticket'])
# test = test.drop('Ticket', axis=1)
# test = pd.concat([test, df_hashed], axis=1)

In [26]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age*Class,Age*Fare,Titel_Master,Titel_Miss,Titel_Mr,Titel_Mrs,Titel_Rare
0,0,3,0,22.000000,1,0,2.110213,n,S,66.000000,46.424690,0.0,0.0,1.0,0.0,0.0
1,1,1,1,38.000000,1,0,4.280593,C,C,38.000000,162.662539,0.0,0.0,0.0,1.0,0.0
2,1,3,1,26.000000,0,0,2.188856,n,S,78.000000,56.910265,0.0,1.0,0.0,0.0,0.0
3,1,1,1,35.000000,1,0,3.990834,C,S,35.000000,139.679197,0.0,0.0,0.0,1.0,0.0
4,0,3,0,35.000000,0,0,2.202765,n,S,105.000000,77.096767,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,2.639057,n,S,54.000000,71.254548,0.0,0.0,0.0,0.0,1.0
887,1,1,1,19.000000,0,0,3.433987,B,S,19.000000,65.245757,0.0,1.0,0.0,0.0,0.0
888,0,3,1,16.123188,1,2,3.196630,n,S,48.369565,51.539871,0.0,1.0,0.0,0.0,0.0
889,1,1,0,26.000000,0,0,3.433987,C,C,26.000000,89.283667,0.0,0.0,1.0,0.0,0.0


In [27]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age*Class,Age*Fare,Titel_Master,Titel_Miss,Titel_Mr,Titel_Mrs,Titel_Rare
0,3,0,34.500000,0,0,2.178064,n,Q,103.500000,75.143222,0.0,0.0,1.0,0.0,0.0
1,3,1,47.000000,1,0,2.079442,n,S,141.000000,97.733752,0.0,0.0,0.0,1.0,0.0
2,2,0,62.000000,0,0,2.369075,n,Q,124.000000,146.882640,0.0,0.0,1.0,0.0,0.0
3,3,0,27.000000,0,0,2.268252,n,S,81.000000,61.242815,0.0,0.0,1.0,0.0,0.0
4,3,1,22.000000,1,1,2.586824,n,S,66.000000,56.910122,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,28.724891,0,0,2.202765,n,S,86.174672,63.274177,0.0,0.0,1.0,0.0,0.0
414,1,1,39.000000,0,0,4.699571,C,C,39.000000,183.283264,0.0,0.0,0.0,0.0,1.0
415,3,0,38.500000,0,0,2.110213,n,S,115.500000,81.243208,0.0,0.0,1.0,0.0,0.0
416,3,0,28.724891,0,0,2.202765,n,S,86.174672,63.274177,0.0,0.0,1.0,0.0,0.0


In [28]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Sex           891 non-null    int64  
 3   Age           891 non-null    float64
 4   SibSp         891 non-null    int64  
 5   Parch         891 non-null    int64  
 6   Fare          891 non-null    float64
 7   Cabin         891 non-null    object 
 8   Embarked      891 non-null    object 
 9   Age*Class     891 non-null    float64
 10  Age*Fare      891 non-null    float64
 11  Titel_Master  891 non-null    float64
 12  Titel_Miss    891 non-null    float64
 13  Titel_Mr      891 non-null    float64
 14  Titel_Mrs     891 non-null    float64
 15  Titel_Rare    891 non-null    float64
dtypes: float64(9), int64(5), object(2)
memory usage: 111.5+ KB


In [29]:
from sklearn.preprocessing import LabelEncoder

In [30]:
le = LabelEncoder()

In [31]:
test['Embarked'].value_counts()

Embarked
S    270
C    102
Q     46
Name: count, dtype: int64

In [32]:
train['Embarked'] = le.fit_transform(train['Embarked'])
test['Embarked'] = le.transform(test['Embarked'])

In [33]:
test['Embarked'].value_counts()

Embarked
2    270
0    102
1     46
Name: count, dtype: int64

In [34]:
train['Embarked'].value_counts()

Embarked
2    646
0    168
1     77
Name: count, dtype: int64

In [35]:
train['Cabin'] = le.fit_transform(train['Cabin'])
test['Cabin'] = le.transform(test['Cabin'])

In [36]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age*Class,Age*Fare,Titel_Master,Titel_Miss,Titel_Mr,Titel_Mrs,Titel_Rare
0,0,3,0,22.000000,1,0,2.110213,8,2,66.000000,46.424690,0.0,0.0,1.0,0.0,0.0
1,1,1,1,38.000000,1,0,4.280593,2,0,38.000000,162.662539,0.0,0.0,0.0,1.0,0.0
2,1,3,1,26.000000,0,0,2.188856,8,2,78.000000,56.910265,0.0,1.0,0.0,0.0,0.0
3,1,1,1,35.000000,1,0,3.990834,2,2,35.000000,139.679197,0.0,0.0,0.0,1.0,0.0
4,0,3,0,35.000000,0,0,2.202765,8,2,105.000000,77.096767,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,2.639057,8,2,54.000000,71.254548,0.0,0.0,0.0,0.0,1.0
887,1,1,1,19.000000,0,0,3.433987,1,2,19.000000,65.245757,0.0,1.0,0.0,0.0,0.0
888,0,3,1,16.123188,1,2,3.196630,8,2,48.369565,51.539871,0.0,1.0,0.0,0.0,0.0
889,1,1,0,26.000000,0,0,3.433987,2,0,26.000000,89.283667,0.0,0.0,1.0,0.0,0.0


In [37]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age*Class,Age*Fare,Titel_Master,Titel_Miss,Titel_Mr,Titel_Mrs,Titel_Rare
0,3,0,34.500000,0,0,2.178064,8,1,103.500000,75.143222,0.0,0.0,1.0,0.0,0.0
1,3,1,47.000000,1,0,2.079442,8,2,141.000000,97.733752,0.0,0.0,0.0,1.0,0.0
2,2,0,62.000000,0,0,2.369075,8,1,124.000000,146.882640,0.0,0.0,1.0,0.0,0.0
3,3,0,27.000000,0,0,2.268252,8,2,81.000000,61.242815,0.0,0.0,1.0,0.0,0.0
4,3,1,22.000000,1,1,2.586824,8,2,66.000000,56.910122,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,28.724891,0,0,2.202765,8,2,86.174672,63.274177,0.0,0.0,1.0,0.0,0.0
414,1,1,39.000000,0,0,4.699571,2,0,39.000000,183.283264,0.0,0.0,0.0,0.0,1.0
415,3,0,38.500000,0,0,2.110213,8,2,115.500000,81.243208,0.0,0.0,1.0,0.0,0.0
416,3,0,28.724891,0,0,2.202765,8,2,86.174672,63.274177,0.0,0.0,1.0,0.0,0.0


In [38]:
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo


In [39]:

train_Pclass = pd.get_dummies(train['Pclass'], prefix='class', drop_first=True, dtype=int)
train_Embarked = pd.get_dummies(train['Embarked'], prefix='Embarked', drop_first=True, dtype=int)
train = pd.concat([train, train_Pclass, train_Embarked], axis=1)
train.drop(['Pclass','Embarked'], axis=1, inplace=True)

In [40]:
test_Pclass = pd.get_dummies(test['Pclass'], prefix='class', drop_first=True, dtype=int)
test_Embarked = pd.get_dummies(test['Embarked'], prefix='Embarked', drop_first=True, dtype=int)
test = pd.concat([test, test_Pclass, test_Embarked], axis=1)
test.drop(['Pclass','Embarked'], axis=1, inplace=True)

In [41]:
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

In [42]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
# Ensemble and Boosting models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# --- Model Evaluation Tools ---
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
models = {
    "Logistic Regression": LogisticRegression(solver='liblinear'),
    "Ridge Classifier": RidgeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "SVC": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss'),
    "CatBoost": cb.CatBoostClassifier(verbose=0),
    "MLP Classifier": MLPClassifier(max_iter=500, random_state=42)
}

In [43]:
# results = []
# names = []

# print("Evaluating models...")
# for name, model in models.items():
#     # 1. Define the cross-validation strategy
#     # Using StratifiedKFold is good practice for classification problems
#     cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

#     # 2. Perform cross-validation
#     cv_results = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

#     # 3. Store the results
#     results.append(cv_results)
#     names.append(name)
# # 
#     # 4. Print the mean and std dev of the results
#     print(f"{name}: {cv_results.mean():.4f} Accuracy with a std of {cv_results.std():.4f}")




In [44]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sex           891 non-null    int64  
 1   Age           891 non-null    float64
 2   SibSp         891 non-null    int64  
 3   Parch         891 non-null    int64  
 4   Fare          891 non-null    float64
 5   Cabin         891 non-null    int32  
 6   Age*Class     891 non-null    float64
 7   Age*Fare      891 non-null    float64
 8   Titel_Master  891 non-null    float64
 9   Titel_Miss    891 non-null    float64
 10  Titel_Mr      891 non-null    float64
 11  Titel_Mrs     891 non-null    float64
 12  Titel_Rare    891 non-null    float64
 13  class_2       891 non-null    int32  
 14  class_3       891 non-null    int32  
 15  Embarked_1    891 non-null    int32  
 16  Embarked_2    891 non-null    int32  
dtypes: float64(9), int32(5), int64(3)
memory usage: 101.1 KB


In [45]:
import numpy as np
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV

# --- Hyperparameter Tuning with GridSearchCV ---

# Initialize the RidgeClassifier
ridge = RidgeClassifier(random_state=42)

# Define the hyperparameter grid to search
# 'alpha' is the most important hyperparameter for Ridge, controlling regularization strength.
param_grid = {
    'alpha': [0.1, 1.0, 10.0, 50.0, 100.0],
    'solver': ['svd', 'cholesky', 'lsqr', 'sag']
}

# Set up GridSearchCV
# cv=5 means 5-fold cross-validation
# return_train_score=True is essential to get the training scores
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, return_train_score=True)

# Fit the grid search to the data
# I am assuming you have X_train and y_train available from your preprocessing steps.
grid_search.fit(X_train, y_train)

# --- Displaying the Results ---

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

# Access the results and calculate the desired ratio
cv_results = grid_search.cv_results_
for i in range(len(cv_results['params'])):
    mean_cv_score = cv_results['mean_test_score'][i]
    mean_train_score = cv_results['mean_train_score'][i]
    
    # Calculate the ratio, handle division by zero if a score is 0
    ratio = mean_cv_score / mean_train_score if mean_train_score > 0 else 0
    
    print("\nParameters: ", cv_results['params'][i])
    print("Mean CV Score: {:.4f}".format(mean_cv_score))
    print("Mean Train Score: {:.4f}".format(mean_train_score))
    print("CV Score / Train Score Ratio: {:.4f}".format(ratio))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found:  {'alpha': 1.0, 'solver': 'svd'}
Best cross-validation score: 0.8305

Parameters:  {'alpha': 0.1, 'solver': 'svd'}
Mean CV Score: 0.8271
Mean Train Score: 0.8364
CV Score / Train Score Ratio: 0.9889

Parameters:  {'alpha': 0.1, 'solver': 'cholesky'}
Mean CV Score: 0.8271
Mean Train Score: 0.8364
CV Score / Train Score Ratio: 0.9889

Parameters:  {'alpha': 0.1, 'solver': 'lsqr'}
Mean CV Score: 0.8271
Mean Train Score: 0.8345
CV Score / Train Score Ratio: 0.9912

Parameters:  {'alpha': 0.1, 'solver': 'sag'}
Mean CV Score: 0.8182
Mean Train Score: 0.8232
CV Score / Train Score Ratio: 0.9939

Parameters:  {'alpha': 1.0, 'solver': 'svd'}
Mean CV Score: 0.8305
Mean Train Score: 0.8361
CV Score / Train Score Ratio: 0.9933

Parameters:  {'alpha': 1.0, 'solver': 'cholesky'}
Mean CV Score: 0.8305
Mean Train Score: 0.8361
CV Score / Train Score Ratio: 0.9933

Parameters:  {'alpha': 1.0, 'solver': 'lsqr'}
Mean CV 

In [46]:
test_predictions = grid_search.predict(test)


In [47]:
test

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Age*Class,Age*Fare,Titel_Master,Titel_Miss,Titel_Mr,Titel_Mrs,Titel_Rare,class_2,class_3,Embarked_1,Embarked_2
0,0,34.500000,0,0,2.178064,8,103.500000,75.143222,0.0,0.0,1.0,0.0,0.0,0,1,1,0
1,1,47.000000,1,0,2.079442,8,141.000000,97.733752,0.0,0.0,0.0,1.0,0.0,0,1,0,1
2,0,62.000000,0,0,2.369075,8,124.000000,146.882640,0.0,0.0,1.0,0.0,0.0,1,0,1,0
3,0,27.000000,0,0,2.268252,8,81.000000,61.242815,0.0,0.0,1.0,0.0,0.0,0,1,0,1
4,1,22.000000,1,1,2.586824,8,66.000000,56.910122,0.0,0.0,0.0,1.0,0.0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,28.724891,0,0,2.202765,8,86.174672,63.274177,0.0,0.0,1.0,0.0,0.0,0,1,0,1
414,1,39.000000,0,0,4.699571,2,39.000000,183.283264,0.0,0.0,0.0,0.0,1.0,0,0,0,0
415,0,38.500000,0,0,2.110213,8,115.500000,81.243208,0.0,0.0,1.0,0.0,0.0,0,1,0,1
416,0,28.724891,0,0,2.202765,8,86.174672,63.274177,0.0,0.0,1.0,0.0,0.0,0,1,0,1


In [48]:
test_predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [49]:
submission_df = pd.DataFrame({
    'PassengerId': test1['PassengerId'],  # Or whatever your identifier column is named
    'Survived': test_predictions
})
submission_df.to_csv('submission.csv', index=False)