# Titanic Survival
We will load a Dataset of Titanic passenger list and define if they survive or not.

### Dependencies

In [4]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMTunerCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostClassifier


### Importing Dataset

In [6]:
train_df = pd.read_csv('Data-Science/Binary Classification/Scikit-Learn/Titanic Survival/train.csv')
test_df = pd.read_csv('Data-Science/Binary Classification/Scikit-Learn/Titanic Survival/test.csv')

In [7]:
train_df['Cabin'].fillna('0', inplace=True)
test_df['Cabin'].fillna('0', inplace=True)

In [8]:
train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
test_df['Age'].fillna(test_df['Age'].mean(), inplace=True)

train_df['Embarked'].fillna('0', inplace=True)
test_df['Embarked'].fillna('0', inplace=True)

train_df['Fare'].fillna(train_df['Fare'].mean(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].mean(), inplace=True)

### Encoding string variables

In [9]:
label_encoder = LabelEncoder()
train_df['Sex_encoded'] = label_encoder.fit_transform(train_df['Sex'])
train_df['Embarked_encoded'] = label_encoder.fit_transform(train_df['Embarked'])
train_df['Cabin_encoded'] = label_encoder.fit_transform(train_df['Cabin'])
test_df['Sex_encoded'] = label_encoder.fit_transform(test_df['Sex'])
test_df['Embarked_encoded'] = label_encoder.fit_transform(test_df['Embarked'])
test_df['Cabin_encoded'] = label_encoder.fit_transform(test_df['Cabin'])

### Drop variables and divide dataset in features and labels

In [10]:

labels_df=train_df['Survived']
features_df=train_df.loc[:, train_df.columns.isin(['Sex_encoded', 'Pclass', 'Age', 'SibSp', 'Parch'])]
test_features_df=test_df.loc[:, test_df.columns.isin(['Sex_encoded', 'Pclass', 'Age', 'SibSp', 'Parch'])]

scaler = MinMaxScaler()
features_df = pd.DataFrame(scaler.fit_transform(features_df), columns=features_df.columns)
test_features_df = pd.DataFrame(scaler.fit_transform(test_features_df), columns=test_features_df.columns)


In [11]:
# Find NaN values in train_df
nan_counts = train_df.isnull().sum()
print(nan_counts)

PassengerId         0
Survived            0
Pclass              0
Name                0
Sex                 0
Age                 0
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin               0
Embarked            0
Sex_encoded         0
Embarked_encoded    0
Cabin_encoded       0
dtype: int64


In [12]:
nan_counts = test_features_df.isnull().sum()
print(nan_counts)

Pclass         0
Age            0
SibSp          0
Parch          0
Sex_encoded    0
dtype: int64


### Creating the model

In [29]:
# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize the XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search.fit(features_df, labels_df)

# Get the best estimator
best_xgb_clf = random_search.best_estimator_

# Make predictions on the test dataset
xgb_predictions = best_xgb_clf.predict(test_features_df)




# Add the predictions to the test dataframe
test_df['xgb_survived'] = xgb_predictions

# Display the first few rows of the test dataframe to verify the predictions
test_df.head()

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded,Embarked_encoded,Cabin_encoded,xgb_survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q,1,1,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,S,0,2,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q,1,1,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S,1,2,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S,0,2,0,0


#### Fitting the model

In [30]:
# Get feature importances
feature_importances = best_xgb_clf.feature_importances_

# Create a DataFrame for visualization
feature_importances_df = pd.DataFrame({
    'Feature': features_df.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Display the feature importances
print(feature_importances_df)

       Feature  Importance
4  Sex_encoded    0.782069
0       Pclass    0.119407
2        SibSp    0.040237
1          Age    0.031631
3        Parch    0.026655


In [31]:
# Define the dataset for LightGBM
train_data = lgb.Dataset(features_df, labels_df)

# Define the parameter search space
param_grid = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
    'seed': 42
}

# Define the tuning function
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'verbosity': -1
    }
    
    cv_results = lgb.cv(
        params,
        train_data,
        nfold=3,
        stratified=True,
        shuffle=True,
        metrics='binary_logloss',
        seed=42,
        eval_train_metric=True,
    )
    return min(cv_results['train binary_logloss-mean'])

# Create the study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params
best_params.update(param_grid)

# Train the final model with the best parameters
lgb_clf = lgb.LGBMClassifier(**best_params)
lgb_clf.fit(features_df, labels_df)

# Make predictions on the test dataset
lgb_predictions = lgb_clf.predict(test_features_df)

# Add the predictions to the test dataframe
test_df['lgb_survived'] = lgb_predictions

# Display the first few rows of the test dataframe to verify the predictions
test_df.head()

[I 2024-10-18 02:13:16,982] A new study created in memory with name: no-name-0246ca7b-cc0d-4e63-8a33-d4612f148b2b
[I 2024-10-18 02:13:17,192] Trial 0 finished with value: 0.5776215044672071 and parameters: {'num_leaves': 31, 'max_depth': 4, 'learning_rate': 0.0017269497317709168, 'n_estimators': 147, 'subsample': 0.623845062913346, 'colsample_bytree': 0.9928278256087459, 'min_child_weight': 9}. Best is trial 0 with value: 0.5776215044672071.
[I 2024-10-18 02:13:20,234] Trial 1 finished with value: 0.4090882922182388 and parameters: {'num_leaves': 123, 'max_depth': 8, 'learning_rate': 0.003309398116817937, 'n_estimators': 492, 'subsample': 0.9070848376721469, 'colsample_bytree': 0.9824295834511557, 'min_child_weight': 5}. Best is trial 1 with value: 0.4090882922182388.
[I 2024-10-18 02:13:21,101] Trial 2 finished with value: 0.42936094682300086 and parameters: {'num_leaves': 54, 'max_depth': 6, 'learning_rate': 0.006077596848299224, 'n_estimators': 237, 'subsample': 0.9252414020554899, 

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded,Embarked_encoded,Cabin_encoded,xgb_survived,lgb_survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q,1,1,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,S,0,2,0,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q,1,1,0,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S,1,2,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S,0,2,0,0,1


#### Making predictions with the model

In [32]:
# Define the parameter grid for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 500, 1000]
}

# Initialize the Logistic Regression classifier
log_reg = LogisticRegression(random_state=42)

# Initialize RandomizedSearchCV
random_search_log_reg = RandomizedSearchCV(estimator=log_reg, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_log_reg.fit(features_df, labels_df)

# Get the best estimator
best_log_reg = random_search_log_reg.best_estimator_

# Make predictions on the test dataset
log_reg_predictions = best_log_reg.predict(test_features_df)

# Add the predictions to the test dataframe
test_df['log_reg_survived'] = log_reg_predictions

# Display the first few rows of the test dataframe to verify the predictions
test_df.head()

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded,Embarked_encoded,Cabin_encoded,xgb_survived,lgb_survived,log_reg_survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q,1,1,0,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,S,0,2,0,0,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q,1,1,0,0,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S,1,2,0,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S,0,2,0,0,1,1


In [33]:
# Define the parameter grid for Decision Tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}

# Initialize the Decision Tree classifier
dt_clf = DecisionTreeClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search_dt = RandomizedSearchCV(estimator=dt_clf, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_dt.fit(features_df, labels_df)

# Get the best estimator
best_dt_clf = random_search_dt.best_estimator_

# Make predictions on the test dataset
dt_predictions = best_dt_clf.predict(test_features_df)

# Add the predictions to the test dataframe
test_df['dt_survived'] = dt_predictions

# Display the first few rows of the test dataframe to verify the predictions
test_df.head()

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded,Embarked_encoded,Cabin_encoded,xgb_survived,lgb_survived,log_reg_survived,dt_survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q,1,1,0,0,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,S,0,2,0,0,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q,1,1,0,0,0,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S,1,2,0,0,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S,0,2,0,0,1,1,0


In [34]:
test_df[['PassengerId', 'xgb_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival/XGB_predictions.csv', index=False)
test_df[['PassengerId', 'lgb_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival//LGB_predictions.csv', index=False)
test_df[['PassengerId', 'log_reg_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival/LogReg_predictions.csv', index=False)
test_df[['PassengerId', 'dt_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival/DT_predictions.csv', index=False)

In [35]:
# Create a pool of all predicted loan statuses
test_df['pooled_survived'] = test_df[['xgb_survived', 'lgb_survived', 'log_reg_survived', 'dt_survived']].mode(axis=1)[0]
test_df['pooled_survived'] = test_df['pooled_survived'].astype(int)
# Display the first few rows of the test dataframe to verify the pooled predictions
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded,Embarked_encoded,Cabin_encoded,xgb_survived,lgb_survived,log_reg_survived,dt_survived,pooled_survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q,1,1,0,0,0,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,S,0,2,0,0,0,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q,1,1,0,0,0,0,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S,1,2,0,0,0,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S,0,2,0,0,1,1,0,0


In [36]:
test_df[['PassengerId', 'pooled_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival/Pooled_Predictions.csv', index=False)

In [37]:
# Create a new column 'any_predicted_loan_status' where the value is 1 if any of the predictions is 1
test_df['any_survived'] = test_df[['xgb_survived', 'lgb_survived', 'log_reg_survived', 'dt_survived']].max(axis=1)
# Display the first few rows of the test dataframe to verify the new column
test_df.head()
test_df[['PassengerId', 'any_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival/Max_Predictions.csv', index=False)

In [38]:
# Install TensorFlow Decision Forests
!pip install tensorflow_decision_forests

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.0 kB)
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting tf-keras~=2.17 (from tensorflow_decision_forests)
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Collecting ydf (from tensorflow_decision_forests)
  Downloading ydf-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Downloading tensorflow_decision_forests-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0mta [36m0:

In [42]:
import tensorflow_decision_forests as tfdf
import tensorflow as tf



# Convert the pandas DataFrame to a TensorFlow dataset
train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(train_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_encoded', 'Survived']], label="Survived")
test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(test_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_encoded']], label=None)

# Train a Random Forest model
model = tfdf.keras.RandomForestModel()
model.fit(train_dataset, verbose=0)

# Make predictions on the test dataset
tfdf_predictions = model.predict(test_dataset)

# Convert predictions to binary outcome
tfdf_predictions = (tfdf_predictions >= 0.5).astype(int)
# Add the predictions to the test dataframe
test_df['tfdf_survived'] = tfdf_predictions

# Display the first few rows of the test dataframe to verify the predictions
test_df.head()

Use /tmp/tmp_t64otzr as temporary training directory


I0000 00:00:1729228885.551084 1194904 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1729228885.551142 1194904 kernel.cc:775] Collect training examples
I0000 00:00:1729228885.551153 1194904 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1729228885.551216 1194904 kernel.cc:394] Number of batches: 1
I0000 00:00:1729228885.551220 1194904 kernel.cc:395] Number of examples: 891
I0000 00:00:1729228885.551256 1194904 kernel.cc:794] Training dataset:
Number of records: 891
Number of columns: 6

Number of columns by type:
	NUMERICAL: 5 (83.3333%)
	CATEGORICAL: 1 (16.6667%)

Columns:

NUMERICAL: 5 (83.3333%)
	0: "Age" NUMERICAL mean:29.699



Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Sex_encoded,Embarked_encoded,Cabin_encoded,xgb_survived,lgb_survived,log_reg_survived,dt_survived,pooled_survived,any_survived,tfdf_survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,...,1,1,0,0,0,0,0,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,...,0,2,0,0,0,0,1,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,...,1,1,0,0,0,0,0,0,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,...,1,2,0,0,0,0,0,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,...,0,2,0,0,1,1,0,0,1,0


In [43]:
test_df[['PassengerId', 'tfdf_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival/TFDFpredictions.csv', index=False)

In [44]:
# Install YDF
!pip install ydf -U

[0m

In [51]:
import ydf

train_dataset = train_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_encoded', 'Survived']]
test_dataset = test_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Sex_encoded']]


# Train a Gradient Boosted Trees model
model = ydf.GradientBoostedTreesLearner(label="Survived").train(train_dataset)

# Look at a model (input features, training logs, structure, etc.)
model.describe()

# Evaluate a model (e.g. roc, accuracy, confusion matrix, confidence intervals)
# model.evaluate(test_dataset)

# Generate predictions
ydf_prediction = model.predict(test_dataset)

# Analyse a model (e.g. partial dependence plot, variable importance)
# model.analyze(test_dataset)

# Benchmark the inference speed of a model
model.benchmark(test_dataset)

# Convert predictions to binary outcome
ydf_prediction = (ydf_prediction >= 0.5).astype(int)
# Add the predictions to the test dataframe
test_df['ydf_survived'] = ydf_prediction


Train model on 891 examples


max_vocab_count = -1 for column Survived, the dictionary will not be pruned by size.
"goss_alpha" set but "sampling_method" not equal to "GOSS".
"goss_beta" set but "sampling_method" not equal to "GOSS".
"selective_gradient_boosting_ratio" set but "sampling_method" not equal to "SELGB".
Data spec:
Number of records: 891
Number of columns: 6

Number of columns by type:
	NUMERICAL: 5 (83.3333%)
	CATEGORICAL: 1 (16.6667%)

Columns:

NUMERICAL: 5 (83.3333%)
	1: "Pclass" NUMERICAL mean:2.30864 min:1 max:3 sd:0.835602 dtype:DTYPE_INT64
	2: "Age" NUMERICAL mean:29.6991 min:0.42 max:80 sd:12.9947 dtype:DTYPE_FLOAT64
	3: "SibSp" NUMERICAL mean:0.523008 min:0 max:8 sd:1.10212 dtype:DTYPE_INT64
	4: "Parch" NUMERICAL mean:0.381594 min:0 max:6 sd:0.805605 dtype:DTYPE_INT64
	5: "Sex_encoded" NUMERICAL mean:0.647587 min:0 max:1 sd:0.477722 dtype:DTYPE_INT64

CATEGORICAL: 1 (16.6667%)
	0: "Survived" CATEGORICAL has-dict vocab-size:3 zero-ood-items most-frequent:"0" 549 (61.6162%) dtype:DTYPE_INT64

Te

Model trained in 0:00:00.244178


Early stop of the training because the validation loss does not decrease anymore. Best valid-loss: 0.668338
Truncates the model to 50 tree(s) i.e. 50  iteration(s).
Final model num-trees:50 valid-loss:0.668338 valid-accuracy:0.876712
Engine "GradientBoostedTreesQuickScorerExtended" built


In [52]:
test_df[['PassengerId', 'ydf_survived']].to_csv('Binary Classification/Scikit-Learn/Titanic Survival/YDF_predictions.csv', index=False)

In [15]:
# Define the parameter grid for Decision Tree
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'rsm' : [0.6, 0.8, 1.0],
    
}

# Initialize the Decision Tree classifier
cb_clf = CatBoostClassifier(random_seed=42, verbose=0)

# Initialize RandomizedSearchCV
random_search_cb = RandomizedSearchCV(estimator=cb_clf, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_cb.fit(features_df, labels_df)

# Get the best estimator
best_cb_clf = random_search_cb.best_estimator_

# Make predictions on the test dataset
cb_predictions = best_cb_clf.predict(test_features_df)

# Add the predictions to the test dataframe
test_df['cb_survived'] = cb_predictions

# Display the first few rows of the test dataframe to verify the predictions
test_df.head()

Fitting 3 folds for each of 27 candidates, totalling 81 fits


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encoded,Embarked_encoded,Cabin_encoded,dt_survived,cb_survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,0,Q,1,1,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,0,S,0,2,0,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,0,Q,1,1,0,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,0,S,1,2,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,0,S,0,2,0,1,1


In [16]:
test_df[['PassengerId', 'cb_survived']].to_csv('Data-Science/Binary Classification/Scikit-Learn/Titanic Survival/CB_predictions.csv', index=False)

In [17]:
best_cb_clf.feature_importances_

array([24.09888171, 11.24888752,  5.4252049 ,  2.30161554, 56.92541033])