In [218]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Load data
df = pd.read_csv('/kaggle/input/titanic/train.csv')

# Impute missing values
mean_age = df['Age'].mean()
df['Age'].fillna(mean_age, inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop unnecessary columns
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Encode categorical features
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

# Split data into features and target
targets = df['Survived']
df = df.drop(columns='Survived')

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df, targets, test_size=0.2)

# Oversample using SMOTE
smote = SMOTE()
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Initialize and train the model
gb = CatBoostClassifier(n_estimators=200, random_state=42, verbose=False, learning_rate=0.01)
gb.fit(x_train_resampled, y_train_resampled)

# Make predictions
gb_pred = gb.predict(x_test)

# Evaluate the model
print(metrics.classification_report(y_test, gb_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       125
           1       0.83      0.72      0.77        54

    accuracy                           0.87       179
   macro avg       0.86      0.83      0.84       179
weighted avg       0.87      0.87      0.87       179



In [222]:
gb_pred = gb.predict(x_test)

# Create a DataFrame with the 'Survived' column
submission_gb = pd.DataFrame({'Survived': gb_pred})

# Save the DataFrame to a CSV file
submission_gb.to_csv('gb_submission.csv', index=False)

print("Gradient Boosting submission was successfully saved!")

Gradient Boosting submission was successfully saved!
