In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import joblib

# Load preprocessed dataset
df = pd.read_csv('preprocessed_titanic.csv')

# Feature Engineering: Adding more useful features
df['is_alone'] = (df['family_size'] == 1).astype(int)
df['fare_per_person'] = df['fare'] / df['family_size']

# Encode categorical columns
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_columns = encoder.fit_transform(df[['sex', 'embarked']])
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out())

# Drop original categorical columns and add encoded ones
df = df.drop(columns=['sex', 'embarked'])
df = pd.concat([df, encoded_df], axis=1)

# Select features and target variable
X = df[['pclass', 'age', 'fare', 'family_size', 'fare_per_person', 'is_alone'] + list(encoded_df.columns)]
y = df['survived']

# Normalize numerical features
scaler = StandardScaler()
X[['age', 'fare', 'fare_per_person']] = scaler.fit_transform(X[['age', 'fare', 'fare_per_person']])

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Trying GradientBoostingClassifier for better performance
model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred_best = model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)

# Save optimized model
joblib.dump(model, 'optimized_titanic_model.pkl')

# Print evaluation metrics
print(f'Optimized Accuracy: {accuracy_best:.4f}')
print(f'Optimized Precision: {precision_best:.4f}')
print(f'Optimized Recall: {recall_best:.4f}')
print(f'Optimized F1 Score: {f1_best:.4f}')
print("Optimized model saved as optimized_titanic_model.pkl")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['age', 'fare', 'fare_per_person']] = scaler.fit_transform(X[['age', 'fare', 'fare_per_person']])


Optimized Accuracy: 0.8249
Optimized Precision: 0.8254
Optimized Recall: 0.7222
Optimized F1 Score: 0.7704
Optimized model saved as optimized_titanic_model.pkl
