In [165]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


train = pd.read_csv('/kaggle/input/titanic/train.csv')
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
title_age_median = train.groupby('Title')['Age'].median()

#print(title_age_median)
train['Age'] = train.apply(lambda row: title_age_median[row['Title']] if pd.isna(row['Age']) else row['Age'], axis=1)
# Define age bins and labels
age_bins = [0, 5, 12, 18, 30, 45, 60, 75, 100]
age_labels = ['0-5', '5-12', '12-18', '18-30', '30-45', '45-60', '60-75', '76+']

# Create AgeGroup column
train['AgeGroup'] = pd.cut(train['Age'], bins=age_bins, labels=age_labels, right=False)

#Create Family Size
train['Familysize'] = train['SibSp'] + train['Parch'] + 1
train['isalone'] = (train['Familysize'] == 1).astype(int)
train['Sex'] = train['Sex'].map({'male' : 1 , 'female' : 0})

def family_group(size):
    if size == 1:
        return 'Solo'
    elif size <= 3:
        return 'Small'
    elif size <= 5:
        return 'Medium'
    else:
        return 'Large'

#Create Family Group 
train['FamilyGroup'] = train['Familysize'].apply(family_group)

#Create Fare is missing
age_pclass_embarked_fare = train.groupby(['AgeGroup', 'Pclass', 'Embarked'])['Fare'].median().to_dict()
# Fill missing Fare based on AgeGroup, Pclass, and Embarked using the tuple key
train['Fare'] = train.apply(
    lambda row: age_pclass_embarked_fare.get((row['AgeGroup'], row['Pclass'], row['Embarked']), row['Fare']) 
    if pd.isna(row['Fare']) else row['Fare'], axis=1
)

#Fare Bins:
n_bins = 4
train['FareGroup'] = pd.qcut(train['Fare'], q=n_bins, labels=['Low', 'Medium', 'High', 'Expensive'])


X = train.drop(['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket','SibSp','Parch','isalone'], axis=1)
Y = train['Survived']
#print(X.head())

  age_pclass_embarked_fare = train.groupby(['AgeGroup', 'Pclass', 'Embarked'])['Fare'].median().to_dict()


In [173]:
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

#print(title_age_median)
test['Age'] = test.apply(lambda row: title_age_median[row['Title']] if pd.isna(row['Age']) else row['Age'], axis=1)

# Define age bins and labels
age_bins = [0, 5, 12, 18, 30, 45, 60, 75, 100]
age_labels = ['0-5', '5-12', '12-18', '18-30', '30-45', '45-60', '60-75', '76+']

# Create AgeGroup column
test['AgeGroup'] = pd.cut(test['Age'], bins=age_bins, labels=age_labels, right=False)

#Create Family Size
test['Familysize'] = test['SibSp'] + test['Parch'] + 1
test['isalone'] = (test['Familysize'] == 1).astype(int)
test['Sex'] = test['Sex'].map({'male' : 1 , 'female' : 0})

#Create Family Group 
test['FamilyGroup'] = test['Familysize'].apply(family_group)

# Fill missing Fare based on AgeGroup, Pclass, and Embarked using the tuple key
test['Fare'] = test.apply(
    lambda row: age_pclass_embarked_fare.get((row['AgeGroup'], row['Pclass'], row['Embarked']), row['Fare']) 
    if pd.isna(row['Fare']) else row['Fare'], axis=1
)

#Fare Bins:
n_bins = 4
test['FareGroup'] = pd.qcut(test['Fare'], q=n_bins, labels=['Low', 'Medium', 'High', 'Expensive'])

Xt = test.drop(['PassengerId', 'Name', 'Cabin', 'Ticket','SibSp','Parch','isalone'], axis=1)
print(Xt.head())

   Pclass  Sex   Age     Fare Embarked Title AgeGroup  Familysize FamilyGroup  \
0       3    1  34.5   7.8292        Q    Mr    30-45           1        Solo   
1       3    0  47.0   7.0000        S   Mrs    45-60           2       Small   
2       2    1  62.0   9.6875        Q    Mr    60-75           1        Solo   
3       3    1  27.0   8.6625        S    Mr    18-30           1        Solo   
4       3    0  22.0  12.2875        S   Mrs    18-30           3       Small   

  FareGroup  
0       Low  
1       Low  
2    Medium  
3    Medium  
4    Medium  


In [184]:
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

for col in ['Title','FareGroup', 'Embarked','FamilyGroup','AgeGroup','Fare']:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')
    Xt[col] = Xt[col].astype('category')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X_encoded, Y, test_size=0.2, random_state=42)

# Random Forest model
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=500,
    cv=10,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

# Evaluate
print("Best Parameters:", random_search.best_params_)
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val)
print("Improved Accuracy:", accuracy_score(y_val, y_pred))


Fitting 10 folds for each of 500 candidates, totalling 5000 fits


In [189]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd


# Identify categorical and numerical columns
categorical_features = ['Title','FareGroup', 'Embarked','FamilyGroup','AgeGroup']
numerical_features = ['Pclass', 'Age', 'Sex', 'Fare', 'Familysize']

# Define preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Create pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='rbf', C=1, gamma='scale'))
])


param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 0.01, 0.1, 1],
    'classifier__class_weight': [None, 'balanced']
}


# Train/Val split
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

# Train the model
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Predict
# Now use best estimator (the fitted pipeline)
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_val)

# Evaluate
print("SVM Accuracy with Categorical Handling:", accuracy_score(y_val, y_pred))


Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Params: {'classifier__C': 1, 'classifier__class_weight': None, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}
Best Accuracy: 0.8376940993788822
SVM Accuracy with Categorical Handling: 0.8444444444444444


In [186]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import numpy as np

# Base model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', tree_method='hist', enable_categorical=True)

# Parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=100,  # increase for better results
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit to training data
random_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", random_search.best_params_)
best_model = random_search.best_estimator_

# Evaluate
y_pred = best_model.predict(X_val)
print("Improved Accuracy:", accuracy_score(y_val, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 1, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.3, 'gamma': 5, 'colsample_bytree': 1.0}
Improved Accuracy: 0.6703910614525139


In [187]:
print("Model Prediction Starts")
yt = best_pipeline.predict(Xt)
print("Model Prediction Ends")



Model Prediction Starts
Model Prediction Ends


In [188]:
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': yt
})

submission.to_csv('/kaggle/working/submission3.csv', index=False)
print("File written to output")

File written to output
