In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import re,os
import xgboost as xgb
from scipy.stats import skew


In [None]:
# Load the Titanic competition files from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
# NOTE(review): `submission` is not referenced again in the visible notebook — confirm it is still needed.
submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
# Last expression of the cell: show the first rows of the training frame.
train.head()

In [None]:
# Summary of the raw frames: each entry pairs a message template with the value
# that is substituted into it, in the order the lines are printed.
shape_report = [
    ('Number of Training Examples = {}', train.shape[0]),
    ('Number of Test Examples = {}\n', test.shape[0]),
    ('Training X Shape = {}', train.shape),
    ('Training y Shape = {}\n', train['Survived'].shape[0]),
    ('Test X Shape = {}', test.shape),
    ('Test y Shape = {}\n', test.shape[0]),
]
for template, value in shape_report:
    print(template.format(value))

# Column listings, training frame first.
for frame in (train, test):
    print(frame.columns)

In [None]:
# Show 15 random training rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
train.sample(15, random_state=42)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ProbabilityEncoder(BaseEstimator, TransformerMixin):
    """Encode each category as its relative frequency observed during ``fit``.

    ``fit`` learns a ``{category: frequency}`` mapping from a pandas Series and
    ``transform`` replaces every value with the learned frequency. Values that
    were not present when fitting have no entry in the mapping and therefore
    come out as NaN.
    """

    def __init__(self):
        # Mapping category -> relative frequency; stays None until fit() runs.
        self.probabilities = None

    def fit(self, X, y=None):
        """Learn the relative frequency of every distinct value in ``X``.

        Parameters
        ----------
        X : pandas.Series
            Categorical values to learn frequencies from. Missing values are
            ignored by ``value_counts`` and do not count towards the total.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # normalize=True divides each count by the sum of all counts, which is
        # the same ratio as counts / counts.sum() without counting twice.
        self.probabilities = X.value_counts(normalize=True).to_dict()
        return self

    def transform(self, X):
        """Replace each value of ``X`` with the frequency learned in ``fit``.

        Parameters
        ----------
        X : pandas.Series
            Values to encode.

        Returns
        -------
        pandas.Series
            Same index as ``X``; categories unseen during ``fit`` become NaN.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if self.probabilities is None:
            # Without this guard Series.map(None) fails with an unrelated
            # "'NoneType' object is not callable" error.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This ProbabilityEncoder instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        return X.map(self.probabilities)

In [None]:
# Re-read the raw CSVs so the feature engineering in this cell always starts
# from unmodified frames, even when the cell is executed more than once.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
# Test-set ids, kept to separate test rows from training rows after the two
# frames have been concatenated.
PassengerId = test['PassengerId']

def extract_titles(name):
    """Return the honorific found in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The title without the trailing period, or ``""`` when no title is
        found or ``name`` is not a string (e.g. a missing value).
    """
    if not isinstance(name, str):
        # A missing name (NaN/None) would make re.search raise a TypeError.
        return ""
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

def clean_and_combine_titles(title):
    """Collapse a raw title into one of a handful of canonical groups.

    Common titles map to themselves, spelling variants are folded into
    ``Miss``/``Mrs``, the listed rare titles become ``Noble`` and anything
    not listed becomes ``Other``.
    """
    common = {'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master'}
    variants = {'Ms': 'Miss', 'Mme': 'Mrs', 'Mlle': 'Miss'}
    rare = (
        'Dr', 'Rev', 'Don', 'Major', 'Lady', 'Sir',
        'Col', 'Capt', 'Countess', 'Jonkheer', 'Dona',
    )
    lookup = {**common, **variants, **dict.fromkeys(rare, 'Noble')}

    if title in lookup:
        return lookup[title]
    return 'Other'

def impute_age(row, means=None):
    """Return the row's age, falling back to the mean age of its title group.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'Age'`` and ``'title'``.
    means : mapping or pandas.Series, optional
        Lookup from title value to mean age. When omitted, the module-level
        ``title_means`` is used, which keeps one-argument calls such as
        ``df.apply(impute_age, axis=1)`` working.

    Returns
    -------
    The original age when present, otherwise ``means[row['title']]``.
    """
    if means is None:
        # Resolved at call time, so title_means only has to exist by then.
        means = title_means
    age = row['Age']
    # pd.isna also recognises None/pd.NA, where np.isnan raises a TypeError.
    if pd.isna(age):
        return means[row['title']]
    return age
    

# "train" must stay first: the encoders are fitted on it and then reused for "test".
data_dict = {"train": train, "test": test}

# Frequency encoders are created up front and fitted on the training split only,
# so no statistics from the test split end up in the encoding.
pe_title = ProbabilityEncoder()
pe_embarked = ProbabilityEncoder()

for key, frame in data_dict.items():
    # Raw honorific from the name, collapsed into the canonical title groups.
    frame['title'] = frame['Name'].apply(extract_titles).apply(clean_and_combine_titles)
    # Missing ports become their own "Other" category before encoding.
    frame['Embarked'] = frame['Embarked'].fillna("Other")

    if key == "train":
        frame['title'] = pe_title.fit_transform(frame['title'])
        frame['Embarked'] = pe_embarked.fit_transform(frame['Embarked'])
    else:
        frame['title'] = pe_title.transform(frame['title'])
        frame['Embarked'] = pe_embarked.transform(frame['Embarked'])

    # 1 when a cabin is recorded, 0 when the value is missing. notna() tests for
    # missing values directly instead of relying on NaN being a Python float.
    frame['Has_Cabin'] = frame['Cabin'].notna().astype(int)
    # Siblings/spouses + parents/children + the passenger.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)
    frame['Sex'] = frame['Sex'].map({'female': 0, 'male': 1}).astype(int)
    


# Stack train and test so imputation and binning are applied identically to both.
# ignore_index=True gives every row a unique label; without it the test rows
# reuse the labels 0..N-1 of the training rows.
full_data = pd.concat([train, test], ignore_index=True)

# Mean age per (encoded) title group; kept as a module-level name because
# impute_age looks it up.
title_means = full_data.groupby('title')['Age'].mean()
# Fill each missing age with the mean age of the passenger's title group.
# transform('mean') broadcasts the group mean back onto every row, which
# replaces the row-by-row apply with a single vectorised operation.
full_data['Age'] = full_data['Age'].fillna(
    full_data.groupby('title')['Age'].transform('mean')
)
# Missing fares are filled with the median fare of the training split.
full_data['Fare'] = full_data['Fare'].fillna(train['Fare'].median())

# Ordinal fare bands (right-closed intervals):
# (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, (31, inf) -> 3
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
full_data['Fare'] = pd.cut(full_data['Fare'], bins=fare_edges, labels=False).astype(int)

# Ordinal age bands (right-closed intervals):
# (-inf, 16] -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, (64, inf) -> 4
# The column is kept as float, as it was before binning.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
full_data['Age'] = pd.cut(full_data['Age'], bins=age_edges, labels=False).astype(float)

# Drop the raw columns that have been replaced by engineered features.
full_data = full_data.drop(columns=['Cabin', 'SibSp', 'Parch', 'Ticket', 'Name'])

In [None]:
# Inspect the combined, feature-engineered frame built from train and test.
full_data

In [None]:
# Rows whose id came from the test CSV; computed once and reused for both splits.
is_test_row = full_data['PassengerId'].isin(PassengerId)

# Test rows carry no label, so the all-missing Survived column is removed.
# drop() returns a new frame, which avoids modifying a filtered slice in place.
test_df = full_data[is_test_row].drop(columns=["Survived"])

# Labelled rows; copied so later edits to X never touch full_data.
X = full_data[~is_test_row].copy()
y = X['Survived']

# Every column except the row identifier (Survived is still included here).
features = [col for col in X.columns if col != "PassengerId"]

In [None]:
# Pairwise correlations between the columns listed in `features`.
correlation_matrix = X[features].corr()

# Annotated heatmap drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix for X')
plt.show()


In [None]:
# One histogram (with KDE overlay) per column of X, laid out two per row.
num_cols = len(X.columns)
num_rows = (num_cols + 1) // 2  # Ensure enough rows for all columns
# squeeze=False keeps `axes` two-dimensional even when only one row is needed,
# so the grid can always be flattened the same way.
fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(14, 6 * num_rows), squeeze=False)
flat_axes = axes.ravel()  # row-major: left panel, right panel, next row, ...

for ax, col in zip(flat_axes, X.columns):
    # Histogram for the current column
    sns.histplot(data=X, x=col, bins=50, kde=True, ax=ax)
    ax.set_xlabel(col)
    ax.set_ylabel("Count")  # bar height is the number of rows in the bin
    ax.set_title(f"Histogram of {col}")

    # Skewness of the current column
    skewness = skew(X[col])
    print(f"Skewness {col}:", skewness)

# Hide the spare panel that is left over when the number of columns is odd.
for ax in flat_axes[num_cols:]:
    ax.set_visible(False)

# Adjust layout and spacing between subplots
plt.tight_layout()

# Show the plots
plt.show()

In [None]:
# Remove the label from the model inputs. Rebinding X (instead of inplace=True)
# together with errors="ignore" makes this cell safe to run more than once.
X = X.drop(columns=["Survived"], errors="ignore")

In [None]:
# LightGBM hyper-parameters; unpacked into lgb.LGBMClassifier further down.
lgbm_params = {
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": -1,
    # NOTE(review): per the LightGBM docs, row subsampling only takes effect when
    # subsample_freq (bagging_freq) is > 0, which is not set here — confirm intent.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_estimators": 1000,
    "seed": 42
}

# XGBoost hyper-parameters (counterpart of lgbm_params).
xgb_params = {
    "verbosity": 0,
    "booster": "gbtree",
    "eta": 0.05,
    "max_depth": None,  # NOTE(review): None likely means "use XGBoost's default depth", not "no limit" — confirm
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_estimators": 1000,
    "seed": 42
}

# Model input columns: every column currently in X except the row identifier.
features = [col for col in X.columns if col != "PassengerId"]


In [None]:
# Preview the feature columns that are passed to train_test_split below.
X[features]

In [None]:
# Hold out 10% of the labelled rows for a quick accuracy check.
X_train, X_test, y_train, y_test = train_test_split(
    X[features],
    y,
    test_size=0.1,
    random_state=42,
)

# LightGBM classifier configured from lgbm_params, fitted on the training part.
clf = lgb.LGBMClassifier(**lgbm_params)
clf.fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Feature-importance chart drawn on an explicitly created 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
lgb.plot_importance(clf, height=0.5, ax=ax)
plt.show()

In [None]:
# Same hold-out split as the LightGBM cell (identical test_size and random_state),
# repeated here so this cell can be run on its own.
X_train, X_test, y_train, y_test = train_test_split(X[features], y, test_size=0.1, random_state=42)

# Build the classifier from xgb_params so the settings defined for XGBoost are
# actually applied, mirroring how the LightGBM model is built from lgbm_params.
# Previously the model was created with library defaults and xgb_params was unused.
clf = xgb.XGBClassifier(**xgb_params)

# Fit the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the held-out data
y_pred = clf.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

xgb.plot_importance(clf, height=0.5)
plt.show()

In [None]:
# Predict with `clf`, i.e. whichever classifier was fitted last (the XGBoost
# model when the cells are run in order).
predictions = clf.predict(test_df[features]).astype(int)

# Build the submission directly instead of writing a column into test_df first:
# test_df is a filtered view of full_data, and assigning into it both mutates
# the feature frame and can trigger SettingWithCopyWarning.
output = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
output.to_csv("my_submission.csv", index=False)

In [None]:
# Display the submission frame that was written to my_submission.csv.
output