In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold as SKF
import optuna

In [None]:
#global variables

TARGET = 'Exited'

SEED = 42

FOLDS = 5

FILEPATH = '../data/'

In [38]:
test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')

In [None]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

In [39]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [40]:
def preprocess_data_for_catboost(df, cat_features, num_features, scaler=None):
    # Normalize numerical features if scaler is provided
    if scaler is not None:
        df[num_features] = scaler.transform(df[num_features])

    # Convert categorical features to string type
    for col in cat_features:
        df[col] = df[col].astype(str)

    # Drop unnecessary columns
    df = df.drop(['Surname', 'CustomerId'], axis=1, errors='ignore')
    df = df.dropna()
    df = df.drop_duplicates()
    return df

In [41]:
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    "Initial",
]
num_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "EstimatedSalary",
    "Uniqueness",
    "Vowels",
    "Consonants",
    "Length",
]

In [42]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler only on training data
train[num_features] = scaler.fit_transform(train[num_features])

# Preprocess data
train_df = preprocess_data_for_catboost(train, cat_features, num_features)
test_df = preprocess_data_for_catboost(test, cat_features, num_features, scaler)

In [None]:
# Define the hyperparameter space
from hyperopt import hp, tpe, Trials, fmin
space = {
    'iterations': hp.choice('iterations', range(50, 400)),
    'depth': hp.choice('depth', range(4, 11)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'random_strength': hp.randint('random_strength', 100),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.0)
}

# Define the objective function
def objective(params):
    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(train_df.drop([TARGET, 'id'], axis=1, errors='ignore'), train_df[TARGET], test_size=0.3, random_state=42)

    # Initialize and train the CatBoost Classifier
    cat_model = CatBoostClassifier(**params, cat_features=cat_features, early_stopping_rounds=50, eval_metric='AUC', random_state=42, verbose=0,task_type="GPU",devices='0:1')
    cat_model.fit(X_train, y_train)

    # Predict and calculate ROC AUC score
    y_pred_prob = cat_model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred_prob)
    return -score  # Hyperopt minimizes the objective, so negate the score

# Run the optimization
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=5,  # Adjust the number of evaluations
    trials=Trials()
)

print("Best hyperparameters:", best)

# Catboost tuning with hyperopt

In [43]:
from hyperopt import hp, tpe, Trials, fmin
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
# Define the hyperparameter space
space = {
    'iterations': hp.choice('iterations', range(50, 400)),
    'depth': hp.choice('depth', range(4, 11)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'random_strength': hp.randint('random_strength', 100),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.0)
}

# Define the objective function with SKF
def objective(params):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []
    accuracy_scores = []

    for train_idx, val_idx in skf.split(train_df.drop([TARGET, 'id'], axis=1, errors='ignore'), train_df[TARGET]):
        X_train, X_val = train_df.iloc[train_idx].drop([TARGET, 'id'], axis=1), train_df.iloc[val_idx].drop([TARGET, 'id'], axis=1)
        y_train, y_val = train_df.iloc[train_idx][TARGET], train_df.iloc[val_idx][TARGET]

        cat_model = CatBoostClassifier(**params, cat_features=cat_features, early_stopping_rounds=50, eval_metric='AUC', random_state=42, verbose=0)
        cat_model.fit(X_train, y_train)

        y_pred_prob = cat_model.predict_proba(X_val)[:, 1]
        y_pred = cat_model.predict(X_val)

        auc_score = roc_auc_score(y_val, y_pred_prob)
        accuracy = accuracy_score(y_val, y_pred)
        auc_scores.append(auc_score)
        accuracy_scores.append(accuracy)

        print(f"Fold ROC AUC: {auc_score}, Accuracy: {accuracy}")

    average_auc_score = -np.mean(auc_scores)  # Negative because Hyperopt minimizes
    average_accuracy = np.mean(accuracy_scores)

    print(f"Average ROC AUC: {-average_auc_score}, Average Accuracy: {average_accuracy}")

    return average_auc_score  # Or you can return a combination or just accuracy

# Run the optimization
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=300,
    trials=Trials()
)

print("Best hyperparameters:", best)


Fold ROC AUC: 0.8790927233548224, Accuracy: 0.8601811736904293
Fold ROC AUC: 0.8798884304874544, Accuracy: 0.8596055382191656
Fold ROC AUC: 0.8830633633239854, Accuracy: 0.8625746053867361
Fold ROC AUC: 0.8821285023569342, Accuracy: 0.8634229102917563
Fold ROC AUC: 0.8764562447732349, Accuracy: 0.8608434830030903
Average ROC AUC: 0.8801258528592862, Average Accuracy: 0.8613255421182355
Fold ROC AUC: 0.890116702318889, Accuracy: 0.8644226982155301                         
Fold ROC AUC: 0.8902905368891446, Accuracy: 0.8645741812342836                        
Fold ROC AUC: 0.8915440182024191, Accuracy: 0.8676644348168571                        
Fold ROC AUC: 0.89159049220342, Accuracy: 0.8672705789680977                          
Fold ROC AUC: 0.8885594772306066, Accuracy: 0.8660546567290796                        
Average ROC AUC: 0.890420245368896, Average Accuracy: 0.8659973099927696              
Fold ROC AUC: 0.8782693372753919, Accuracy: 0.8575453691641167                        
Fo

In [44]:

# Prepare full training data
X_full_train = train_df.drop([TARGET, 'id'], axis=1, errors='ignore')
y_full_train = train_df[TARGET]

# Initialize the CatBoost model with the best hyperparameters
full_cat_model = CatBoostClassifier(**best, random_seed=SEED, cat_features=cat_features, eval_metric='AUC',verbose=0)

# Train the model on the full training set
full_cat_model.fit(X_full_train, y_full_train)

<catboost.core.CatBoostClassifier at 0x2531dd14f70>

In [45]:
X_test = test_df.drop(['id'], axis=1, errors='ignore')

test_pred_prob = full_cat_model.predict_proba(X_test)[:, 1]


submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.023142
1,165035,0.82692
2,165036,0.035154
3,165037,0.214574
4,165038,0.388887
