In [10]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold as SKF

In [14]:
#global variables

TARGET = 'Exited'

SEED = 42

FOLDS = 5

FILEPATH = '../data/'

In [15]:
test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')

In [None]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

In [16]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [20]:
def preprocess_data_for_catboost(df, cat_features, num_features, scaler=None):
    # Normalize numerical features if scaler is provided
    if scaler is not None:
        df[num_features] = scaler.transform(df[num_features])

    # Convert categorical features to string type
    for col in cat_features:
        df[col] = df[col].astype(str)

    # Drop unnecessary columns
    df = df.drop(['Surname', 'CustomerId'], axis=1, errors='ignore')
    df = df.dropna()
    df = df.drop_duplicates()
    return df

In [18]:
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    "Initial",
]
num_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "EstimatedSalary",
    "Uniqueness",
    "Vowels",
    "Consonants",
    "Length",
]

In [22]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler only on training data
train[num_features] = scaler.fit_transform(train[num_features])

# Preprocess data
train_df = preprocess_data_for_catboost(train, cat_features, num_features)
test_df = preprocess_data_for_catboost(test, cat_features, num_features, scaler)

# Split the training data
X_train = train_df.drop([TARGET, 'id'], axis=1, errors='ignore')
y_train = train_df[TARGET]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED)

# Train the CatBoost model
cat_model = CatBoostClassifier(random_seed=SEED, verbose=1, cat_features=cat_features)
cat_model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred_prob = cat_model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_pred_prob)
print(f'ROC AUC Score: {auc_score}')

# Predict probabilities for the test dataset
test_pred_prob = cat_model.predict_proba(test_df.drop(['id'], axis=1, errors='ignore'))[:, 1]

Learning rate set to 0.078288
0:	learn: 0.6211793	total: 108ms	remaining: 1m 47s
1:	learn: 0.5637900	total: 211ms	remaining: 1m 45s
2:	learn: 0.5191840	total: 345ms	remaining: 1m 54s
3:	learn: 0.4840704	total: 459ms	remaining: 1m 54s
4:	learn: 0.4565928	total: 566ms	remaining: 1m 52s
5:	learn: 0.4333044	total: 678ms	remaining: 1m 52s
6:	learn: 0.4141948	total: 791ms	remaining: 1m 52s
7:	learn: 0.3988367	total: 926ms	remaining: 1m 54s
8:	learn: 0.3871759	total: 1.07s	remaining: 1m 57s
9:	learn: 0.3773770	total: 1.18s	remaining: 1m 57s
10:	learn: 0.3689913	total: 1.31s	remaining: 1m 57s
11:	learn: 0.3626487	total: 1.43s	remaining: 1m 57s
12:	learn: 0.3572603	total: 1.52s	remaining: 1m 55s
13:	learn: 0.3527062	total: 1.66s	remaining: 1m 57s
14:	learn: 0.3487752	total: 1.77s	remaining: 1m 55s
15:	learn: 0.3454703	total: 1.88s	remaining: 1m 55s
16:	learn: 0.3425228	total: 1.99s	remaining: 1m 55s
17:	learn: 0.3400972	total: 2.12s	remaining: 1m 55s
18:	learn: 0.3378065	total: 2.28s	remaining:

In [None]:
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()