In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
import ydata_profiling as ydp

In [11]:
#global variables

TARGET = 'Exited'

SEED = 42

FOLDS = 5

VER = 1

FILEPATH = '../data/'

In [12]:
test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [None]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

In [13]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [14]:

def preprocess_data(df, cat_features, num_features, scaler):
    
    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features
    df[num_features] = scaler.fit_transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname','CustomerId'], axis=1, errors='ignore')
    df = df.dropna()
    df = df.drop_duplicates()

    return df

In [15]:
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    "Initial",
]
num_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "EstimatedSalary",
    "Uniqueness",
    "Vowels",
    "Consonants",
    "Length",
]


In [33]:
xgb_params = {
    "colsample_bytree": 0.7848336902999598,
    "gamma": 0.11104155666381403,
    "learning_rate": 0.37051256533289206,
    "max_depth": 2,
    "min_child_weight": 14.552961522899192,
    "n_estimators": 860,
    "reg_alpha": 5.4576130010073385,
    "reg_lambda": 6.580967199150385,
    "subsample": 0.9486302751512025,
}

cat_params = {
    "bagging_temperature": 0.3369838478985723,
    "depth": 3,
    "iterations": 335,
    "learning_rate": 0.10048364262308962,
    "random_strength": 74,
}
rf_params = {
    "bootstrap": 0,
    "criterion": 'log_loss',
    "max_depth": 11,
    "max_features": 'sqrt',
    "min_samples_leaf": 3,
    "min_samples_split": 7,
    "n_estimators": 9,
}


In [34]:

# Initialize StandardScaler
scaler = StandardScaler()

# Preprocess the test data
train_df = preprocess_data(train, cat_features, num_features, scaler)
test_df = preprocess_data(test, cat_features, num_features, scaler)
X_test = test_df.drop(['id'], axis=1)  # Assuming 'id' is in your DataFrame

# Split the training data
X_train = train_df.drop(['Exited', 'id'], axis=1) 
y_train = train_df['Exited']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED)

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(**xgb_params,objective='binary:logistic', seed=SEED)
xgb_model.fit(X_train, y_train)

# Train CatBoost model
cat_model = CatBoostClassifier(**cat_params,random_seed=SEED, verbose=False)
cat_model.fit(X_train, y_train)

# Train LightGBM model
lgb_model = lgb.LGBMClassifier(random_state=SEED)
lgb_model.fit(X_train, y_train)

# Train RF model
rf_model = RandomForestClassifier(**rf_params,random_state=SEED, verbose=False)
rf_model.fit(X_train, y_train)

# Ensemble predictions
xgb_pred = xgb_model.predict_proba(X_val)[:, 1]
#cat_pred = cat_model.predict_proba(X_val)[:, 1]
lgb_pred = lgb_model.predict_proba(X_val)[:, 1]
rf_pred = rf_model.predict_proba(X_val)[:, 1]

# Simple averaging ensemble
ensemble_pred = (xgb_pred + rf_pred + lgb_pred) / 3

# Evaluate ensemble model
ensemble_auc_score = roc_auc_score(y_val, ensemble_pred)
print(f'Ensemble ROC AUC Score: {ensemble_auc_score}')



# Predictions for the test dataset using ensemble
test_xgb_pred = xgb_model.predict_proba(X_test)[:, 1]
test_cat_pred = cat_model.predict_proba(X_test)[:, 1]
test_lgb_pred = lgb_model.predict_proba(X_test)[:, 1]
test_rf_pred = rf_model.predict_proba(X_test)[:, 1]

ensemble_test_pred = (test_xgb_pred + test_rf_pred + test_lgb_pred) / 3
# Calculate predictions for the validation dataset
ensemble_pred_binary = np.where(ensemble_pred > 0.5, 1, 0)

# Create the confusion matrix
confusion_mat = confusion_matrix(y_val, ensemble_pred_binary)
print("Confusion Matrix:")
print(confusion_mat)


[LightGBM] [Info] Number of positive: 24543, number of negative: 90980
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1001
[LightGBM] [Info] Number of data points in the train set: 115523, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212451 -> initscore=-1.310213
[LightGBM] [Info] Start training from score -1.310213




Ensemble ROC AUC Score: 0.8919079392823454
Confusion Matrix:
[[37413  1720]
 [ 4823  5555]]


In [32]:
test_pred_prob = ensemble_test_pred 
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.04302
1,165035,0.848069
2,165036,0.031819
3,165037,0.251908
4,165038,0.35933
