In [1]:
# Standard library imports
import datetime
import json

# Third party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold as SKFold, train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll import scope
from xgboost import XGBClassifier


In [2]:
# global variables

SEED = 42

FOLDS = 5

FILEPATH = "../data/"

In [24]:
test = pd.read_csv(f"{FILEPATH}test.csv")
train = pd.read_csv(f"{FILEPATH}train.csv")

In [None]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")

    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")

    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)


analyze_dataframe(train)

In [25]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    df['isSenior'] = df['Age'] > 60
    
    # Create Credit Rating
    bins = [0,100, 300, 580, 670, 740, 800, 850]
    labels = ['Very Poor', 'Poor', 'Fair', 'Good', 'Very Good', 'Exceptional', 'Excellent']
    df['CreditRating'] = pd.cut(df['CreditScore'], bins=bins, labels=labels, include_lowest=True)
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [8]:
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Length,Initial,Vowels,Consonants,Uniqueness,isSenior
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0,14,O,6,8,0.642857,False
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0,13,O,7,6,0.692308,False
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0,5,H,2,3,0.8,False
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,3,K,2,1,1.0,False
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0,9,C,4,5,0.777778,False


In [26]:
def preprocess_data(df):
    # Drop unnecessary columns
    df = df.drop(["Surname", "CustomerId"], axis=1, errors="ignore")
    return df


In [None]:
analyze_dataframe(train)

In [27]:
# FEATURES
cat_features = [
    "Geography",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "NumOfProducts",
    "Initial",
    "isSenior",
    "CreditRating",
]
num_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "EstimatedSalary",
    "Uniqueness",
    "Vowels",
    "Consonants",
    "Length",
]

In [28]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
train_df = preprocess_data(train)

X = train_df.drop('Exited', axis=1)
y = train_df['Exited']

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Use ColumnTransformer to apply transformers to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
])

space = {
    #------XGB-------
    'xgb_max_depth': scope.int(hp.quniform('xgb_max_depth', 1, 6, 1)),
    'xgb_learning_rate': hp.loguniform('xgb_learning_rate', np.log(0.001), np.log(0.2)),
    'xgb_min_child_weight': hp.choice('xgb_min_child_weight', np.arange(1, 6,)),
    'xgb_subsample': hp.uniform('xgb_subsample', 0.5, 1.0),
    'xgb_n_estimators': scope.int(hp.quniform('xgb_n_estimators', 100, 1000, 1)),
    'xgb_colsample_bytree':  hp.uniform('xgb_colsample_bytree', 0.5, 1),
    'xgb_reg_alpha': scope.int(hp.uniform('xgb_reg_alpha', 0, 10)),
    'xgb_gamma': hp.loguniform('xgb_gamma', -10, 10),

    #-----RF---------
    'rf_n_estimators': scope.int(hp.quniform('rf_n_estimators', 100, 1000, 1)),
    'rf_max_depth': scope.int(hp.quniform('rf_max_depth', 1, 10, 1)),
    'rf_min_samples_split': hp.choice('rf_min_samples_split', np.arange(2, 11)),
    'rf_min_samples_leaf': hp.choice('rf_min_samples_leaf', np.arange(2, 11)),
}

def objective(params):

    xgb_clf = XGBClassifier(
        max_depth=params['xgb_max_depth'],
        learning_rate=params['xgb_learning_rate'],
        min_child_weight=params['xgb_min_child_weight'],
        subsample=params['xgb_subsample'],
        n_estimators=params['xgb_n_estimators'],
        colsample_bytree=params['xgb_colsample_bytree'],
        reg_alpha=params['xgb_reg_alpha'],
        gamma=params['xgb_gamma'],
        random_state=SEED
        )
    
    
    rf_clf = RandomForestClassifier(
        n_estimators=params['rf_n_estimators'],
        max_depth=params['rf_max_depth'],
        min_samples_split=params['rf_min_samples_split'],
        min_samples_leaf=params['rf_min_samples_leaf'],
        n_jobs=-1,
        random_state=SEED
        )
    

    stacking_clf = StackingClassifier(estimators=[
        ('xgb', xgb_clf),
        ('rf', rf_clf)
    ],
    final_estimator=LogisticRegression(),
    cv=5 
    )

    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', stacking_clf)])

    # cross-validation strategy 
    scores = cross_val_score(pipeline, X, y, cv=SKFold(FOLDS), scoring='roc_auc')
    print("Cross-validation scores:", scores)
    return {'loss': -np.mean(scores), 'status': STATUS_OK}
    
# Hyperopt optimization
trials = Trials()
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=1, trials=trials)


print("Best Hyperopt Results:f{best_params}")
print("writing to json file....")

def write_best_params_to_json(best_params):
    # Function to convert NumPy types to Python native types for JSON serialization
    def convert_types(obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return obj

    # Convert the best parameters using the conversion function
    best_params_converted = {k: convert_types(v) for k, v in best_params.items()}

    # Write best results to JSON file
    with open('hyperopt_results.json', 'w') as fp:
        json.dump(best_params_converted, fp, indent=4, sort_keys=True)

# Call the function with the best_params dictionary
write_best_params_to_json(best_params)


  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

Cross-validation scores:                             
[0.87921879 0.87397919 0.87650899 0.87550835 0.87449643]
100%|██████████| 1/1 [01:43<00:00, 103.54s/trial, best loss: -0.8759423508641557]
Best Hyperopt Results:f{best_params}
writing to json file....


In [31]:


best_params_formatted = {
    "xgb_max_depth": int(best_params["xgb_max_depth"]),
    "xgb_learning_rate": best_params["xgb_learning_rate"],
    "xgb_min_child_weight": int(best_params["xgb_min_child_weight"]),
    "xgb_subsample": best_params["xgb_subsample"],
    "xgb_n_estimators": int(best_params["xgb_n_estimators"]),
    "xgb_colsample_bytree": best_params["xgb_colsample_bytree"],
    "xgb_reg_alpha": best_params["xgb_reg_alpha"],
    "xgb_gamma": best_params["xgb_gamma"],
    
    "rf_n_estimators": int(best_params["rf_n_estimators"]),
    "rf_max_depth": int(best_params["rf_max_depth"]),
    "rf_min_samples_split": int(best_params["rf_min_samples_split"]),
    "rf_min_samples_leaf": int(best_params["rf_min_samples_leaf"]),
}


xgb_clf = XGBClassifier(
    max_depth=best_params_formatted["xgb_max_depth"],
    learning_rate=best_params_formatted["xgb_learning_rate"],
    min_child_weight=best_params_formatted["xgb_min_child_weight"],
    subsample=best_params_formatted["xgb_subsample"],
    n_estimators=best_params_formatted["xgb_n_estimators"],
    colsample_bytree=best_params_formatted["xgb_colsample_bytree"],
    reg_alpha=best_params_formatted["xgb_reg_alpha"],
    gamma=best_params_formatted["xgb_gamma"],
    random_state=SEED,
)

rf_clf = RandomForestClassifier(
    n_estimators=best_params_formatted["rf_n_estimators"],
    max_depth=best_params_formatted["rf_max_depth"],
    min_samples_split=best_params_formatted["rf_min_samples_split"],
    min_samples_leaf=best_params_formatted["rf_min_samples_leaf"],
    random_state=SEED,
)

stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb_clf),
        ('rf', rf_clf)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

final_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", stacking_clf)])
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Train the model on the training set
final_pipeline.fit(X_train, y_train)

# Predict probabilities on the validation set
y_val_pred = final_pipeline.predict_proba(X_val)[:, 1]

# Calculate the AUC score on the validation set
val_auc_score = roc_auc_score(y_val, y_val_pred)
print(f"Validation AUC: {val_auc_score:.6f}")

Validation AUC: 0.876633


In [None]:
test_pred_prob = final_pipeline.predict_proba(test)[:, 1]

submission_df = pd.DataFrame({"id": test["id"], "Exited": test_pred_prob})
submission_df.to_csv("submission.csv", index=False)
submission_df.head()
