In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling as ydp


In [None]:
# Notebook-wide configuration constants.

FILEPATH = '../data/'  # prefix prepended to the CSV file names when loading

TARGET = 'Exited'  # label column name
SEED = 42          # random seed handed to the train/validation split and the models

# Not referenced by the cells visible here -- presumably used elsewhere; TODO confirm.
FOLDS = 5
VER = 1

In [None]:
# Load the train and test tables from the CSV files under FILEPATH.
train = pd.read_csv(f'{FILEPATH}train.csv')
test = pd.read_csv(f'{FILEPATH}test.csv')

# Disabled: append the rows of original.csv (minus its 'RowNumber' column) to train.
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [None]:
def analyze_dataframe(df):
    """
    Print and display a section-by-section summary of a pandas DataFrame.

    Sections, in order: info, head, tail, transposed describe(), null counts
    per column, number of duplicated rows, unique-value counts per column,
    shape, and the column index. Headings and the shape line are written
    with ``print``; tables are rendered with ``display``, which is not
    imported in this file and is expected to be the one IPython provides
    inside a notebook kernel.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    def heading(title):
        # Every section starts with its title followed by an underline rule.
        print(title)
        print("______________________")

    heading("DataFrame Information:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in display(): that would render a stray "None".
    df.info(verbose=True, show_counts=True)
    print("\n")

    # (title, producer) pairs; a producer is only called once its section is
    # reached, so the work happens in the same order as the printed report.
    displayed_sections = (
        ("DataFrame Head:", df.head),
        ("DataFrame Tail:", df.tail),
        ("DataFrame Description:", lambda: df.describe().T),
        ("Number of Null Values:", lambda: df.isnull().sum()),
        ("Number of Duplicated Rows:", lambda: df.duplicated().sum()),
        ("Number of Unique Values:", df.nunique),
    )
    for title, produce in displayed_sections:
        heading(title)
        display(produce())
        print("\n")

    heading("DataFrame Shape:")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")

    heading("DataFrame Columns:")
    display(df.columns)
    

# Print the summary report for the training frame.
analyze_dataframe(train)

In [None]:
def vowel_consonant_count(word):
    """
    Count the vowels and consonants in ``word``.

    A vowel is one of the ASCII letters a/e/i/o/u in either case. Any other
    character for which ``str.isalpha()`` is true counts as a consonant;
    everything else (digits, punctuation, spaces) is ignored.

    Returns:
    tuple: ``(vowel_count, consonant_count)``
    """
    vowel_chars = "aeiouAEIOU"
    n_vowels = 0
    n_consonants = 0
    for ch in word:
        if ch in vowel_chars:
            n_vowels += 1
        elif ch.isalpha():
            n_consonants += 1
    return n_vowels, n_consonants

def create_surname_features(df):
    """
    Add surname-derived feature columns to ``df`` in place and return it.

    Columns written:
    - ``Length``: number of characters in the surname.
    - ``Initial``: first character of the surname (NaN when there is none).
    - ``Vowels`` / ``Consonants``: the two counts from ``vowel_consonant_count``.
    - ``Uniqueness``: distinct lower-cased characters divided by the length,
      or 0 for an empty surname.

    A missing surname (None/NaN) is treated like an empty string, so it
    yields Length 0, Vowels 0, Consonants 0, Uniqueness 0 and a NaN Initial
    instead of raising ``TypeError``.

    Parameters:
    df (pandas.DataFrame): Frame with a ``Surname`` column; modified in place.

    Returns:
    pandas.DataFrame: The same ``df`` object, now carrying the new columns.
    """
    # Fill before casting: astype(str) alone would turn NaN into the text "nan".
    surnames = df['Surname'].fillna('').astype(str)

    df['Length'] = surnames.apply(len)
    df['Initial'] = surnames.str[0]

    # Build each column from a plain list so an empty frame also works.
    counts = [vowel_consonant_count(name) for name in surnames]
    df['Vowels'] = [vowels for vowels, _ in counts]
    df['Consonants'] = [consonants for _, consonants in counts]

    df['Uniqueness'] = surnames.apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    return df

# Add the surname-derived columns to both frames: train first, then test.
train, test = [create_surname_features(frame) for frame in (train, test)]

In [None]:

def preprocess_data(df, cat_features, num_features, scaler, is_train=None):
    """
    One-hot encode, scale and tidy a feature frame for modelling.

    Parameters:
    df (pandas.DataFrame): Input frame.
    cat_features (list of str): Columns expanded with ``pd.get_dummies``.
    num_features (list of str): Columns passed through ``scaler``.
    scaler: Object exposing ``fit_transform`` and ``transform``
        (the notebook passes a scikit-learn ``StandardScaler``).
    is_train (bool or None):
        True  -- fit ``scaler`` on this frame, then drop rows containing
                 nulls and duplicated rows.
        False -- only apply the statistics already stored in ``scaler`` and
                 keep every row, so each input row can receive a prediction.
        None  -- (default) inferred from ``scaler``: an unfitted scaler means
                 training data, a fitted one means inference data. Pass a
                 fresh scaler, or ``is_train=True``, to force a re-fit.

    Returns:
    pandas.DataFrame: The encoded and scaled frame without the ``Surname``
    and ``CustomerId`` columns (when they were present).
    """
    if is_train is None:
        # scikit-learn estimators store what they learned in instance
        # attributes with a trailing underscore (mean_, scale_, ...); their
        # presence is used here as the "already fitted" signal.
        fitted_attrs = [
            name for name in getattr(scaler, '__dict__', {})
            if name.endswith('_') and not name.startswith('__')
        ]
        is_train = not fitted_attrs

    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features. Re-fitting on inference data would scale
    # it with different statistics than the data the models were trained on.
    if is_train:
        df[num_features] = scaler.fit_transform(df[num_features])
    else:
        df[num_features] = scaler.transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname', 'CustomerId'], axis=1, errors='ignore')

    # Row cleaning is a training-only step.
    if is_train:
        df = df.dropna()
        df = df.drop_duplicates()

    return df

In [None]:
# Columns passed to preprocess_data as categorical (one-hot encoded there).
# "Initial" is produced by create_surname_features.
cat_features = [
    "Geography", "Gender",
    "HasCrCard", "IsActiveMember", "NumOfProducts",
    "Initial",
]

# Columns passed to preprocess_data as numerical (run through the scaler).
# The last four are produced by create_surname_features.
num_features = [
    "CreditScore", "Age", "Tenure", "EstimatedSalary",
    "Uniqueness", "Vowels", "Consonants", "Length",
]


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import numpy as np
from sklearn.metrics import confusion_matrix

# Initialize StandardScaler (the same instance is passed to both preprocess_data calls)
scaler = StandardScaler()

# Preprocess the training data
train_df = preprocess_data(train, cat_features, num_features, scaler)

# Separate the features from the label, then hold out 30% of the rows for validation
X_full = train_df.drop([TARGET, 'id'], axis=1)
y_full = train_df[TARGET]
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.3, random_state=SEED)

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=SEED)
xgb_model.fit(X_train, y_train)

# Train CatBoost model
cat_model = CatBoostClassifier(random_seed=SEED, verbose=False)
cat_model.fit(X_train, y_train)

# Train LightGBM model
lgb_model = lgb.LGBMClassifier(random_state=SEED)
lgb_model.fit(X_train, y_train)

# Positive-class probabilities of each model on the validation split
xgb_pred = xgb_model.predict_proba(X_val)[:, 1]
cat_pred = cat_model.predict_proba(X_val)[:, 1]
lgb_pred = lgb_model.predict_proba(X_val)[:, 1]

# Simple averaging ensemble
ensemble_pred = (xgb_pred + cat_pred + lgb_pred) / 3

# Evaluate ensemble model
ensemble_auc_score = roc_auc_score(y_val, ensemble_pred)
print(f'Ensemble ROC AUC Score: {ensemble_auc_score}')

# Confusion matrix on the validation split, thresholding the averaged probability at 0.5
ensemble_pred_binary = np.where(ensemble_pred > 0.5, 1, 0)
confusion_mat = confusion_matrix(y_val, ensemble_pred_binary)
print("Confusion Matrix:")
print(confusion_mat)

# Preprocess the test data
test_df = preprocess_data(test, cat_features, num_features, scaler)

# pd.get_dummies runs separately on train and test, so a category present in
# only one of them yields different column sets. Align the test features to
# the training columns (same set, same order): dummies missing from test
# become all-zero columns and dummies unseen in training are dropped.
X_test = test_df.drop(['id'], axis=1).reindex(columns=X_train.columns, fill_value=0)

# Predictions for the test dataset using ensemble
test_xgb_pred = xgb_model.predict_proba(X_test)[:, 1]
test_cat_pred = cat_model.predict_proba(X_test)[:, 1]
test_lgb_pred = lgb_model.predict_proba(X_test)[:, 1]

ensemble_test_pred = (test_xgb_pred + test_cat_pred + test_lgb_pred) / 3


In [None]:
# Build the submission table: the test ids next to the ensemble probabilities.
test_pred_prob = ensemble_test_pred

submission_df = test_df[['id']].copy()
submission_df['Exited'] = test_pred_prob

# Write it without the index column, then preview the first rows.
submission_df.to_csv('submission.csv', index=False)
submission_df.head()