In [20]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling as ydp


In [21]:
#global variables

TARGET = 'Exited'

SEED = 42

FOLDS = 5

FILEPATH = '../data/'

In [94]:
test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
# train['isTrain'] = 1
# test['isTrain'] = 0
#train = pd.concat([train, test, original]).reset_index(drop=True).copy()
# concating train and original data
train = pd.concat([train, original]).reset_index(drop=True).copy()

In [95]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

DataFrame Information:
______________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175036 entries, 0 to 175035
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  float64
 1   CustomerId       175036 non-null  int64  
 2   Surname          175036 non-null  object 
 3   CreditScore      175036 non-null  int64  
 4   Geography        175035 non-null  object 
 5   Gender           175036 non-null  object 
 6   Age              175035 non-null  float64
 7   Tenure           175036 non-null  int64  
 8   Balance          175036 non-null  float64
 9   NumOfProducts    175036 non-null  int64  
 10  HasCrCard        175035 non-null  float64
 11  IsActiveMember   175035 non-null  float64
 12  EstimatedSalary  175036 non-null  float64
 13  Exited           175036 non-null  int64  
dtypes: float64(6), int64(5), object(3)
memory usage: 18.7+ MB


None



DataFrame Head:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0.0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1.0,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2.0,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3.0,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4.0,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0




DataFrame Tail:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
175031,,15584532,Liu,709,France,Female,36.0,7,0.0,1,0.0,1.0,42085.58,1
175032,,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
175033,,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
175034,,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0
175035,,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0




DataFrame Description:
______________________


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,165034.0,82516.5,47641.3565,0.0,41258.25,82516.5,123774.75,165033.0
CustomerId,175036.0,15691940.0,71428.662023,15565701.0,15632882.0,15690169.0,15756655.0,15815690.0
CreditScore,175036.0,656.1173,81.15183,350.0,597.0,659.0,710.0,850.0
Age,175035.0,38.17139,8.969523,18.0,32.0,37.0,42.0,92.0
Tenure,175036.0,5.019904,2.811125,0.0,3.0,5.0,7.0,10.0
Balance,175036.0,56678.82,62982.46607,0.0,0.0,0.0,120729.77,250898.09
NumOfProducts,175036.0,1.553069,0.54921,1.0,1.0,2.0,2.0,4.0
HasCrCard,175035.0,0.7511869,0.432327,0.0,1.0,1.0,1.0,1.0
IsActiveMember,175035.0,0.4987517,0.5,0.0,0.0,0.0,1.0,1.0
EstimatedSalary,175036.0,111861.0,50815.418008,11.58,73181.39,116969.73,154767.34,199992.48




Number of Null Values:
______________________


id                 10002
CustomerId             0
Surname                0
CreditScore            0
Geography              1
Gender                 0
Age                    1
Tenure                 0
Balance                0
NumOfProducts          0
HasCrCard              1
IsActiveMember         1
EstimatedSalary        0
Exited                 0
dtype: int64



Number of Duplicated Rows:
______________________


2



Number of Unique Values:
______________________


id                 165034
CustomerId          23421
Surname              2932
CreditScore           460
Geography               3
Gender                  2
Age                    73
Tenure                 11
Balance             30239
NumOfProducts           4
HasCrCard               2
IsActiveMember          2
EstimatedSalary     55581
Exited                  2
dtype: int64



DataFrame Shape:
______________________
Rows: 175036, Columns: 14


DataFrame Columns:
______________________


Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [100]:
# Function to calculate vowel and consonant count
def vowel_consonant_count(word):
    vowels = "aeiouAEIOU"
    vowel_count = sum(1 for char in word if char in vowels)
    consonant_count = sum(1 for char in word if char not in vowels and char.isalpha())
    return vowel_count, consonant_count

def create_surname_features(df):
    df['Length'] = df['Surname'].apply(len)
    df['Initial'] = df['Surname'].str[0]
    df[['Vowels', 'Consonants']] = df['Surname'].apply(lambda x: vowel_consonant_count(x)).tolist()
    df['Uniqueness'] = df['Surname'].apply(lambda x: len(set(x.lower())) / len(x) if x else 0)
    return df

train = create_surname_features(train)
test = create_surname_features(test)

In [101]:

def preprocess_data(df, cat_features, num_features, scaler):
    
    
    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features
    df[num_features] = scaler.fit_transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname','CustomerId'], axis=1, errors='ignore')
    df = df.dropna()
    df = df.drop_duplicates()

    return df

In [99]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

cat_features = ['Geography','Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts', 'Initial']
num_features = ['CreditScore', 'Age', 'Tenure', 'EstimatedSalary','Uniqueness', 'Vowels', 'Consonants', 'Length']


scaler = StandardScaler()


train_df = preprocess_data(train, cat_features, num_features, scaler)

# Split the training data
X_train = train_df.drop(['Exited', 'id'], axis=1) 
y_train = train_df['Exited']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED)

# Train the XGBoost model
model = xgb.XGBClassifier(objective='binary:logistic', seed=SEED)
model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred_prob = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_pred_prob)
print(f'ROC AUC Score: {auc_score}')

# Preprocess the test data (Do not fit the scaler again, just transform)
test_df = preprocess_data(test, cat_features, num_features, scaler)
X_test = test_df.drop(['id'], axis=1)  # Assuming 'id' is in your DataFrame

# Predict probabilities for the test dataset
test_pred_prob = model.predict_proba(X_test)[:, 1]

ROC AUC Score: 0.8894567070704424


In [93]:
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.052252
1,165035,0.897484
2,165036,0.014657
3,165037,0.309647
4,165038,0.310862
