In [33]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold as SKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll import scope
import matplotlib.pyplot as plt
import ydata_profiling as ydp  

In [51]:
#global variables

VER = 1

SEED = 42

FOLDS = 5

FILEPATH = 'data/'

In [71]:
test = pd.read_csv(f'{FILEPATH}test.csv')
train = pd.read_csv(f'{FILEPATH}train.csv')
#original = pd.read_csv(f'{FILEPATH}original.csv').drop('RowNumber', axis=1)
# train['isTrain'] = 1
# test['isTrain'] = 0
# tt = pd.concat([train, test]).reset_index(drop=True).copy()

# concating train and original data
#train = pd.concat([train, original]).reset_index(drop=True).copy()

In [None]:
display(train.head())
display(test.head())

In [72]:
def analyze_dataframe(df):
    """
    Analyze a pandas DataFrame and provide a summary of its characteristics.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    print("DataFrame Information:")
    print("______________________")
    display(df.info(verbose=True, show_counts=True))
    print("\n")
    
    print("DataFrame Head:")
    print("______________________")
    display(df.head())
    print("\n")

    print("DataFrame Tail:")
    print("______________________")
    display(df.tail())
    print("\n")

    print("DataFrame Description:")
    print("______________________")
    display(df.describe().T)
    print("\n")

    print("Number of Null Values:")
    print("______________________")
    display(df.isnull().sum())
    print("\n")

    print("Number of Duplicated Rows:")
    print("______________________")
    display(df.duplicated().sum())
    print("\n")

    print("Number of Unique Values:")
    print("______________________")
    display(df.nunique())
    print("\n")

    print("DataFrame Shape:")
    print("______________________")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")
    
    print("DataFrame Columns:")
    print("______________________")
    display(df.columns)
    

analyze_dataframe(train)

DataFrame Information:
______________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


None



DataFrame Head:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0




DataFrame Tail:
______________________


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.0,1,1.0,1.0,131834.75,0
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.0,1,0.0,0.0,131834.45,0
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.0,1,1.0,1.0,127429.56,0
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.0,1,0.0,1.0,71173.03,0
165033,165033,15732798,Ulyanov,850,France,Male,31.0,1,0.0,1,1.0,0.0,61581.79,1




DataFrame Description:
______________________


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,165034.0,82516.5,47641.3565,0.0,41258.25,82516.5,123774.8,165033.0
CustomerId,165034.0,15692010.0,71397.816791,15565701.0,15633141.0,15690169.0,15756820.0,15815690.0
CreditScore,165034.0,656.4544,80.10334,350.0,597.0,659.0,710.0,850.0
Age,165034.0,38.12589,8.867205,18.0,32.0,37.0,42.0,92.0
Tenure,165034.0,5.020353,2.806159,0.0,3.0,5.0,7.0,10.0
Balance,165034.0,55478.09,62817.663278,0.0,0.0,0.0,119939.5,250898.09
NumOfProducts,165034.0,1.554455,0.547154,1.0,1.0,2.0,2.0,4.0
HasCrCard,165034.0,0.7539537,0.430707,0.0,1.0,1.0,1.0,1.0
IsActiveMember,165034.0,0.4977702,0.499997,0.0,0.0,0.0,1.0,1.0
EstimatedSalary,165034.0,112574.8,50292.865585,11.58,74637.57,117948.0,155152.5,199992.48




Number of Null Values:
______________________


id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64



Number of Duplicated Rows:
______________________


0



Number of Unique Values:
______________________


id                 165034
CustomerId          23221
Surname              2797
CreditScore           457
Geography               3
Gender                  2
Age                    71
Tenure                 11
Balance             30075
NumOfProducts           4
HasCrCard               2
IsActiveMember          2
EstimatedSalary     55298
Exited                  2
dtype: int64



DataFrame Shape:
______________________
Rows: 165034, Columns: 14


DataFrame Columns:
______________________


Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [73]:
from sklearn.preprocessing import StandardScaler

def preprocess_data(df, cat_features, num_features, scaler):
    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=cat_features)

    # Normalize numerical features
    df[num_features] = scaler.fit_transform(df[num_features])

    # Drop unnecessary columns
    df = df.drop(['Surname', 'CustomerId'], axis=1, errors='ignore')

    return df

In [74]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

cat_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts']
num_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']


scaler = StandardScaler()


train_df = preprocess_data(train, cat_features, num_features, scaler)

# Split the training data
X_train = train_df.drop(['Exited', 'id'], axis=1) 
y_train = train_df['Exited']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

# Train the XGBoost model
model = xgb.XGBClassifier(objective='binary:logistic', seed=SEED)
model.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred_prob = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_pred_prob)
print(f'ROC AUC Score: {auc_score}')

# Preprocess the test data (Do not fit the scaler again, just transform)
test_df = preprocess_data(test, cat_features, num_features, scaler)
X_test = test_df.drop(['id'], axis=1)  # Assuming 'id' is in your DataFrame

# Predict probabilities for the test dataset
test_pred_prob = model.predict_proba(X_test)[:, 1]

ROC AUC Score: 0.8893165916135434


In [36]:
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Exited': test_pred_prob
})
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.056287
1,165035,0.823677
2,165036,0.021485
3,165037,0.197354
4,165038,0.443128
