In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [3]:
df.nunique()

id                 165034
CustomerId          23221
Surname              2797
CreditScore           457
Geography               3
Gender                  2
Age                    71
Tenure                 11
Balance             30075
NumOfProducts           4
HasCrCard               2
IsActiveMember          2
EstimatedSalary     55298
Exited                  2
dtype: int64

In [4]:
to_remove = ['CustomerId', 'id', 'Surname']
df = df.drop(columns=to_remove)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [5]:
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

### Grouping and Splitting Data

In [6]:
target = ['Exited']
categorical = ['Geography', 'Gender']
numerical = [col for col in df if col not in (categorical+target)]

In [49]:
from sklearn.model_selection import train_test_split

X = df.drop(target, axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

#validation
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((132027, 10), (33007, 10), (132027,), (33007,))

In [50]:
X_train.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
dtype: object

### Preprocessing

In [51]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = encoder.fit_transform(X_train[categorical])
onehot_df = pd.DataFrame(onehot_encoded, columns=encoder.get_feature_names_out(), index=X_train.index)
X_train = X_train.drop(columns=categorical)
X_train = pd.concat([X_train, onehot_df], axis=1)

In [52]:
X_train.shape

(132027, 13)

In [53]:
encoder = OneHotEncoder(sparse_output=False)
onehot_encoded_test = encoder.fit_transform(X_test[categorical])
onehot_df_test = pd.DataFrame(onehot_encoded_test, columns=encoder.get_feature_names_out(), index=X_test.index)
X_test = X_test.drop(columns=categorical)
X_test = pd.concat([X_test, onehot_df_test], axis=1)

In [54]:
X_test.shape

(33007, 13)

In [55]:
columns = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
scaler = MinMaxScaler()
scaled1 = scaler.fit_transform(X_train[columns])
scaled2 = scaler.fit_transform(X_test[columns])
X_train.loc[:, columns] = scaled1
X_test.loc[:, columns] = scaled2

In [56]:
sc = MinMaxScaler()
X_train_sc = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test_sc = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [57]:
X_train_sc

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.700,0.175676,0.2,0.546753,0.000000,1.0,0.0,0.567785,0.0,0.0,1.0,0.0,1.0
1,0.408,0.148649,0.8,0.310526,0.000000,1.0,0.0,0.567785,0.0,0.0,1.0,0.0,1.0
2,0.378,0.162162,0.1,0.000000,0.000000,1.0,1.0,0.505835,1.0,0.0,0.0,0.0,1.0
3,0.498,0.229730,0.0,0.000000,0.333333,0.0,0.0,0.383166,0.0,0.0,1.0,1.0,0.0
4,0.596,0.216216,0.3,0.432179,0.333333,1.0,1.0,0.279256,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132022,0.658,0.283784,0.2,0.517564,0.000000,1.0,0.0,0.482213,0.0,1.0,0.0,0.0,1.0
132023,0.460,0.148649,0.6,0.418905,0.000000,0.0,1.0,0.560518,1.0,0.0,0.0,1.0,0.0
132024,0.482,0.337838,0.6,0.000000,0.333333,0.0,1.0,0.933307,1.0,0.0,0.0,0.0,1.0
132025,0.694,0.202703,0.8,0.000000,0.000000,0.0,0.0,0.367799,1.0,0.0,0.0,1.0,0.0


In [58]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(
    sampling_strategy='auto',  # samples only the minority class
    random_state=0,  # for reproducibility
    k_neighbors=5
)

X_train_sc, y_train = sm.fit_resample(X_train_sc, y_train)

In [59]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

best_features = SelectKBest(score_func=chi2, k=9)
fit = best_features.fit(X_train_sc, y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train_sc.columns)
feature_scores = pd.concat([dfcolumns, dfscores], axis=1)
feature_scores.columns = ['Specs', 'Score']
print(feature_scores.nlargest(10, 'Score'))

                Specs        Score
9   Geography_Germany  8724.831532
6      IsActiveMember  8138.826267
11      Gender_Female  3583.734596
12        Gender_Male  3441.859311
4       NumOfProducts  3134.997457
8    Geography_France  2666.689762
1                 Age  1724.680119
3             Balance  1406.374781
10    Geography_Spain   618.034251
5           HasCrCard    47.170781


In [44]:
# from sklearn.decomposition import PCA
# pca = PCA()
# pca.fit(X_train_sc)

# cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
# n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1  # Ubah 0.95 menjadi target varians Anda

In [45]:
# n_components

8

In [46]:
# pca_fix = PCA(n_components=8)
# X_train_sc = pca_fix.fit_transform(X_train_sc)
# X_train_sc.shape

(208180, 8)

### Modeling

In [41]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    accuracy_score
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [60]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

models = []
models.append(('RF', RandomForestClassifier()))
# models.append(('SVML', LinearSVC(max_iter=10000)))
results = []
names = []

for name, model in models:
  kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
  cv_results = cross_val_score(model, X_train_sc, y_train, cv=kfold, scoring='accuracy')
  results.append(cv_results)
  names.append(name)
  print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

RF: 0.899313 (0.001928)


#### KNN

In [83]:
knn = KNeighborsClassifier(weights='distance', metric='euclidean')

param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_sc, y_train)
best_k = grid_search.best_params_['n_neighbors']
print("Best k :", best_k)

model_knn = KNeighborsClassifier(n_neighbors=best_k, weights='distance', metric='euclidean')
model_knn.fit(X_train_sc, y_train)

prediction_knn = model_knn.predict(X_test_sc)

Best k : 3


In [84]:
print(classification_report(y_test, prediction_knn))

              precision    recall  f1-score   support

           0       0.90      0.80      0.85     26023
           1       0.48      0.67      0.56      6984

    accuracy                           0.77     33007
   macro avg       0.69      0.74      0.70     33007
weighted avg       0.81      0.77      0.79     33007



In [23]:

rf = RandomForestClassifier()
rf.fit(X_train_sc, y_train)

pred_rm = rf.predict(X_test_sc)

In [26]:
accuracy = accuracy_score(y_test, pred_rm)
print("Accuracy:", accuracy)

Accuracy: 0.8443663465325537


In [56]:
print(classification_report(y_test, pred_rm))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90     26023
           1       0.63      0.65      0.64      6984

    accuracy                           0.85     33007
   macro avg       0.77      0.77      0.77     33007
weighted avg       0.85      0.85      0.85     33007



#### SVM

In [68]:
svm_model = SVC()

# # param_grid = {
# #     'C': [0.1, 1, 10, 100],
# #     'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
# #     'gamma': ['scale', 'auto']
# # }

# # grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
# # grid_search.fit(X_res, yy_train_sc,
# # best_params = grid_search.best_params_
# # print("Best Hyperparameters:", best_params)


# model_svm = SVC(**best_params)

svm_model.fit(X_train_sc, y_train)

# # Lakukan prediksi pada data uji
svm_prediction = svm_model.predict(X_test_sc)

In [17]:
print(classification_report(y_test, svm_prediction))

#### Decision Tree

In [18]:
dt = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=5, min_samples_leaf=7)
dt.fit(X_train_sc, y_train)

prediction_dt = dt.predict(X_test_sc)

In [19]:
print(classification_report(y_test, prediction_dt))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91     26023
           1       0.73      0.51      0.60      6984

    accuracy                           0.86     33007
   macro avg       0.80      0.73      0.75     33007
weighted avg       0.85      0.86      0.85     33007



#### NB

In [20]:
nb = GaussianNB()
nb.fit(X_train_sc, y_train)
prediction_nb = nb.predict(X_test_sc)

In [21]:
print(classification_report(y_test, prediction_nb))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88     26023
           1       0.53      0.48      0.51      6984

    accuracy                           0.80     33007
   macro avg       0.70      0.68      0.69     33007
weighted avg       0.79      0.80      0.80     33007



### For Submission

In [28]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [29]:
id = df_test.pop('id')

In [30]:
to_remove2 = ['CustomerId', 'Surname']
df_test = df_test.drop(columns=to_remove2)
df_test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [31]:
df_test.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [32]:
encoder = OneHotEncoder(sparse_output=False)
onehot_encoded2 = encoder.fit_transform(df_test[categorical])
onehot_df2 = pd.DataFrame(onehot_encoded2, columns=encoder.get_feature_names_out(), index=df_test.index)
df_test = df_test.drop(columns=categorical)
df_test = pd.concat([df_test, onehot_df2], axis=1)

In [33]:
df_test_sc = pd.DataFrame(sc.fit_transform(df_test), columns=df_test.columns)

In [34]:
df_test_sc.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.472,0.067568,0.2,0.0,0.333333,0.0,1.0,0.804903,1.0,0.0,0.0,1.0,0.0
1,0.666,0.378378,0.2,0.0,0.0,1.0,0.0,0.362723,1.0,0.0,0.0,1.0,0.0
2,0.612,0.216216,0.7,0.0,0.333333,1.0,0.0,0.694419,1.0,0.0,0.0,1.0,0.0
3,0.662,0.243243,0.8,0.0,0.0,1.0,0.0,0.569654,1.0,0.0,0.0,0.0,1.0
4,0.804,0.27027,1.0,0.483318,0.0,1.0,0.0,0.697164,0.0,1.0,0.0,0.0,1.0


In [36]:
pred = rf.predict_proba(df_test_sc)[:, 1]

In [37]:
pred


array([0.05, 0.93, 0.01, ..., 0.01, 0.21, 0.31])

In [38]:
output = pd.DataFrame({'id': id,
                       'Exited': pred
                       })

In [39]:
output.to_csv('submission_rf2.csv', index=False)