In [13]:

from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #to ignore warning  0=ALL, 1=INFO, 2=WARNING, 3=ERROR

import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping




In [14]:

X_train = pd.read_csv('/kaggle/input/x-train/X_train.csv')
y_train = pd.read_csv('/kaggle/input/y-train/y_train.csv')
X_test = pd.read_csv('/kaggle/input/test-data/X_test.csv')
y_test = pd.read_csv('/kaggle/input/test-data/y_test.csv')

### Use embedding layers for high-cardinality categorical features 

In [15]:
gender_col_index = 1 
gender_train = X_train.iloc[:, gender_col_index].astype('int32').values
gender_test  = X_test.iloc[:, gender_col_index].astype('int32').values

X_train_num = X_train.drop(X_train.columns[gender_col_index], axis=1).values
X_test_num  = X_test.drop(X_test.columns[gender_col_index], axis=1).values


### 1 - Use cluster memberships

In [16]:
kmeans = KMeans(n_clusters=3, random_state=42)
clusters_train = kmeans.fit_predict(X_train)
clusters_test = kmeans.predict(X_test)

X_train_clustered = np.hstack([X_train, clusters_train.reshape(-1,1)])
X_test_clustered = np.hstack([X_test, clusters_test.reshape(-1,1)])

### 2 -  Autoencoder for feature extraction  (Automatic feature engineering )

##### increase Accuracy , Recall  , Generalization  And decrease Overfitting 

In [17]:
input_dim = X_train_clustered.shape[1]
encoding_dim = 8 

# Encoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(32, activation='relu')(input_layer)
encoded = Dense(16, activation='relu')(encoded)
encoded = Dense(encoding_dim, activation='relu')(encoded)


# Decoder 
decoded = Dense(16, activation='relu')(encoded)
decoded = Dense(32, activation='relu')(decoded)
decoded = Dense(input_dim, activation='linear')(decoded)


autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train autoencoder (unsupervised)


# Early stopping
early_stop = EarlyStopping(
    monitor='val_loss',     
    patience=5,             
    restore_best_weights=True 
)

autoencoder.fit(
    X_train_clustered, 
    X_train_clustered,
    epochs=100,             
    batch_size=32,
    validation_data=(X_test_clustered, X_test_clustered),
    callbacks=[early_stop], 
    verbose=1
)


# Use encoder to get new features
encoder = Model(input_layer, encoded)
X_train_encoded = encoder.predict(X_train_clustered)
X_test_encoded = encoder.predict(X_test_clustered)

# Combine original + cluster + encoded features
X_train_final = np.concatenate([X_train, X_train_encoded], axis=1)
X_test_final = np.concatenate([X_test, X_test_encoded], axis=1)

print("Final feature size:", X_train_final.shape)



Epoch 1/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 282.8815 - val_loss: 3.2769
Epoch 2/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 2.2022 - val_loss: 0.3950
Epoch 3/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.2689 - val_loss: 0.3907
Epoch 4/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.2597 - val_loss: 0.3800
Epoch 5/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.2511 - val_loss: 0.3546
Epoch 6/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.2292 - val_loss: 0.3393
Epoch 7/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.2039 - val_loss: 0.3304
Epoch 8/100
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.2004 - val_loss: 0.3309
Epoch 9/100
[

In [None]:
model = Sequential()

# Hidden Layers
model.add(Dense(128, input_shape=(X_train_final.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))



model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


history = model.fit(
    X_train_final, y_train,
    validation_data=(X_test_final, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[es]
)


loss, accuracy = model.evaluate(X_test_final, y_test)
print("DNN Test Accuracy:", accuracy)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_final, y_train)
rf_preds = rf.predict(X_test_final)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_final, y_train)
xgb_preds = xgb.predict(X_test_final)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))

Epoch 1/50
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.7100 - loss: 0.5844 - val_accuracy: 0.4218 - val_loss: 1.1437
Epoch 2/50
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7307 - loss: 0.5597 - val_accuracy: 0.4489 - val_loss: 1.0189
Epoch 3/50
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7304 - loss: 0.5622 - val_accuracy: 0.5993 - val_loss: 0.8584
Epoch 4/50
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7339 - loss: 0.5538 - val_accuracy: 0.5944 - val_loss: 0.8982
Epoch 5/50
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7302 - loss: 0.5549 - val_accuracy: 0.6132 - val_loss: 0.8024
Epoch 6/50
[1m1143/1143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7352 - loss: 0.5523 - val_accuracy: 0.5509 - val_loss: 0.7715
Epoch 7/50
[1m1