In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.ensemble import StackingClassifier
import joblib

In [2]:
# Read the pre-transformed training and test tables (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("transformed_train.csv", "transformed_test.csv")
)

In [3]:
# Labels are the 'target' column; the feature matrix is every other column
# except 'timestamp'
y = train_data['target']
X = train_data.drop(columns=['target', 'timestamp'])

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both the training and validation features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [6]:
# Base learner 1 of 3: logistic regression on the standardized features,
# with balanced class weights and an iteration cap of 1000
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
lr_model.fit(X_train_scaled, y_train)

In [7]:
# Base learner 2 of 3: a depth-limited random forest (100 trees) with balanced
# class weights, trained on all available cores
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train_scaled, y_train)

In [8]:
# Base learner 3 of 3: XGBoost binary classifier evaluated with log loss
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(X_train_scaled, y_train)

In [21]:
# Assemble the stacking ensemble: the three base learners feed an XGBoost
# meta-model, using 3-fold cross-validation and all available cores.
# NOTE(review): with cv=3 (rather than "prefit") StackingClassifier is documented
# to fit clones of the base estimators, so the standalone .fit() calls on
# lr_model / rf_model / xgb_model are presumably not reused here — confirm
# before relying on them.
stacking_base_estimators = [
    ('logistic', lr_model),
    ('random_forest', rf_model),
    ('xgboost', xgb_model),
]
stacking_meta_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)
stacking_model = StackingClassifier(
    estimators=stacking_base_estimators,
    final_estimator=stacking_meta_model,
    cv=3,
    n_jobs=-1,
)


In [22]:
# Fit the whole ensemble (base learners and meta-model) on the scaled training split
stacking_model.fit(X=X_train_scaled, y=y_train)

In [23]:
# Predict on the held-out validation split and print a per-class
# precision / recall / F1 report under a header line
y_val_pred_stacking = stacking_model.predict(X_val_scaled)
print(
    "Stacking Model with XGBoost Meta-Model Performance:",
    classification_report(y_val, y_val_pred_stacking),
    sep="\n",
)

Stacking Model with XGBoost Meta-Model Performance:
              precision    recall  f1-score   support

         0.0       0.53      0.86      0.66    222795
         1.0       0.51      0.16      0.25    201692

    accuracy                           0.53    424487
   macro avg       0.52      0.51      0.45    424487
weighted avg       0.52      0.53      0.46    424487



In [24]:
# Persist the fitted ensemble and the scaler that produced its training
# features, so both can be reloaded together later
joblib.dump(stacking_model, filename="stacking_model_xgboost_meta.pkl")
joblib.dump(scaler, filename="scaler.pkl")

['scaler.pkl']

In [25]:
# Build the test feature matrix by removing 'row_id' and 'timestamp', then
# scale it with the scaler that was fitted on the training split
X_test = test_data.drop(columns=['row_id', 'timestamp'])
X_test_scaled = scaler.transform(X_test)

In [26]:
# Predict class labels for the scaled test features with the fitted ensemble
y_test_pred_stacking = stacking_model.predict(X=X_test_scaled)

In [27]:
# Convert predictions to integers
# The validation report prints the class labels as 0.0 / 1.0, so the test
# predictions are presumably floats as well; cast them to int here.
# Re-running this cell is harmless: casting an int array to int changes nothing.
y_test_pred_stacking = y_test_pred_stacking.astype(int)

In [28]:
# Create the submission frame: each test row_id paired with the stacking
# model's integer class prediction for that row.
# Uses y_test_pred_stacking (the stacking predictions computed and cast to int
# in the preceding cells). The name `test_pred_blend_class` is not defined
# anywhere in this notebook, so referencing it raises NameError on a fresh
# kernel or, worse, silently submits stale predictions left over in kernel state.
submission_blend = pd.DataFrame({
    'row_id': test_data['row_id'],
    'target': y_test_pred_stacking
})

In [29]:
# Expand the submission to one row per row_id in 0..909616. A left merge keeps
# every id; ids without a prediction get target 0, and the column is cast
# back to int after the fill.
full_submission_blend = pd.DataFrame({'row_id': range(909617)})
submission_blend = (
    full_submission_blend
    .merge(submission_blend, on='row_id', how='left')
    .fillna({'target': 0})
    .astype({'target': 'int'})
)

In [30]:
# Write the final submission table to disk without the DataFrame index column
submission_blend.to_csv(path_or_buf="submission_stacking_Xgboost.csv", index=False)