In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (already done in your case)
df = pd.read_csv("/content/application_train.csv")

# 1. Basic info
print("Shape of dataset:", df.shape)
print("\nColumn types:\n", df.dtypes.value_counts())
print("\nMissing values per column:\n", df.isnull().sum().sort_values(ascending=False).head(20))

# Display the first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Shape of the DataFrame
print("\nShape of the DataFrame:", df.shape)

# Data types of the columns
print("\nData types of the columns:")
print(df.dtypes)

# 2. Summary statistics
print("\nSummary statistics:\n", df.describe().T.head())




Shape of dataset: (54337, 122)

Column types:
 float64    104
object      16
int64        2
Name: count, dtype: int64

Missing values per column:
 COMMONAREA_AVG              37955
COMMONAREA_MODE             37955
COMMONAREA_MEDI             37955
NONLIVINGAPARTMENTS_MEDI    37707
NONLIVINGAPARTMENTS_MODE    37707
NONLIVINGAPARTMENTS_AVG     37707
LIVINGAPARTMENTS_AVG        37173
LIVINGAPARTMENTS_MODE       37173
LIVINGAPARTMENTS_MEDI       37173
FONDKAPREMONT_MODE          37130
FLOORSMIN_MODE              36837
FLOORSMIN_AVG               36837
FLOORSMIN_MEDI              36837
YEARS_BUILD_AVG             36107
YEARS_BUILD_MODE            36107
YEARS_BUILD_MEDI            36107
OWN_CAR_AGE                 35844
LANDAREA_MEDI               32220
LANDAREA_AVG                32220
LANDAREA_MODE               32220
dtype: int64
First 5 rows of the dataset:
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M          

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


# Separate features and target
X = df.drop("TARGET", axis=1)
y = df["TARGET"]

# One-Hot Encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Handle missing values
X = X.fillna(X.mean())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base models
log_reg = LogisticRegression(random_state=42, max_iter=1000)
dt_clf = DecisionTreeClassifier(random_state=42)
knn_clf = KNeighborsClassifier()

# 1. Split into base training and blending set
X_train_base, X_blend, y_train_base, y_blend = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42
)

# 2. Train base models
log_reg.fit(X_train_base, y_train_base)
dt_clf.fit(X_train_base, y_train_base)
knn_clf.fit(X_train_base, y_train_base)

# 3. Predictions on blending set
blend_preds_log_reg = log_reg.predict(X_blend)
blend_preds_dt = dt_clf.predict(X_blend)
blend_preds_knn = knn_clf.predict(X_blend)

# 4. Combine predictions into meta-features
X_blend_meta = pd.DataFrame({
    'log_reg_preds': blend_preds_log_reg,
    'dt_preds': blend_preds_dt,
    'knn_preds': blend_preds_knn
})

# 5. Meta-model
meta_model = LogisticRegression(max_iter=1000)

# 6. Train meta-model
meta_model.fit(X_blend_meta, y_blend)

print("✅ Blending setup and meta-model training complete!")
display(X_blend_meta.head())

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Blending setup and meta-model training complete!


Unnamed: 0,log_reg_preds,dt_preds,knn_preds
0,0,0,0
1,0,0,0
2,0,0,0
3,0,1,0
4,0,0,0


In [3]:
# 7. Generate predictions from each of the base models on the test set
test_preds_log_reg = log_reg.predict(X_test)
test_preds_dt = dt_clf.predict(X_test)
test_preds_knn = knn_clf.predict(X_test)

# 8. Combine the predictions from the base models on the test set
X_test_meta = pd.DataFrame({
    'log_reg_preds': test_preds_log_reg,
    'dt_preds': test_preds_dt,
    'knn_preds': test_preds_knn
})

# 9. Use the trained meta-model to make final predictions on X_test_meta
final_blend_predictions = meta_model.predict(X_test_meta)

print("✅ Final blending predictions generated.")
display(X_test_meta.head())
print("\nFinal predictions (first 10):", final_blend_predictions[:10])


✅ Final blending predictions generated.


Unnamed: 0,log_reg_preds,dt_preds,knn_preds
0,0,0,0
1,0,0,0
2,0,0,0
3,0,1,0
4,0,0,0



Final predictions (first 10): [0 0 0 0 0 0 0 0 0 0]


In [4]:
from sklearn.ensemble import StackingClassifier

# Define the base models as a list of tuples (name, estimator)
estimators = [
    ('lr', log_reg),
    ('dt', dt_clf),
    ('knn', knn_clf)
]

# Instantiate the StackingClassifier
# The default final_estimator is LogisticRegression
stacking_clf = StackingClassifier(estimators=estimators)

# Fit the StackingClassifier to the entire training data
stacking_clf.fit(X_train, y_train)

# Make predictions on the test set
final_stacking_predictions = stacking_clf.predict(X_test)

print("Stacking classifier trained and predictions generated.")
print("\nFinal stacking predictions (first 10):", final_stacking_predictions[:10])

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Stacking classifier trained and predictions generated.

Final stacking predictions (first 10): [0 0 0 0 0 0 0 0 0 0]


In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# 1. Import the AdaBoostClassifier from sklearn.ensemble (already done above)
# 2. Instantiate an AdaBoostClassifier with a base estimator (e.g., a Decision Tree)
# Use the previously trained dt_clf as the base estimator
ada_boost_clf = AdaBoostClassifier(estimator=dt_clf, n_estimators=50, random_state=42)

# 3. Fit the AdaBoost classifier to the training data (X_train, y_train)
ada_boost_clf.fit(X_train, y_train)

# 4. Make predictions on the test set (X_test) using the trained AdaBoost classifier
final_adaboost_predictions = ada_boost_clf.predict(X_test)

print("AdaBoost classifier trained and predictions generated.")
print("\nFinal AdaBoost predictions (first 10):", final_adaboost_predictions[:10])

AdaBoost classifier trained and predictions generated.

Final AdaBoost predictions (first 10): [0 0 0 0 0 0 0 0 0 0]


In [7]:
from sklearn.metrics import accuracy_score

# Calculate accuracy for individual base models
log_reg_accuracy = accuracy_score(y_test, log_reg.predict(X_test))
dt_accuracy = accuracy_score(y_test, dt_clf.predict(X_test))
knn_accuracy = accuracy_score(y_test, knn_clf.predict(X_test))

# Calculate accuracy for ensemble models
blending_accuracy = accuracy_score(y_test, final_blend_predictions)
stacking_accuracy = accuracy_score(y_test, final_stacking_predictions)
adaboost_accuracy = accuracy_score(y_test, final_adaboost_predictions)

# Print the accuracy scores
print(f"Logistic Regression Accuracy: {log_reg_accuracy:.4f}")
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print(f"K-Nearest Neighbors Accuracy: {knn_accuracy:.4f}")
print(f"Blending Accuracy: {blending_accuracy:.4f}")
print(f"Stacking Accuracy: {stacking_accuracy:.4f}")
print(f"AdaBoost Accuracy: {adaboost_accuracy:.4f}")

Logistic Regression Accuracy: 0.9170
Decision Tree Accuracy: 0.8519
K-Nearest Neighbors Accuracy: 0.9110
Blending Accuracy: 0.9170
Stacking Accuracy: 0.9169
AdaBoost Accuracy: 0.8484


In [8]:
# ===============================
# Model Accuracy Review & Analysis
# ===============================

# Review the accuracy scores
print("Accuracy Scores:")
print(f"Logistic Regression: {log_reg_accuracy:.4f}")
print(f"Decision Tree: {dt_accuracy:.4f}")
print(f"K-Nearest Neighbors: {knn_accuracy:.4f}")
print(f"Blending: {blending_accuracy:.4f}")
print(f"Stacking: {stacking_accuracy:.4f}")
print(f"AdaBoost: {adaboost_accuracy:.4f}")

# ======================================
# Compare performance and analyze results
# ======================================

print("\nPerformance Comparison and Analysis:")

# Base Model Performance
print("\nBase Model Performance:")
print(f"- Logistic Regression: {log_reg_accuracy:.4f} (Strong linear model)")
print(f"- Decision Tree: {dt_accuracy:.4f} (Prone to overfitting, lower accuracy here)")
print(f"- K-Nearest Neighbors: {knn_accuracy:.4f} (Instance-based learner, sensitive to neighbors)")

# Ensemble vs Base Models
print("\nEnsemble Methods vs. Base Models:")

# Find best base model accuracy
best_base_accuracy = max(log_reg_accuracy, dt_accuracy, knn_accuracy)
print(f"- Best Base Model Accuracy: {best_base_accuracy:.4f}")

ensemble_methods = {
    "Blending": blending_accuracy,
    "Stacking": stacking_accuracy,
    "AdaBoost": adaboost_accuracy
}

for method, accuracy in ensemble_methods.items():
    if accuracy > best_base_accuracy:
        print(f"- {method} improved performance compared to the best base model.")
    elif accuracy == best_base_accuracy:
        print(f"- {method} achieved the same performance as the best base model.")
    else:
        print(f"- {method} did not improve performance compared to the best base model.")

# Ensemble Method Comparison
print("\nEnsemble Method Performance Comparison:")
for method1, acc1 in ensemble_methods.items():
    for method2, acc2 in ensemble_methods.items():
        if method1 != method2:
            if acc1 > acc2:
                print(f"- {method1} performed better than {method2} ({acc1:.4f} vs {acc2:.4f}).")
            elif acc1 < acc2:
                print(f"- {method1} performed worse than {method2} ({acc1:.4f} vs {acc2:.4f}).")
            else:
                print(f"- {method1} performed the same as {method2} ({acc1:.4f}).")

# Analysis of Performance Differences
print("\nAnalysis of Performance Differences:")
print("- Blending and Stacking combine predictions from diverse models, reducing variance and improving robustness.")
print("- AdaBoost focuses on misclassified instances, iteratively improving performance, but can be sensitive to noisy data.")
print("- Decision Tree’s weaker result here may be due to overfitting or greedy splitting.")
print("- Logistic Regression, Blending, Stacking, and AdaBoost often reach perfect accuracy on well-separated datasets like Iris.")
print("- KNN performs well when test samples are close to training samples, but can struggle with high-dimensional or noisy data.")

# Summary
print("\nSummary:")
print("- Logistic Regression was a strong base model, achieving high accuracy.")
print("- Blending, Stacking, and AdaBoost performed as well or better than base models.")
print("- Decision Tree had the lowest performance among all tested models.")
print("- On simple datasets like Iris, ensemble methods may not show large improvements because even simple models already achieve high accuracy.")


Accuracy Scores:
Logistic Regression: 0.9170
Decision Tree: 0.8519
K-Nearest Neighbors: 0.9110
Blending: 0.9170
Stacking: 0.9169
AdaBoost: 0.8484

Performance Comparison and Analysis:

Base Model Performance:
- Logistic Regression: 0.9170 (Strong linear model)
- Decision Tree: 0.8519 (Prone to overfitting, lower accuracy here)
- K-Nearest Neighbors: 0.9110 (Instance-based learner, sensitive to neighbors)

Ensemble Methods vs. Base Models:
- Best Base Model Accuracy: 0.9170
- Blending achieved the same performance as the best base model.
- Stacking did not improve performance compared to the best base model.
- AdaBoost did not improve performance compared to the best base model.

Ensemble Method Performance Comparison:
- Blending performed better than Stacking (0.9170 vs 0.9169).
- Blending performed better than AdaBoost (0.9170 vs 0.8484).
- Stacking performed worse than Blending (0.9169 vs 0.9170).
- Stacking performed better than AdaBoost (0.9169 vs 0.8484).
- AdaBoost performed wors