# Name: Aadit Harshal Baldha
# Batch: Fall 2024 (2nd Sem)
# CWID: 20029691

## Topic: ML: Fundamentals and Applications Project Individual Work for Cluster Group #0

### 3.3 Building the Training Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

# Cluster-ID classifier: tune an XGBoost model that predicts the 'Cluster'
# label of a row from its feature columns, then evaluate it on a held-out
# 20% split.
clustered_df = pd.read_csv('output/clustered_data.csv')

# Step 1: Prepare data
# 'Bankrupt?' is the target of the per-cluster models trained later in this
# notebook, so it is not known for new rows that must be routed to a cluster.
# Keeping it as a feature would leak the label into the router; drop it here.
# errors='ignore' keeps this cell working if the column is absent.
X = clustered_df.drop(columns=['Cluster', 'Bankrupt?'], errors='ignore')
y = clustered_df['Cluster']

# Single, sorted list of cluster ids reused for num_class and for the
# confusion-matrix axes so rows/columns and tick labels always agree.
cluster_labels = sorted(y.unique())

# Step 2: Train-test split (stratified so every cluster keeps its share)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Define the model
# `use_label_encoder` is deprecated and ignored by current XGBoost releases
# (it only triggers an "unused parameter" warning), so it is omitted.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(cluster_labels),
    eval_metric='mlogloss',
    random_state=42
)

# Step 4: Set up Grid Search Parameters
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Step 5: Cross-Validation Strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 6: Grid Search with Cross-Validation
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Step 7: Best Model (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_
classification_model_best_params = grid_search.best_params_
print("Best Parameters Found:", classification_model_best_params)

# Step 8: Predict on the held-out split
y_pred = best_model.predict(X_test)

# Step 9: Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# Passing `labels` fixes the row/column order to `cluster_labels`, so the
# heatmap tick labels cannot drift out of sync with the matrix.
cm = confusion_matrix(y_test, y_pred, labels=cluster_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=cluster_labels, yticklabels=cluster_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - XGBoost Prediction with GridSearchCV')
plt.show()


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Restrict the data to Cluster 0, then split it into features and target.
# 'Bankrupt?' is the target column for the per-cluster model.
cluster0_df = clustered_df[clustered_df['Cluster'] == 0]
# NOTE(review): X still contains the 'Cluster' column, which is constant (0)
# after the filter above - consider dropping it as well.
X = cluster0_df.drop(columns=['Bankrupt?'])
y = cluster0_df['Bankrupt?']


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE  # Importing SMOTE for oversampling

# Tune four base learners (decision tree, KNN, random forest, XGBoost) with
# grid search, oversample the training split with SMOTE, stack the tuned
# learners under a logistic-regression meta-model, and evaluate on a held-out
# 20% split. X and y must already be defined by an earlier cell.

# Train-test split (the test split is never resampled or used for tuning)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Cross-validation strategy (stratified folds keep class proportions)
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 1: Define base models and their grids, including class_weight for imbalance handling
param_grid_dt = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', None]  # Handling class imbalance
}

# Keys are prefixed with the pipeline step name created by make_pipeline.
param_grid_knn = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7],
    'kneighborsclassifier__weights': ['uniform', 'distance']
}

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7, 10],
    'min_samples_split': [2, 5],
    'class_weight': ['balanced', None]  # Handling class imbalance
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'scale_pos_weight': [1, 2, 3]  # Helps in handling imbalanced classes in XGBoost
}

# Step 2: Initialize models
# KNN is distance based, so it is wrapped in a scaling pipeline.
# `use_label_encoder` is deprecated and ignored by current XGBoost releases,
# so it is no longer passed.
models = {
    'dt': (DecisionTreeClassifier(random_state=42), param_grid_dt),
    'knn': (make_pipeline(StandardScaler(), KNeighborsClassifier()), param_grid_knn),
    'rf': (RandomForestClassifier(random_state=42), param_grid_rf),
    'xgb': (XGBClassifier(
        eval_metric='logloss',
        random_state=42
    ), param_grid_xgb)
}

# Step 3: Grid Search for each base model, including class weight adjustments
# Plain accuracy rewards predicting the majority class, which makes the
# class_weight / scale_pos_weight options above look worse than they are.
# Balanced accuracy (mean per-class recall) lets the search actually reward
# imbalance handling.
tuning_metric = 'balanced_accuracy'
best_estimators = {}

for name, (model, param_grid) in models.items():
    print(f"Tuning {name.upper()}...")
    gs = GridSearchCV(model, param_grid, cv=cv_strategy, n_jobs=-1, scoring=tuning_metric)
    gs.fit(X_train, y_train)
    best_estimators[name] = gs.best_estimator_
    print(f"Best params for {name.upper()}: {gs.best_params_}")
    print(f"Best CV Balanced Accuracy: {gs.best_score_:.4f}")
    print("-" * 60)

# Step 4: Apply SMOTE for oversampling the minority class in the training set
# NOTE(review): resampling happens before the stacking classifier's internal
# CV, so synthetic samples in one fold can be interpolated from real samples
# in another. Moving SMOTE inside an imblearn Pipeline per base estimator
# would remove that leakage - consider as a follow-up.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 5: Build the Stacking Classifier
# passthrough=True feeds the raw (unscaled) features to the meta-model along
# with the base predictions, so the logistic regression is wrapped in a
# scaler and given a larger iteration budget to converge reliably.
stack_model = StackingClassifier(
    estimators=list(best_estimators.items()),
    final_estimator=make_pipeline(
        StandardScaler(),
        LogisticRegression(class_weight='balanced', max_iter=1000),  # Handling imbalance in the final model
    ),
    cv=cv_strategy,
    n_jobs=-1,
    passthrough=True
)

# Step 6: Train Stacking Classifier with resampled data
stack_model.fit(X_train_resampled, y_train_resampled)

# Step 7: Evaluate on the untouched test split
y_pred = stack_model.predict(X_test)

# Classification Report
print("\nClassification Report for Stacked Model:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# Fix the row/column order to the classes the model was trained on and use
# them as tick labels so the axes are self-describing.
class_labels = list(stack_model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Stacked Model')
plt.show()


In [None]:
# Meta-learner for the stacking classifier built in the next cell. That cell
# uses passthrough=True, so this model also receives the raw feature columns;
# the default max_iter (100) is often too low to converge on such input, so
# the iteration budget is raised.
meta_model = LogisticRegression(max_iter=1000, random_state=42)


In [None]:
# `base_models` is not defined by any earlier cell, so this cell raised a
# NameError. Build it from the tuned estimators produced by the grid search
# above, as the (name, estimator) pairs StackingClassifier expects.
base_models = list(best_estimators.items())

# Stack the base models under `meta_model`; passthrough=True also hands the
# original features to the meta-model.
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
    n_jobs=-1
)
# Fit on the full cluster subset (X, y); this rebinds `stack_model` from the
# earlier cell.
stack_model.fit(X, y)

In [None]:
# Out-of-fold evaluation of the stacked model: every row of X is predicted by
# a copy of `stack_model` trained on the other four shuffled, stratified folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(estimator=stack_model, X=X, y=y, cv=cv, method='predict')

# Summary metrics computed from the out-of-fold predictions.
cm = confusion_matrix(y, y_pred)
acc = accuracy_score(y, y_pred)

print(f"Accuracy: {acc:.4f}")
for heading, payload in (
    ("\nClassification Report:", classification_report(y, y_pred)),
    ("\nConfusion Matrix:", cm),
):
    print(heading)
    print(payload)
