In [14]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import json
from dash import Dash, html, dcc, Input, Output
import plotly.graph_objects as go
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
import catboost

# --- Google Drive ------------------------------------------------------------
# Mount Drive first: the hyperparameter JSON files opened later in this cell
# are stored under '/content/drive/My Drive'.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Manual-upload route, kept disabled (presumably for runtimes where the CSV is
# not already present under /content).
# from google.colab import files
# uploaded = files.upload()

# --- Raw data ----------------------------------------------------------------
# Read the HR dataset and print its shape plus the first rows as a sanity check.
DATASET_CSV = '/content/HR_comma_sep.csv'
df = pd.read_csv(DATASET_CSV)
print("Dataset shape:", df.shape)
print(df.head())

# Preprocessing: one-hot encode the categorical columns (dropping the first
# level of each) and separate the `left` target from the predictors.
df = pd.get_dummies(df, drop_first=True)
Y = df['left']                  # Target
X = df.drop(columns=['left'])   # Features

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=195,
)

# Scale features: the min/max statistics are learned on the training rows only
# and then applied unchanged to the test rows.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the minority class of the *training* data with SMOTE; the test
# split is left untouched. `smote_ratio` is passed as SMOTE's sampling_strategy.
from imblearn.over_sampling import SMOTE
smote_ratio = 0.5
smote = SMOTE(sampling_strategy=smote_ratio, random_state=42)
X_train_smote, Y_train_smote = smote.fit_resample(X_train_scaled, Y_train)

# Initialize Models
#
# Base estimators and the hyperparameter grids searched for each of them.
# Both dictionaries MUST share exactly the same keys: the grid search looks up
# `new_param_grids[model_name]` for every `model_name` in `new_models`, and the
# tuned parameters are later read back under those same keys (e.g. "LightGBM").
new_models = {
    "Random Forest": RandomForestClassifier(random_state=195),
    "XGBoost": xgb.XGBClassifier(objective="binary:logistic", random_state=195),
    "CatBoost": catboost.CatBoostClassifier(learning_rate=0.1, iterations=500, depth=6, verbose=0, random_state=195),
    # Key spelled "LightGBM" (no space) so it matches the grid below.
    "LightGBM": lgb.LGBMClassifier(random_state=195)
}
new_param_grids = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ['sqrt', 'log2', None]
    },
    "XGBoost": {
        "learning_rate": [0.01, 0.05, 0.1],
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_child_weight": [1, 5, 10],
        "gamma": [0, 0.1, 0.3],
        "subsample": [0.6, 0.8],
        "colsample_bytree": [0.6, 0.8]
    },
    "CatBoost": {
        "learning_rate": [0.01, 0.03, 0.05],
        "depth": [4, 6, 8],
        "l2_leaf_reg": [3, 5, 7],
        "border_count": [32, 64, 128]
    },
    "LightGBM": {
        "learning_rate": [0.01, 0.03, 0.05],
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "num_leaves": [15, 31, 50],
        "min_data_in_leaf": [20, 30, 40],
        "lambda_l1": [0, 0.1, 0.5],
        "lambda_l2": [0, 0.1, 0.5]
    }
}

# Fail fast if the two dictionaries ever drift apart again.
assert set(new_models) == set(new_param_grids), "new_models and new_param_grids must use identical keys"

# Hyperparameter search (disabled).
# The GridSearchCV run below is wrapped in a bare triple-quoted string, so
# Python evaluates it as a string literal and none of it executes. It documents
# how the tuned parameters were produced: a 5-fold shuffled KFold grid search
# scored on accuracy, fitted on the SMOTE-resampled training data, with the
# best parameters dumped to a JSON file on Drive.
# NOTE(review): the loop indexes new_param_grids by the keys of new_models;
# confirm both dictionaries use the same model names before re-enabling it.
"""
# Perform GridSearchCV for each new model
new_best_params = {}
new_results = []

kf = KFold(n_splits=5, shuffle=True, random_state=195)
for model_name, model in new_models.items():
    grid = GridSearchCV(model, new_param_grids[model_name], cv=kf, scoring='accuracy')
    grid.fit(X_train_smote, Y_train_smote)

    # Store best parameters and results
    new_best_params[model_name] = grid.best_params_
    best_model = grid.best_estimator_

    # Evaluate the best model
    train_accuracy = accuracy_score(Y_train_smote, best_model.predict(X_train_smote))
    test_accuracy = accuracy_score(Y_test, best_model.predict(X_test_scaled))

    new_results.append({
        "Model": model_name,
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy
    })

# Save the best hyperparameters for new models to a JSON file
new_params_path = '/content/drive/My Drive/best_superhyperparameters_classification_newmodels_assignment_2.json'
with open(new_params_path, "w") as f:
    json.dump(new_best_params, f)
"""
# Tuned hyperparameters for the ensemble / boosting ("new") models, read from Drive.
new_best_params_path = '/content/drive/My Drive/best_superhyperparameters_classification_newmodels_assignment_2.json'
with open(new_best_params_path, "r") as f:
    new_best_params = json.load(f)

# Rebuild the new models from the stored parameters. Note that the LightGBM
# parameters are stored under "LightGBM" while the model is labelled "Light GBM".
optimized_new_models = {}
optimized_new_models["Random Forest"] = RandomForestClassifier(
    random_state=195, **new_best_params["Random Forest"]
)
optimized_new_models["XGBoost"] = xgb.XGBClassifier(
    objective="binary:logistic", random_state=195, **new_best_params["XGBoost"]
)
optimized_new_models["CatBoost"] = catboost.CatBoostClassifier(
    verbose=0, random_state=195, **new_best_params["CatBoost"]
)
optimized_new_models["Light GBM"] = lgb.LGBMClassifier(
    random_state=195, **new_best_params["LightGBM"]
)

# Tuned hyperparameters for the classical ("old") models, also read from Drive.
best_params_path = '/content/drive/My Drive/best_hyperparameters_classification_oldmodels.json'
with open(best_params_path, "r") as f:
    best_params = json.load(f)

# Rebuild the old models. The k-NN parameters are stored under
# "K-Nearest Neighbors" while the model is labelled "k-Nearest Neighbors".
optimized_old_models = {}
optimized_old_models["Logistic Regression"] = LogisticRegression(**best_params["Logistic Regression"])
optimized_old_models["Decision Tree"] = DecisionTreeClassifier(
    random_state=195, **best_params["Decision Tree"]
)
optimized_old_models["Support Vector Machine"] = SVC(**best_params["Support Vector Machine"])
optimized_old_models["k-Nearest Neighbors"] = KNeighborsClassifier(**best_params["K-Nearest Neighbors"])

# Single registry of every model to train: old models first, then new ones.
all_models = {**optimized_old_models, **optimized_new_models}

# Fit every model on the SMOTE-resampled training data and score it on both
# the resampled training set and the untouched (scaled) test set.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

METRIC_COLUMNS = ["Model", "Train Accuracy", "Test Accuracy", "Precision", "Recall", "F1 Score"]

model_metrics = []
for model_name, model in all_models.items():
    model.fit(X_train_smote, Y_train_smote)

    # Predictions for both splits
    Y_train_pred = model.predict(X_train_smote)
    Y_test_pred = model.predict(X_test_scaled)

    # Accuracy on each split
    train_accuracy = accuracy_score(Y_train_smote, Y_train_pred)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)

    # Class-weighted precision / recall / F1 on the test split
    precision, recall, f1 = (
        scorer(Y_test, Y_test_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )

    # One row per model, in METRIC_COLUMNS order
    model_metrics.append([model_name, train_accuracy, test_accuracy, precision, recall, f1])

# Tabulate the rows for side-by-side comparison
model_metrics_df = pd.DataFrame(model_metrics, columns=METRIC_COLUMNS)
print(model_metrics_df)

# Persist the comparison table to Drive as JSON
metrics_path = '/content/drive/My Drive/classification_model_metrics.json'
model_metrics_df.to_json(metrics_path)

# Rank the models by test accuracy (best first) and keep the three leaders.
ranked_by_test_accuracy = model_metrics_df.sort_values(by="Test Accuracy", ascending=False)
top_3_models = ranked_by_test_accuracy.head(3)
print("Top 3 Models by Test Accuracy:")
print(top_3_models)

# Bar chart of the three leaders' test accuracy.
fig = px.bar(
    top_3_models,
    x='Model',
    y='Test Accuracy',
    title="Top 3 Models by Test Accuracy (Loaded Parameters)",
)
fig.show()

# Create the Dash application object.
app = Dash(__name__)

# Names of the three best models; they drive the checklist options and defaults.
top_3_models_list = top_3_models['Model'].to_list()

# Page header: title with names and matriculation numbers.
dashboard_title = html.H1(
    "Classification Dashboard - Majd al hallack (604516) & Azanguim Ndongmo Larry Nelson (603846)",
    style={'textAlign': 'center', 'color': '#E7E585'},
)

# Checklist used by all three pages to choose which models to show.
model_selector = dcc.Checklist(
    id='model-checklist',
    options=[dict(label=name, value=name) for name in top_3_models_list],
    value=top_3_models_list,  # everything selected by default
    inline=True,
)

# One tab per dashboard page; each tab holds an empty Div that the matching
# callback fills in.
TAB_SPECS = (
    ('Page 1: Data & Hyperparameters', 'page1-content'),
    ('Page 2: Performance Metrics', 'page2-content'),
    ('Page 3: Forecasting Results', 'page3-content'),
)
dashboard_tabs = dcc.Tabs([
    dcc.Tab(label=tab_label, children=[html.Div(id=content_id)])
    for tab_label, content_id in TAB_SPECS
])

# Assemble the layout: title, selector label + checklist, then the tabs.
app.layout = html.Div([
    dashboard_title,
    html.Label("Select Models to Compare:"),
    model_selector,
    dashboard_tabs,
])


# Callbacks for Page 1: Show Head of Data and Hyperparameters
def _model_key(name):
    """Return `name` lowercased with all whitespace removed, for loose matching."""
    return "".join(str(name).split()).lower()


def _find_hyperparams(model_name, *param_sources):
    """Look up the stored hyperparameters for `model_name`.

    Each mapping in `param_sources` is searched in order, first for an exact
    key match and then for a match ignoring case and whitespace. The loose
    match is needed because display names and stored keys differ
    ("Light GBM" vs "LightGBM", "k-Nearest Neighbors" vs "K-Nearest Neighbors").

    Returns the stored parameters, or the string "No hyperparameters found."
    when no source has an entry for the model.
    """
    wanted = _model_key(model_name)
    for source in param_sources:
        if model_name in source:
            return source[model_name]
        for stored_name, params in source.items():
            if _model_key(stored_name) == wanted:
                return params
    return "No hyperparameters found."


@app.callback(
    Output('page1-content', 'children'),
    Input('model-checklist', 'value')
)
def update_page1(models_selected):
    """Render page 1: the head of the pre-processed data and model hyperparameters.

    Args:
        models_selected: list of model names ticked in the checklist (may be
            empty or None).

    Returns:
        A list of Dash components. With no selection it contains only a prompt
        to select a model.
    """
    if not models_selected:
        return [html.H3("No models selected. Please select at least one model.")]

    content = [
        html.H3("Head of Pre-processed Data"),
        html.Pre(df.head().to_string()),
        html.H3("Hyperparameters for Selected Models"),
    ]
    for model in models_selected:
        # Search both parameter files: the new (ensemble) models and the old
        # (classical) models are stored in separate dictionaries.
        hyperparams = _find_hyperparams(model, new_best_params, best_params)
        content.append(html.P(f"{model}: {hyperparams}"))
    return content


# Callbacks for Page 2: Show Performance Metrics
@app.callback(
    Output('page2-content', 'children'),
    Input('model-checklist', 'value')
)
def update_page2(models_selected):
    """Render page 2: a metrics table and a test-accuracy bar chart.

    Args:
        models_selected: list of model names ticked in the checklist (may be
            empty or None).

    Returns:
        An html.Div with an explanatory message when nothing is selected or no
        metrics match; otherwise a list of [heading, table, graph].
    """
    # Nothing ticked: prompt instead of rendering an empty table.
    if not models_selected:
        return html.Div([
            html.H3("Performance Metrics"),
            html.P("Please select one or more models to see the performance metrics.")
        ])

    # model_metrics_df is read at call time, so it reflects whatever frame is
    # currently bound to that name.
    filtered_results = model_metrics_df[model_metrics_df['Model'].isin(models_selected)]

    # None of the selected names appear in the metrics frame.
    if filtered_results.empty:
        return html.Div([
            html.H3("Performance Metrics"),
            html.P("No performance data available for the selected models.")
        ])

    # Performance table: header row followed by one row per matching model.
    header_row = html.Tr([html.Th(col) for col in filtered_results.columns])
    data_rows = [
        html.Tr([html.Td(value) for value in row])
        for row in filtered_results.itertuples(index=False, name=None)
    ]
    table = html.Table([header_row] + data_rows)

    # First recorded test accuracy per model name.
    accuracy_by_model = {}
    for name, accuracy in zip(filtered_results['Model'], filtered_results['Test Accuracy']):
        accuracy_by_model.setdefault(name, float(accuracy))

    # Bar chart in checklist order. Models without a metrics row are skipped
    # rather than indexed blindly, which would raise when only some of the
    # selected names are present in the frame.
    fig = go.Figure()
    for model in models_selected:
        if model not in accuracy_by_model:
            continue
        fig.add_trace(go.Bar(name=model, x=['Accuracy'], y=[accuracy_by_model[model]]))

    fig.update_layout(barmode='group', title="Test Accuracy Comparison")

    return [
        html.H3("Performance Metrics"),
        table,
        dcc.Graph(figure=fig)
    ]

# Callbacks for Page 3: Forecasting Results
@app.callback(
    Output('page3-content', 'children'),
    Input('model-checklist', 'value')
)
def update_page3(models_selected):
    """Render page 3: per-row predictions of the selected models on the test set.

    For every selected model found in `all_models`, the fitted estimator
    predicts on `X_test_scaled`. Predictions are placed next to the true labels
    from `Y_test`, the number of models that misclassified each row is counted,
    and the rows with the most wrong classifications are shown first.

    Args:
        models_selected: list of model names ticked in the checklist (may be
            empty or None).

    Returns:
        A list of Dash components: a prompt when fewer than two models are
        selected, otherwise a heading, a short caption and the table.
    """
    if not models_selected or len(models_selected) < 2:
        return [html.H3("Please select at least two models for forecasting.")]

    # The test split has thousands of rows; only the most informative ones
    # (most disagreements with the truth) are rendered.
    max_rows = 20

    forecasting_data = pd.DataFrame({'True Value': np.asarray(Y_test).ravel()})
    for model_name in models_selected:
        fitted_model = all_models.get(model_name)
        if fitted_model is None:
            # Name not in the model registry: nothing to predict with.
            continue
        # ravel() guards against estimators that return a column vector.
        predictions = np.asarray(fitted_model.predict(X_test_scaled)).ravel()
        forecasting_data[f'{model_name} Prediction'] = predictions

    prediction_columns = forecasting_data.columns[1:]
    if len(prediction_columns) == 0:
        return [
            html.H3("Forecasting Results"),
            html.P("No predictions available for the selected models.")
        ]

    # Per-row count of models whose prediction differs from the true label.
    forecasting_data['Wrong Classifications'] = (
        forecasting_data[prediction_columns]
        .ne(forecasting_data['True Value'], axis=0)
        .sum(axis=1)
    )
    total_rows = len(forecasting_data)
    shown = forecasting_data.sort_values(
        by='Wrong Classifications', ascending=False, kind='stable'
    ).head(max_rows)

    # .values.tolist() yields plain Python scalars for the table cells.
    table = html.Table([
        html.Tr([html.Th(col) for col in shown.columns])
    ] + [
        html.Tr([html.Td(value) for value in row])
        for row in shown.values.tolist()
    ])

    caption = html.P(
        f"Showing the {len(shown)} test rows with the most wrong classifications "
        f"(out of {total_rows})."
    )
    return [html.H3("Forecasting Results"), caption, table]


# Run the app
if __name__ == '__main__':
    # Newer Dash releases expose the server entry point as `app.run`, and
    # `app.run_server` is deprecated there. Dash is installed unpinned in this
    # notebook, so prefer `run` and only fall back to `run_server` on older
    # versions that lack it.
    start_server = getattr(app, "run", None)
    if start_server is None:
        start_server = app.run_server
    start_server(debug=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset shape: (14999, 10)
   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.38             0.53               2                   157   
1                0.80             0.86               5                   262   
2                0.11             0.88               7                   272   
3                0.72             0.87               5                   223   
4                0.37             0.52               2                   159   

   time_spend_company  Work_accident  left  promotion_last_5years Department  \
0                   3              0     1                      0      sales   
1                   6              0     1                      0      sales   
2                   4              0     1                      0      sales   
3                   5              0     1 

<IPython.core.display.Javascript object>

In [24]:
# Try to improve further by combining the tuned tree ensembles in a
# soft-voting classifier, then compare it against each individual model.
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Same seed as the models built in the first cell, so reruns are repeatable.
VOTING_SEED = 195

# GridSearchCV results (best parameters from previous search)
rf_best_params = {"max_depth": 7, "max_features": None, "min_samples_leaf": 1, "min_samples_split": 2, "n_estimators": 100}
xgb_best_params = {"colsample_bytree": 0.8, "gamma": 0, "learning_rate": 0.1, "max_depth": 7, "min_child_weight": 1, "n_estimators": 200, "subsample": 0.6}
catboost_best_params = {"border_count": 128, "depth": 8, "l2_leaf_reg": 7, "learning_rate": 0.05}
lgbm_best_params = {"lambda_l1": 0, "lambda_l2": 0, "learning_rate": 0.05, "max_depth": 7, "min_data_in_leaf": 20, "n_estimators": 200, "num_leaves": 50}

# Initialize individual models with the best parameters
rf_model = RandomForestClassifier(**rf_best_params, random_state=VOTING_SEED)
xgb_model = xgb.XGBClassifier(**xgb_best_params, random_state=VOTING_SEED)
catboost_model = CatBoostClassifier(**catboost_best_params, silent=True, random_state=VOTING_SEED)
lgbm_model = lgb.LGBMClassifier(**lgbm_best_params, random_state=VOTING_SEED)

# Soft voting averages the predicted class probabilities. The weights favour
# CatBoost, then XGBoost, then LightGBM over the random forest, and follow
# the order of the `estimators` list.
voting_clf = VotingClassifier(estimators=[
    ('random_forest', rf_model),
    ('xgboost', xgb_model),
    ('catboost', catboost_model),
    ('lightgbm', lgbm_model)
], voting='soft', weights=[1, 4, 8, 2])

# Every estimator to train and evaluate: the four individual models plus the
# ensemble. Defined here so the cell does not depend on a `models` variable
# left over in the kernel. Labels reuse the display names from the first cell
# ("Light GBM") so dashboard filters on the "Model" column keep matching.
models = {
    "Random Forest": rf_model,
    "XGBoost": xgb_model,
    "CatBoost": catboost_model,
    "Light GBM": lgbm_model,
    "Voting Classifier": voting_clf,
}

# NOTE: `model_metrics` and `model_metrics_df` are rebound below; the dashboard
# callbacks defined earlier read `model_metrics_df` at call time, so they will
# show these results once this cell has run.
model_metrics = []

# Train and evaluate each model
for model_name, model in models.items():
    # Fit on the SMOTE-resampled training data
    model.fit(X_train_smote, Y_train_smote)

    # Predict on the (resampled) training set and the untouched test set
    Y_train_pred = model.predict(X_train_smote)
    Y_test_pred = model.predict(X_test_scaled)

    # Accuracy on both splits; class-weighted precision/recall/F1 on the test split
    train_accuracy = accuracy_score(Y_train_smote, Y_train_pred)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)
    precision = precision_score(Y_test, Y_test_pred, average='weighted')
    recall = recall_score(Y_test, Y_test_pred, average='weighted')
    f1 = f1_score(Y_test, Y_test_pred, average='weighted')

    # Append metrics to the list
    model_metrics.append([model_name, train_accuracy, test_accuracy, precision, recall, f1])

# Create a DataFrame for Metrics Comparison
model_metrics_df = pd.DataFrame(model_metrics, columns=["Model", "Train Accuracy", "Test Accuracy", "Precision", "Recall", "F1 Score"])

# Print the comparison of all models
print(model_metrics_df)


[LightGBM] [Info] Number of positive: 4552, number of negative: 9105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 893
[LightGBM] [Info] Number of data points in the train set: 13657, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333309 -> initscore=-0.693257
[LightGBM] [Info] Start training from score -0.693257
[LightGBM] [Info] Number of positive: 4552, number of negative: 9105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 893
[LightGBM] [Info] Number of data points in the train set: 13657, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333309 -> initscore=-0.693257
[LightGBM] [Info] Start training from score -0.693257
               Mod

In [None]:
!pip install dash
!pip install catboost