<a href="https://colab.research.google.com/github/anjalii-s/HR-Analytics-and-Attrition-Prediction-System/blob/main/HR_Analytics_%26_Attrition_Prediction_System_Jupyter_Widgets_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## HR ANALYTICS AND ATTRITION PREDICTION



The dataset used is the IBM HR Analytics Employee Attrition & Performance dataset, available at: https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset

Import necessary libraries

In [16]:
# Run this cell first to install all required packages
!pip install ipywidgets -q
# Import all required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully!")

‚úÖ All libraries imported successfully!


In [17]:
# Shell escape: asks Jupyter to enable the ipywidgets notebook extension
# (reported as "jupyter-js-widgets/extension" in this cell's output).
# NOTE(review): presumably only needed by the classic Notebook front end so the
# widgets below can render - confirm whether it is still required.
!jupyter nbextension enable --py widgetsnbextension


Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [18]:
# Shared notebook state. All three slots stay None until train_model()
# succeeds and stores the fitted model, scaler and encoders in them.
current_model = current_scaler = current_encoders = None

# Columns that preprocess_data() label-encodes.
categorical_columns = ['Department', 'JobRole', 'OverTime']

print("‚úÖ Global variables initialized!")

‚úÖ Global variables initialized!


Data processing and ML functions (Random Forest)

In [19]:
def preprocess_data(df, encoders=None, fit_encoders=True):
    """
    Label-encode the categorical columns of an HR data frame.

    Only columns listed in the module-level ``categorical_columns`` that are
    present in ``df`` are touched; every other column is passed through as-is.

    Args:
        df: Input DataFrame. It is copied, never modified.
        encoders: Optional dict mapping column name -> LabelEncoder. A new
            dict is created when omitted. The dict is updated in place
            whenever an encoder is fitted.
        fit_encoders: When True, a fresh encoder is fitted for every
            categorical column (training). When False, existing encoders are
            reused (prediction); a column without an encoder still gets a
            newly fitted one.

    Returns:
        Tuple ``(df_processed, encoders)``: the encoded copy of ``df`` and the
        encoder dict.
    """
    df_processed = df.copy()

    # Initialize encoders if not provided
    if encoders is None:
        encoders = {}

    for col in categorical_columns:
        if col not in df_processed.columns:
            continue

        # Work on the string form throughout, so the membership test below and
        # the encoder see exactly the same values (NaN becomes 'nan', numbers
        # become their text form). Comparing strings against the raw column
        # would leave such values unreplaced and make transform() raise.
        values = df_processed[col].astype(str)

        if fit_encoders or col not in encoders:
            encoders[col] = LabelEncoder()
            df_processed[col] = encoders[col].fit_transform(values)
            continue

        encoder = encoders[col]
        # Categories never seen during fitting fall back to the first fitted
        # class. LabelEncoder sorts its classes, so this is the alphabetically
        # first label, not the most frequent one.
        fallback = encoder.classes_[0]
        values = values.where(values.isin(encoder.classes_), fallback)
        df_processed[col] = encoder.transform(values)

    return df_processed, encoders

def train_model(df):
    """
    Train a random-forest attrition classifier on the HR data.

    Args:
        df: DataFrame containing the feature columns plus an 'Attrition'
            target column.

    Returns:
        A status string: a success message with the accuracy, or an error
        message if any step raised. Errors are reported through the return
        value, never raised.

    Side effects:
        On success only, the fitted model, scaler and encoders are stored in
        the globals ``current_model``, ``current_scaler`` and
        ``current_encoders``. On failure the globals keep their previous
        values, so a half-built model is never exposed to the prediction code.

    Note:
        The reported accuracy is measured on the same rows the model was
        fitted on, so it is an in-sample figure and not an estimate of
        performance on unseen employees.
    """
    global current_model, current_scaler, current_encoders

    try:
        X = df.drop('Attrition', axis=1)
        y = df['Attrition']

        # Build everything in locals first; the globals are only replaced once
        # every step has succeeded.
        X_processed, encoders = preprocess_data(X)

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_processed)

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_scaled, y)

        # In-sample accuracy (see Note in the docstring).
        accuracy = accuracy_score(y, model.predict(X_scaled))

        current_model, current_scaler, current_encoders = model, scaler, encoders

        return f"‚úÖ Model trained successfully! Training Accuracy: {accuracy:.2%}"

    except Exception as e:
        return f"‚ùå Error training model: {str(e)}"

def predict_attrition_risk(age, monthly_income, department, job_role, overtime, job_satisfaction, years_at_company):
    """
    Predict attrition risk for a single employee.

    Args:
        age: Value for the 'Age' feature.
        monthly_income: Value for the 'MonthlyIncome' feature.
        department: Value for the 'Department' feature.
        job_role: Value for the 'JobRole' feature.
        overtime: Value for the 'OverTime' feature.
        job_satisfaction: Value for the 'JobSatisfaction' feature.
        years_at_company: Value for the 'YearsAtCompany' feature.

    Returns:
        A display string: the risk level with the model's probability for the
        predicted class, a prompt to train first when no model is stored, or
        an error message if prediction raised.
    """
    global current_model, current_scaler, current_encoders

    if current_model is None:
        return "‚ö†Ô∏è Please train a model first using the 'Model Training' tab!"

    try:
        # Create input dataframe
        input_data = pd.DataFrame({
            'Age': [age],
            'MonthlyIncome': [monthly_income],
            'Department': [department],
            'JobRole': [job_role],
            'OverTime': [overtime],
            'JobSatisfaction': [job_satisfaction],
            'YearsAtCompany': [years_at_company]
        })

        # The training tab may fit on a subset of these columns when the
        # dataset lacks some of them. Keep only the columns, in the order,
        # that the scaler was fitted on. Older scikit-learn versions do not
        # record the names, in which case all columns are sent as before.
        trained_columns = getattr(current_scaler, 'feature_names_in_', None)
        if trained_columns is not None:
            input_data = input_data[list(trained_columns)]

        # Encode with the training-time encoders, then scale
        input_processed, _ = preprocess_data(input_data, current_encoders, fit_encoders=False)
        input_scaled = current_scaler.transform(input_processed)

        # Make prediction
        prediction = current_model.predict(input_scaled)[0]
        probability = current_model.predict_proba(input_scaled)[0]

        risk_level = "üî¥ High Risk" if prediction == 1 else "üü¢ Low Risk"

        # predict_proba columns follow model.classes_, so look the predicted
        # class up there instead of assuming it sits at position 0 or 1.
        class_index = list(current_model.classes_).index(prediction)
        confidence = probability[class_index]

        return f"üéØ Attrition Risk: {risk_level}\nüìä Confidence: {confidence:.2%}"

    except Exception as e:
        return f"‚ùå Error making prediction: {str(e)}"

print("‚úÖ Data processing and ML functions defined!")

‚úÖ Data processing and ML functions defined!


# Define UI functions

In [21]:
def create_data_exploration_tab():
    """
    Create the data exploration tab.

    Returns:
        A ``widgets.VBox`` holding a heading, a short prompt, the
        "Explore Data" button and the output area the button writes into.

    Clicking the button loads the first readable candidate CSV from the
    working directory and prints its shape, head, summary statistics and
    column list. If an 'Attrition' column exists it also shows the class
    counts plus an attrition bar chart and an age histogram.
    """
    explore_button = widgets.Button(description="Explore Data", button_style='info')
    output = widgets.Output()

    def on_explore_click(b):
        # Everything is printed inside the Output widget so it appears in the tab.
        with output:
            clear_output()
            try:
                # Try multiple possible filenames
                filenames = ['HR_IBM_dataset.csv', 'HR-Employee-Attrition.csv', 'WA_Fn-UseC_-HR-Employee-Attrition.csv']
                df = None
                used_filename = ""
                load_errors = []

                for filename in filenames:
                    try:
                        df = pd.read_csv(filename)
                    except FileNotFoundError:
                        # Expected for all but one candidate name.
                        continue
                    except Exception as exc:
                        # Any other failure is remembered and reported below
                        # instead of being silently discarded.
                        load_errors.append((filename, exc))
                        continue
                    used_filename = filename
                    print(f"‚úÖ Loaded dataset: {filename}")
                    break

                if df is None:
                    print("‚ùå Could not find dataset file.")
                    print("Please make sure one of these files exists in your directory:")
                    for filename in filenames:
                        print(f"  - {filename}")
                    for filename, exc in load_errors:
                        print(f"  ! {filename} could not be read: {exc}")
                    return

                print("=== Dataset Overview ===")
                print(f"Dataset: {used_filename}")
                print(f"Shape: {df.shape} (rows: {df.shape[0]}, columns: {df.shape[1]})")

                print("\nFirst 5 rows:")
                display(df.head())

                print("\n=== Basic Statistics ===")
                display(df.describe())

                print("\n=== Column Information ===")
                print(f"Total columns: {len(df.columns)}")
                print("\nColumn names:")
                for i, col in enumerate(df.columns):
                    print(f"  {i+1:2d}. {col}")

                # Check for Attrition column
                if 'Attrition' in df.columns:
                    print("\n=== Attrition Distribution ===")
                    attrition_counts = df['Attrition'].value_counts()
                    print(attrition_counts)

                    # Two side-by-side panels on explicit axes
                    fig, (ax_attrition, ax_age) = plt.subplots(1, 2, figsize=(12, 5))

                    attrition_counts.plot(kind='bar', color=['skyblue', 'salmon'], rot=0, ax=ax_attrition)
                    ax_attrition.set_title('Attrition Distribution')
                    ax_attrition.set_xlabel('Attrition')
                    ax_attrition.set_ylabel('Count')

                    # Age distribution if available
                    if 'Age' in df.columns:
                        df['Age'].hist(bins=20, color='lightgreen', edgecolor='black', ax=ax_age)
                        ax_age.set_title('Age Distribution')
                        ax_age.set_xlabel('Age')
                        ax_age.set_ylabel('Frequency')
                    else:
                        ax_age.text(0.5, 0.5, 'Age column not available',
                                    ha='center', va='center', transform=ax_age.transAxes)
                        ax_age.set_title('Age Distribution - Not Available')

                    fig.tight_layout()
                    plt.show()

                    print("\n‚úÖ Data exploration completed successfully!")
                else:
                    print("‚ùå 'Attrition' column not found in dataset.")
                    print("Available columns:", list(df.columns))

            except Exception as e:
                print(f"‚ùå Error loading data: {e}")

    explore_button.on_click(on_explore_click)

    return widgets.VBox([
        widgets.HTML("<h3>üìä Data Exploration</h3>"),
        widgets.HTML("<p>Click below to explore the HR dataset:</p>"),
        explore_button,
        output
    ])

def create_model_training_tab():
    """
    Create the model training tab.

    Returns:
        A ``widgets.VBox`` holding a heading, a short prompt, the
        "Train Model" button and the output area the button writes into.

    Clicking the button loads the first readable candidate CSV, converts
    'Attrition' to 0/1, keeps whichever of the expected feature columns are
    present and passes that subset to ``train_model``.
    """
    train_button = widgets.Button(description="Train Model", button_style='success')
    output = widgets.Output()

    def on_train_click(b):
        # Everything is printed inside the Output widget so it appears in the tab.
        with output:
            clear_output()
            try:
                # Try multiple possible filenames
                filenames = ['HR_IBM_dataset.csv', 'HR-Employee-Attrition.csv', 'WA_Fn-UseC_-HR-Employee-Attrition.csv']
                df = None
                used_filename = ""
                load_errors = []

                for filename in filenames:
                    try:
                        df = pd.read_csv(filename)
                    except FileNotFoundError:
                        # Expected for all but one candidate name.
                        continue
                    except Exception as exc:
                        # Any other failure is remembered and reported below
                        # instead of being silently discarded.
                        load_errors.append((filename, exc))
                        continue
                    used_filename = filename
                    break

                if df is None:
                    print("‚ùå Could not find dataset file.")
                    for filename, exc in load_errors:
                        print(f"  ! {filename} could not be read: {exc}")
                    return

                print(f"üìÅ Using dataset: {used_filename}")

                # Convert Attrition to binary (Yes=1, No=0)
                if 'Attrition' in df.columns:
                    df['Attrition'] = (df['Attrition'] == 'Yes').astype(int)
                    print("‚úÖ Converted Attrition to binary values")
                else:
                    print("‚ùå 'Attrition' column not found in dataset.")
                    print("Available columns:", list(df.columns))
                    return

                # Select relevant features for prediction (target listed last)
                features = ['Age', 'MonthlyIncome', 'Department', 'JobRole',
                           'OverTime', 'JobSatisfaction', 'YearsAtCompany', 'Attrition']

                # Check which features actually exist
                available_features = [col for col in features if col in df.columns]
                missing_features = [col for col in features if col not in df.columns]

                print(f"‚úÖ Available features: {len(available_features)}/{len(features)}")
                for feature in available_features:
                    print(f"   - {feature}")

                if missing_features:
                    print(f"‚ö†Ô∏è Missing features: {missing_features}")

                # available_features includes the target, so 4 means
                # at least 3 predictors + 'Attrition'.
                if len(available_features) >= 4:
                    predictors = [f for f in available_features if f != 'Attrition']
                    df_subset = df[predictors + ['Attrition']].copy()

                    print(f"üîÑ Training model with {len(predictors)} features...")
                    result = train_model(df_subset)
                    print(result)

                    # train_model reports failure through its return string and
                    # current_model may still hold a model from an earlier run,
                    # so the success banner must also check the result text.
                    if current_model is not None and "Model trained successfully" in result:
                        print("\n‚úÖ Model trained successfully! You can now use the Prediction tab.")
                else:
                    print("‚ùå Not enough features available for training.")
                    print("Need at least 3 features + Attrition target.")

            except Exception as e:
                print(f"‚ùå Error: {e}")

    train_button.on_click(on_train_click)

    return widgets.VBox([
        widgets.HTML("<h3>ü§ñ Model Training</h3>"),
        widgets.HTML("<p>Click below to train the attrition prediction model:</p>"),
        train_button,
        output
    ])

def create_prediction_tab():
    """
    Create the prediction tab.

    Returns:
        A ``widgets.VBox`` with a heading, a prompt and a row containing the
        employee input controls on the left and the prediction output on the
        right.

    Clicking the button passes the current widget values to
    ``predict_attrition_risk`` and prints the returned text in the output area.
    """
    # Let each label take the width it needs; with the default fixed
    # description width the longer labels are cut off.
    label_style = {'description_width': 'initial'}

    # Input widgets
    age_slider = widgets.IntSlider(value=35, min=18, max=65, description='Age:', style=label_style)
    income_slider = widgets.IntSlider(value=5000, min=1000, max=20000, description='Monthly Income:',
                                      style=label_style)

    department_dropdown = widgets.Dropdown(
        options=['Research & Development', 'Sales', 'Human Resources'],
        value='Research & Development',
        description='Department:',
        style=label_style
    )

    # NOTE(review): 'Sales Representative' added on the understanding that the
    # IBM dataset contains this job role - confirm against the CSV. A role the
    # model was not trained on is mapped to a fallback class by preprocess_data.
    job_role_dropdown = widgets.Dropdown(
        options=['Research Scientist', 'Sales Executive', 'Human Resources',
                'Laboratory Technician', 'Manufacturing Director', 'Manager',
                'Research Director', 'Healthcare Representative',
                'Sales Representative'],
        value='Research Scientist',
        description='Job Role:',
        style=label_style
    )

    overtime_dropdown = widgets.Dropdown(
        options=['No', 'Yes'],
        value='No',
        description='OverTime:',
        style=label_style
    )

    job_satisfaction_slider = widgets.IntSlider(value=3, min=1, max=4, description='Job Satisfaction:',
                                                style=label_style)
    years_at_company_slider = widgets.IntSlider(value=5, min=0, max=40, description='Years at Company:',
                                                style=label_style)

    predict_button = widgets.Button(description="Predict Attrition Risk", button_style='warning')
    output = widgets.Output()

    def on_predict_click(b):
        # Print inside the Output widget so the result appears beside the controls.
        with output:
            clear_output()
            print("üîÑ Making prediction...")
            result = predict_attrition_risk(
                age_slider.value,
                income_slider.value,
                department_dropdown.value,
                job_role_dropdown.value,
                overtime_dropdown.value,
                job_satisfaction_slider.value,
                years_at_company_slider.value
            )
            print("\n" + "="*50)
            print("PREDICTION RESULT:")
            print("="*50)
            print(result)
            print("="*50)

    predict_button.on_click(on_predict_click)

    # Layout
    input_controls = widgets.VBox([
        widgets.HTML("<h4>Employee Information:</h4>"),
        age_slider,
        income_slider,
        department_dropdown,
        job_role_dropdown,
        overtime_dropdown,
        job_satisfaction_slider,
        years_at_company_slider,
        widgets.HTML("<br>"),
        predict_button
    ])

    return widgets.VBox([
        widgets.HTML("<h3>üéØ Attrition Prediction</h3>"),
        widgets.HTML("<p>Adjust the parameters below to predict attrition risk:</p>"),
        widgets.HBox([input_controls, output])
    ])

def create_hr_analytics_ui():
    """
    Create the main HR Analytics UI with tabs.

    Returns:
        A ``widgets.Tab`` whose three pages are, in order, the data
        exploration, model training and attrition prediction tabs.
    """
    tab = widgets.Tab()

    # (title, page) pairs keep each title next to the page it labels.
    pages = [
        ('üìä Data Exploration', create_data_exploration_tab()),
        ('ü§ñ Model Training', create_model_training_tab()),
        ('üéØ Attrition Prediction', create_prediction_tab()),
    ]

    tab.children = [page for _, page in pages]

    # set_title() is used instead of assigning ``tab.titles`` because the
    # ``titles`` trait only exists in newer ipywidgets releases; on older
    # ones the assignment has no effect and the tabs stay unlabelled.
    for index, (title, _) in enumerate(pages):
        tab.set_title(index, title)

    return tab

print("‚úÖ UI functions defined!")

‚úÖ UI functions defined!


In [22]:
# System status check
print("üîç Checking system status...")

# Items 1-4 are static: reaching this cell without an error means the earlier
# cells ran. They are not verified individually here.
print("1. Libraries imported: ‚úÖ")
print("2. Global variables initialized: ‚úÖ")
print("3. ML functions defined: ‚úÖ")
print("4. UI functions defined: ‚úÖ")

# Check for dataset files (same candidate names the UI tabs try to load)
import os
dataset_files = ['HR_IBM_dataset.csv', 'HR-Employee-Attrition.csv', 'WA_Fn-UseC_-HR-Employee-Attrition.csv']
found_files = [file for file in dataset_files if os.path.exists(file)]

if found_files:
    print(f"5. Dataset files found: ‚úÖ {found_files}")
else:
    print("5. Dataset files: ‚ùå Not found (you'll need to upload one)")

# Compare against None rather than testing truthiness: truth-testing an
# estimator object can invoke its __len__, which fails if the model is unfitted.
print(f"6. Current model trained: {'‚úÖ' if current_model is not None else '‚ùå (use Training tab)'}")

print("\n" + "="*70)

üîç Checking system status...
1. Libraries imported: ‚úÖ
2. Global variables initialized: ‚úÖ
3. ML functions defined: ‚úÖ
4. UI functions defined: ‚úÖ
5. Dataset files found: ‚úÖ ['HR_IBM_dataset.csv']
6. Current model trained: ‚ùå (use Training tab)



# Launch dashboard

In [23]:
# Print the welcome banner, then build and show the tabbed dashboard.
divider = "=" * 70
banner_lines = [
    "üöÄ Launching HR Analytics & Attrition Prediction System",
    divider,
    "SYSTEM OVERVIEW:",
    "‚Ä¢ üìä Data Exploration - Explore dataset statistics and distributions",
    "‚Ä¢ ü§ñ Model Training - Train machine learning model on HR data",
    "‚Ä¢ üéØ Attrition Prediction - Predict employee attrition risk",
    divider,
    "WORKFLOW: Explore Data ‚Üí Train Model ‚Üí Make Predictions",
    divider,
]
for banner_line in banner_lines:
    print(banner_line)

# The widget is kept in hr_ui so later cells can refer to it.
hr_ui = create_hr_analytics_ui()
display(hr_ui)

for closing_line in ("\n‚úÖ Application launched successfully!",
                     "üí° Use the tabs above to navigate through the system"):
    print(closing_line)

üöÄ Launching HR Analytics & Attrition Prediction System
SYSTEM OVERVIEW:
‚Ä¢ üìä Data Exploration - Explore dataset statistics and distributions
‚Ä¢ ü§ñ Model Training - Train machine learning model on HR data
‚Ä¢ üéØ Attrition Prediction - Predict employee attrition risk
WORKFLOW: Explore Data ‚Üí Train Model ‚Üí Make Predictions


Tab(children=(VBox(children=(HTML(value='<h3>üìä Data Exploration</h3>'), HTML(value='<p>Click below to explore ‚Ä¶


‚úÖ Application launched successfully!
üí° Use the tabs above to navigate through the system
