# In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
from dash import Dash, dcc, html, Input, Output

# ------------------------------
# 1. Data Preparation
# ------------------------------

# Load the revised dataset (ensure the CSV file is in your working directory)
df = pd.read_csv("Revised_hr_data.csv")

# Remove exact duplicate rows
df.drop_duplicates(inplace=True)

# Fill missing MonthlyIncome values with the median income of the same JobRole
df['MonthlyIncome'] = df.groupby('JobRole')['MonthlyIncome'].transform(lambda x: x.fillna(x.median()))

# Outlier removal for MonthlyIncome using the IQR rule: keep only rows whose
# income lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
Q1 = df['MonthlyIncome'].quantile(0.25)
Q3 = df['MonthlyIncome'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Comparisons against NaN evaluate to False, so rows whose MonthlyIncome is
# still missing after the fill above are dropped here as well.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the original DataFrame
# (which would trigger pandas' SettingWithCopyWarning).
df = df[(df['MonthlyIncome'] >= lower_bound) & (df['MonthlyIncome'] <= upper_bound)].copy()

# Correct discrepancies between 'Overtime' and 'OvertimeHours'
# 1. If the flag column is absent, derive it from the hours.
if 'Overtime' not in df.columns:
    df['Overtime'] = np.where(df['OvertimeHours'] > 0, "Yes", "No")
# 2. Positive hours but flagged "No" -> flip the flag to "Yes".
df['Overtime'] = np.where((df['OvertimeHours'] > 0) & (df['Overtime'] == "No"), "Yes", df['Overtime'])
# 3. Zero or missing hours -> force the flag to "No".
df['Overtime'] = np.where((df['OvertimeHours'] == 0) | (df['OvertimeHours'].isna()), "No", df['Overtime'])

# ------------------------------
# 2. Interactive Dashboard with Multiple Visuals
# ------------------------------

# Dash application instance; the callbacks further down register against it.
app = Dash(__name__)

# Define a consistent color mapping for attrition status: "Yes" is red, "No" is blue.
color_map = {'Yes': 'red', 'No': 'blue'}

# Page layout: a title, two global multi-select filters, and one tab per chart.
# Every dcc.Graph id below is the Output target of a callback in this file.
app.layout = html.Div([
    html.H1("HR Attrition Dashboard", style={'textAlign': 'center'}),

    # Global Filters (multi-select for broader analysis)
    html.Div([
        html.Div([
            html.Label("Select Department:"),
            dcc.Dropdown(
                id='dept-dropdown',
                # dropna(): sorted() cannot compare NaN (float) with str labels,
                # so missing departments are excluded from the option list.
                options=[{'label': dept, 'value': dept}
                         for dept in sorted(df['Department'].dropna().unique())],
                value=[],  # empty list means no filtering initially
                multi=True,
                placeholder="All Departments"
            ),
        ], style={'width': '45%', 'display': 'inline-block', 'padding': '10px'}),

        html.Div([
            html.Label("Select Job Role:"),
            dcc.Dropdown(
                id='jobrole-dropdown',
                # Same NaN guard as for the department options above.
                options=[{'label': role, 'value': role}
                         for role in sorted(df['JobRole'].dropna().unique())],
                value=[],  # no filtering initially
                multi=True,
                placeholder="All Job Roles"
            ),
        ], style={'width': '45%', 'display': 'inline-block', 'padding': '10px'})
    ]),

    dcc.Tabs([
        dcc.Tab(label='Attrition Trend', children=[
            dcc.Graph(id='attrition-trend')
        ]),
        dcc.Tab(label='Engagement vs Burnout', children=[
            dcc.Graph(id='engagement-burnout')
        ]),
        dcc.Tab(label='Attrition by Department', children=[
            dcc.Graph(id='attrition-dept')
        ]),
        dcc.Tab(label='Salary Distribution', children=[
            dcc.Graph(id='salary-boxplot')
        ]),
        dcc.Tab(label='Age Distribution', children=[
            dcc.Graph(id='age-hist')
        ]),
        dcc.Tab(label='Correlation Heatmap (Attrition = Yes)', children=[
            dcc.Graph(id='corr-heatmap')
        ]),
        # This tab has its own metric selector in addition to the global filters.
        dcc.Tab(label='HR Metrics Box Plots', children=[
            html.Div([
                html.Label("Select HR Metric:"),
                dcc.Dropdown(
                    id='hr-metric-dropdown',
                    # 'value' entries are DataFrame column names used as the y-axis.
                    options=[
                        {'label': 'Burnout Risk', 'value': 'BurnoutRisk'},
                        {'label': 'Engagement Score', 'value': 'EngagementScore'},
                        {'label': 'Job Satisfaction', 'value': 'JobSatisfaction'},
                        {'label': 'Time Since Last Promotion', 'value': 'TimeSinceLastPromotion'},
                        {'label': 'Commute Time', 'value': 'CommuteTime'}
                    ],
                    value='BurnoutRisk',
                    clearable=False
                ),
                dcc.Graph(id='hr-boxplot')
            ], style={'padding': '20px'})
        ])
    ])
])

# Helper: Filter dataframe based on dropdown selections
def filter_df(dept_list, jobrole_list, data=None):
    """Return a filtered copy of the HR data for the given dropdown selections.

    Args:
        dept_list: Departments to keep. A falsy value (empty list or None)
            means "do not filter on Department".
        jobrole_list: Job roles to keep. A falsy value means "do not filter
            on JobRole".
        data: DataFrame to filter. Defaults to the module-level ``df`` so
            existing two-argument callers keep working.

    Returns:
        A new DataFrame containing only the matching rows; the input frame
        is never modified.
    """
    # Copy first so callers can freely mutate the result even when no filter
    # is applied.
    dff = (df if data is None else data).copy()
    if dept_list:
        dff = dff[dff['Department'].isin(dept_list)]
    if jobrole_list:
        dff = dff[dff['JobRole'].isin(jobrole_list)]
    return dff

# Callback: Attrition Trend by YearsAtCompany
@app.callback(
    Output('attrition-trend', 'figure'),
    Input('dept-dropdown', 'value'),
    Input('jobrole-dropdown', 'value'),
)
def update_attrition_trend(selected_depts, selected_roles):
    """Build the line chart of employee counts per tenure year, split by attrition.

    Args:
        selected_depts: Current value of the 'dept-dropdown' component.
        selected_roles: Current value of the 'jobrole-dropdown' component.

    Returns:
        The figure for the 'attrition-trend' graph.
    """
    filtered = filter_df(selected_depts, selected_roles)
    # One row per (YearsAtCompany, Attrition) pair with the number of rows in it.
    counts = (
        filtered
        .groupby(['YearsAtCompany', 'Attrition'])
        .size()
        .reset_index(name='Count')
    )
    return px.line(
        counts,
        x='YearsAtCompany',
        y='Count',
        color='Attrition',
        markers=True,
        title="Attrition Trend by Years at Company",
        color_discrete_map=color_map,
    )

# Callback: Engagement Score vs. Burnout Risk scatter plot
@app.callback(
    Output('engagement-burnout', 'figure'),
    Input('dept-dropdown', 'value'),
    Input('jobrole-dropdown', 'value'),
)
def update_engagement_burnout(selected_depts, selected_roles):
    """Build the EngagementScore vs. BurnoutRisk scatter plot, coloured by attrition.

    Args:
        selected_depts: Current value of the 'dept-dropdown' component.
        selected_roles: Current value of the 'jobrole-dropdown' component.

    Returns:
        The figure for the 'engagement-burnout' graph.
    """
    # Human-readable axis titles for the two plotted columns.
    axis_labels = {
        "EngagementScore": "Engagement Score",
        "BurnoutRisk": "Burnout Risk",
    }
    return px.scatter(
        filter_df(selected_depts, selected_roles),
        x='EngagementScore',
        y='BurnoutRisk',
        color='Attrition',
        labels=axis_labels,
        hover_data=['JobRole', 'Department'],
        title="Engagement Score vs. Burnout Risk",
        color_discrete_map=color_map,
    )

# Callback: Attrition count by Department (bar chart)
@app.callback(
    Output('attrition-dept', 'figure'),
    Input('dept-dropdown', 'value'),
    Input('jobrole-dropdown', 'value'),
)
def update_attrition_by_dept(selected_depts, selected_roles):
    """Build the grouped bar chart of employee counts per department and attrition status.

    Args:
        selected_depts: Current value of the 'dept-dropdown' component.
        selected_roles: Current value of the 'jobrole-dropdown' component.

    Returns:
        The figure for the 'attrition-dept' graph.
    """
    filtered = filter_df(selected_depts, selected_roles)
    # Number of rows for every (Department, Attrition) combination present.
    per_dept = filtered.groupby(['Department', 'Attrition']).size()
    per_dept = per_dept.reset_index(name='Count')
    return px.bar(
        per_dept,
        x='Department',
        y='Count',
        color='Attrition',
        barmode='group',
        title="Attrition Count by Department",
        color_discrete_map=color_map,
    )

# Callback: Salary Distribution Box Plot
@app.callback(
    Output('salary-boxplot', 'figure'),
    Input('dept-dropdown', 'value'),
    Input('jobrole-dropdown', 'value'),
)
def update_salary_boxplot(selected_depts, selected_roles):
    """Build the MonthlyIncome box plot per job role, split by attrition status.

    Args:
        selected_depts: Current value of the 'dept-dropdown' component.
        selected_roles: Current value of the 'jobrole-dropdown' component.

    Returns:
        The figure for the 'salary-boxplot' graph.
    """
    axis_labels = {"MonthlyIncome": "Monthly Income", "JobRole": "Job Role"}
    box_fig = px.box(
        filter_df(selected_depts, selected_roles),
        x='JobRole',
        y='MonthlyIncome',
        color='Attrition',
        labels=axis_labels,
        title="Monthly Income Distribution by Job Role",
        color_discrete_map=color_map,
    )
    # Show the mean in every box as well.
    box_fig.update_traces(boxmean=True)
    # Order the job-role categories with the 'total descending' rule.
    box_fig.update_layout(xaxis={'categoryorder': 'total descending'})
    return box_fig


# Callback: Age Distribution Histogram for Attrition = Yes
@app.callback(
    Output('age-hist', 'figure'),
    Input('dept-dropdown', 'value'),
    Input('jobrole-dropdown', 'value'),
)
def update_age_hist(selected_depts, selected_roles):
    """Build the Age histogram restricted to rows with Attrition == "Yes".

    Args:
        selected_depts: Current value of the 'dept-dropdown' component.
        selected_roles: Current value of the 'jobrole-dropdown' component.

    Returns:
        The figure for the 'age-hist' graph.
    """
    filtered = filter_df(selected_depts, selected_roles)
    # Keep only the rows marked as attrition.
    is_leaver = filtered["Attrition"] == "Yes"
    age_fig = px.histogram(
        filtered[is_leaver],
        x='Age',
        opacity=0.7,
        title="Age Distribution for Attrition = Yes",
    )
    # Paint every bar red, matching the "Yes" colour used by the other charts.
    age_fig.update_traces(marker_color='red')
    return age_fig


# Callback: Correlation Heatmap for Selected HR Metrics with Attrition = Yes
@app.callback(
    Output('corr-heatmap', 'figure'),
    Input('dept-dropdown', 'value'),
    Input('jobrole-dropdown', 'value')
)
def update_corr_heatmap(selected_depts, selected_roles):
    """Build the correlation heatmap of key HR metrics for Attrition == "Yes".

    Args:
        selected_depts: Current value of the 'dept-dropdown' component.
        selected_roles: Current value of the 'jobrole-dropdown' component.

    Returns:
        The figure for the 'corr-heatmap' graph. When the filtered selection
        has no "Yes" rows, a 1x1 placeholder heatmap with an explanatory
        title is returned instead.
    """
    dff = filter_df(selected_depts, selected_roles)
    # Filter for only rows where Attrition equals "Yes". The tab label and
    # the titles all describe the "Yes" group, so the filter must match them.
    dff_yes = dff[dff["Attrition"] == "Yes"]

    # Select a subset of key HR metrics
    metrics = ['AttritionRiskIndex', 'EngagementScore', 'JobSatisfaction',
               'BurnoutRisk', 'CommuteTime', 'TimeSinceLastPromotion', 'MonthlyIncome']

    if dff_yes.empty:
        # If no data is available, show a dummy heatmap
        fig = px.imshow(np.array([[np.nan]]), text_auto=True, aspect="auto",
                        title="No data available for Attrition = Yes")
        return fig

    # Pairwise correlation matrix between the selected metric columns.
    corr = dff_yes[metrics].corr()
    fig = px.imshow(corr, text_auto=True, aspect="auto", color_continuous_scale='RdBu_r',
                    title="Correlation Heatmap of Selected HR Metrics (Attrition = Yes)")
    return fig

# Callback: HR Metrics Box Plot (new tab)
@app.callback(
    Output('hr-boxplot', 'figure'),
    Input('hr-metric-dropdown', 'value'),
    Input('dept-dropdown', 'value'),
    Input('jobrole-dropdown', 'value'),
)
def update_hr_boxplot(metric, selected_depts, selected_roles):
    """Build the box plot of one HR metric, with one box per attrition status.

    Args:
        metric: Current value of the 'hr-metric-dropdown' component; used as
            the column plotted on the y-axis.
        selected_depts: Current value of the 'dept-dropdown' component.
        selected_roles: Current value of the 'jobrole-dropdown' component.

    Returns:
        The figure for the 'hr-boxplot' graph.
    """
    chart_title = f"{metric} by Attrition Status"
    return px.box(
        filter_df(selected_depts, selected_roles),
        x='Attrition',
        y=metric,
        color='Attrition',
        title=chart_title,
        color_discrete_map=color_map,
    )


# Run the Dash app (development server with debug mode enabled)
if __name__ == '__main__':
    # Dash 3 removed app.run_server in favour of app.run. Prefer run when it
    # exists and only fall back to run_server on older installs; the
    # short-circuit avoids touching the removed attribute on new versions.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)
