In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings("ignore")
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)

In [None]:
from dash import Dash, html, dcc
from jupyter_dash import JupyterDash
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate

In [None]:
# Load the HR dashboard dataset from a CSV file in the working directory,
# then preview the first rows.
data = pd.read_csv('hr_dashboard_data.csv')
data.head()

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data.duplicated().sum()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.head()

In [None]:
#General EDA of Dataset

In [None]:
# Top 10 employees ranked by the selected non-object column (Age is excluded from the choices).

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O' and x != 'Age'],
                 value='Projects Completed'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(variable):
    """Build a bar chart of the ten rows with the highest `variable`.

    Parameters
    ----------
    variable : str or None
        Column chosen in the dropdown; None when the dropdown has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        Bars with `variable` on the x-axis and 'Name' on the y-axis, coloured by
        'Feedback Score', plus a dashed vertical line at the mean of `variable`
        over the whole dataset.

    Raises
    ------
    PreventUpdate
        If no column is selected, so the previous figure stays on screen.
    """
    # A cleared dropdown sends None, which sort_values cannot use as a key.
    if variable is None:
        raise PreventUpdate

    top_ten = data.sort_values(variable, ascending=False).head(10)
    fig = px.bar(
        top_ten,
        x=variable,
        y='Name',
        text_auto=True,
        color='Feedback Score',
        title=f'Top 10 Employees with Highest {variable}',
        labels={'Name': 'Employee'}
    )
    fig.update_traces(showlegend=False)
    fig.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'}, height=500)
    # Reference line: mean across all rows, not only the ten shown.
    fig.add_vline(x=data[variable].mean(), line_dash='dash', line_color='grey')

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8080)

In [None]:
# Position / Gender / Department value counts as a pie chart.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns
                          if data[x].dtype == 'O' and x != 'Joining Date' and x != 'Name'],
                 value='Gender'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(variable):
    """Build a pie chart of how often each value of `variable` occurs.

    Parameters
    ----------
    variable : str or None
        Object-typed column chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        Pie chart with one slice per distinct value, labelled with percent and value.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    # Name both output columns explicitly: the column names produced by a bare
    # value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
    counts = (
        data[variable]
        .value_counts()
        .rename_axis(variable)
        .reset_index(name='count')
    )
    fig = px.pie(
        counts,
        names=variable,
        values='count',
        title=f"{variable} Distribution "
    )
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.update_layout(
        title={'x': 0.48, 'y': 0.92, 'xanchor': 'center'},
        width=900,
        height=550
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8081)

In [None]:
# Histogram (with a KDE overlay) for every column whose dtype is not object.
non_object_columns = [column for column in data.columns if data[column].dtype != 'O']
for column in non_object_columns:
    sns.histplot(data[column], kde=True)
    plt.title(f'{column}')
    plt.show()
        

In [None]:
# Filled KDE for every column whose dtype is not object.
for x in data.columns:
    if data[x].dtype != 'O':
        # Plot the column named in the title (the loop previously drew 'Age' every time).
        # `fill=` is the current seaborn spelling of the deprecated `shade=`.
        sns.kdeplot(x=data[x], fill=True)
        plt.title(f'{x}')
        plt.show()

In [None]:
data

In [None]:
# Interactive histogram of any non-object column.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O'],
                 value='Age'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(variable):
    """Build a histogram of `variable` over the whole dataset.

    Parameters
    ----------
    variable : str or None
        Column chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        Histogram with the bin counts printed on the bars.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    fig = px.histogram(
        data,
        x=variable,
        text_auto=True,
        title=f'{variable} Distribution'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8082)

In [None]:
# Salary histograms for different positions.

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Position Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=data.Position.unique(),
                 value='Analyst'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(position):
    """Build a histogram of 'Salary' for the rows whose 'Position' equals `position`.

    Parameters
    ----------
    position : str or None
        Position chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        Salary histogram with the bin counts printed on the bars.

    Raises
    ------
    PreventUpdate
        If no position is selected.
    """
    if position is None:
        raise PreventUpdate

    # Boolean mask instead of a string-built query, so a value containing
    # quote characters cannot break the expression.
    df = data[data['Position'] == position]

    fig = px.histogram(
        df,
        x='Salary',
        text_auto=True,
        title=f'Salary Distribution: {position}'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8083)

In [None]:
# Salary histograms for different departments.

app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Department Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=data.Department.unique(),
                 value='Marketing'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(department):
    """Build a histogram of 'Salary' for the rows whose 'Department' equals `department`.

    Parameters
    ----------
    department : str or None
        Department chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        Salary histogram with the bin counts printed on the bars.

    Raises
    ------
    PreventUpdate
        If no department is selected.
    """
    if department is None:
        raise PreventUpdate

    # Boolean mask instead of a string-built query, so a value containing
    # quote characters cannot break the expression.
    df = data[data['Department'] == department]

    fig = px.histogram(
        df,
        x='Salary',
        text_auto=True,
        title=f'Salary Distribution: {department}'
    )

    return fig

if __name__ == '__main__':
    # Port 8083 is already used by the position histogram app; 8091 is not used
    # by any other app in this notebook.
    app.run_server(mode='inline', debug=True, port=8091)

In [None]:
# Gender analysis

# Mean of every non-object column, per gender.
numeric_columns = [column for column in data.columns if data[column].dtype != 'O']
data.groupby('Gender')[numeric_columns].mean()

In [None]:
# Average of the selected non-object column for each gender.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O'],
                 value='Projects Completed'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(variable):
    """Build a bar chart of the mean of `variable` per gender.

    Parameters
    ----------
    variable : str or None
        Column chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar per 'Gender' value, with the mean printed on the bar.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    # numeric_only=True: since pandas 2.0 a bare mean() raises on string columns
    # instead of silently dropping them.
    df = data.groupby('Gender', as_index=False).mean(numeric_only=True)
    fig = px.bar(
        df,
        x='Gender',
        y=variable,
        text_auto=True,
        color='Gender',
        title=f'Avg {variable} of Each Gender'
    )
    fig.update_traces(showlegend=False)
    fig.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'}, height=500)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8084)

In [None]:
# Average of each numeric column per gender within each department.
# numeric_only=True: since pandas 2.0 a bare mean() raises on string columns.
data.groupby(['Gender', 'Department'], as_index=True).mean(numeric_only=True)

In [None]:
# Average of the selected column per department, for one gender at a time.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Gender Selection:'),
    dcc.RadioItems(id='items',
                   options=['Male', 'Female'],
                   value='Male'),
    html.Br(),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O'],
                 value='Age'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('items', 'value'), Input('dropdown', 'value'))
def plot(gender, variable):
    """Build a bar chart of the mean of `variable` per department for one gender.

    Parameters
    ----------
    gender : str
        Value chosen in the radio buttons ('Male' or 'Female').
    variable : str or None
        Column chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar per department, ordered from highest to lowest mean.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    # numeric_only=True: since pandas 2.0 a bare mean() raises on string columns.
    means = data.groupby(['Gender', 'Department'], as_index=False).mean(numeric_only=True)
    df = means[means['Gender'] == gender].sort_values(variable, ascending=False)

    fig = px.bar(
        df,
        x='Department',
        y=variable,
        color='Department',
        title=f"AVG {variable} among {gender} Employees across Different Departments",
        text_auto=True
    )
    fig.update_traces(showlegend=False)
    fig.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'}, height=500)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8085)
    

In [None]:
# Average of each numeric column per gender within each position.
# numeric_only=True: since pandas 2.0 a bare mean() raises on string columns.
data.groupby(['Gender', 'Position'], as_index=True).mean(numeric_only=True)

In [None]:
# Average of the selected column per position, for one gender at a time.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Gender Selection:'),
    dcc.RadioItems(id='items',
                   options=['Male', 'Female'],
                   value='Male'),
    html.Br(),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O'],
                 value='Age'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('items', 'value'), Input('dropdown', 'value'))
def plot(gender, variable):
    """Build a bar chart of the mean of `variable` per position for one gender.

    Parameters
    ----------
    gender : str
        Value chosen in the radio buttons ('Male' or 'Female').
    variable : str or None
        Column chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar per position, ordered from highest to lowest mean.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    # numeric_only=True: since pandas 2.0 a bare mean() raises on string columns.
    means = data.groupby(['Gender', 'Position'], as_index=False).mean(numeric_only=True)
    df = means[means['Gender'] == gender].sort_values(variable, ascending=False)

    fig = px.bar(
        df,
        x='Position',
        y=variable,
        color='Position',
        title=f"AVG {variable} among {gender} Employees across Different Positions",
        text_auto=True
    )
    fig.update_traces(showlegend=False)
    fig.update_layout(title={'x': 0.5, 'y': 0.88, 'xanchor': 'center'}, height=500)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8086)

In [None]:
# NOTE(review): `metric` is not defined in any visible cell of this notebook, so this
# call raised NameError on a fresh Restart & Run All. Disabled until the intended
# helper is restored.
# metric('median')

In [None]:
# Mean / median / sum of every numeric column in each department.
# Restrict to numeric columns: mean and median are undefined for string columns
# and raise in pandas 2.x.
numeric_columns = data.select_dtypes(include='number').columns.tolist()
data.groupby('Department', as_index=True)[numeric_columns].agg(['mean', 'median', 'sum'])

In [None]:
# Selected aggregate (mean / median / sum) of the selected column per department.
app = JupyterDash(__name__)

values = ['mean', 'median', 'sum']

app.layout = html.Div([
    html.H3("Department/Metric Selection:"),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O'],
                 value='Age'),
    html.Br(),
    dcc.RadioItems(id='items',
                   options=[{'label': x.title(), 'value': x} for x in values],
                   value='mean'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('items', 'value'), Input('dropdown', 'value'))
def plot(metric_value, variable):
    """Build a bar chart of `metric_value` applied to `variable` within each department.

    Parameters
    ----------
    metric_value : str
        Aggregation name chosen in the radio buttons ('mean', 'median' or 'sum').
    variable : str or None
        Column chosen in the dropdown; None when it has been cleared.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar per department, ordered from highest to lowest aggregate.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    df = (
        data.groupby('Department', as_index=False)
        .agg({variable: metric_value})
        .sort_values(variable, ascending=False)
    )

    fig = px.bar(
        df,
        y=variable,
        x='Department',
        title=f'{variable} ({metric_value.title()})',
        color='Department',
        text_auto=True
    )
    fig.update_traces(showlegend=False)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8087)

In [None]:
# Mean / median / sum of every numeric column per position within each department.
# Restrict to numeric columns: mean and median are undefined for string columns
# and raise in pandas 2.x.
numeric_columns = data.select_dtypes(include='number').columns.tolist()
data.groupby(['Department', 'Position'], as_index=True)[numeric_columns].agg(['mean', 'median', 'sum'])

In [None]:
# Selected aggregate of the selected column per position, inside one department.
app = JupyterDash(__name__)

values = ['mean', 'median', 'sum']

app.layout = html.Div([
    html.H3('Department Selection:'),
    dcc.RadioItems(id='items1',
                   options=data.Department.unique(),
                   value='Marketing'),
    html.H3('Metric Selection:'),
    dcc.RadioItems(id='items2',
                   options=[{'label': x.title(), 'value': x} for x in values],
                   value='mean'),
    html.Br(),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O'],
                 value='Age'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'),
              Input('items1', 'value'),
              Input('dropdown', 'value'),
              Input('items2', 'value'))
def plot(department, variable, metric):
    """Build a bar chart of `metric` applied to `variable` per position in one department.

    Parameters
    ----------
    department : str
        Department chosen in the first radio group.
    variable : str or None
        Column chosen in the dropdown; None when it has been cleared.
    metric : str
        Aggregation name chosen in the second radio group ('mean', 'median' or 'sum').

    Returns
    -------
    plotly.graph_objects.Figure
        One bar per position, ordered from highest to lowest aggregate.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if variable is None:
        raise PreventUpdate

    aggregated = (
        data.groupby(['Department', 'Position'], as_index=False)
        .agg({variable: metric})
    )
    # Boolean mask instead of a string-built query, so a department name
    # containing quote characters cannot break the expression.
    df = aggregated[aggregated['Department'] == department]

    fig = px.bar(
        df.sort_values(variable, ascending=False),
        y=variable,
        x='Position',
        color='Position',
        text_auto=True,
        title=f'{variable} ({metric.title()})',
    )
    fig.update_traces(showlegend=False)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8088)

In [None]:
#Joining Date Analysis 

In [None]:
def date(data, pivot=22):
    """Expand the two-digit years in ``data['Joining Date']`` to four-digit year strings.

    Each value is split on '-' and its second field is read as a two-digit year.
    Years strictly below `pivot` get the prefix '20'; all others get '19'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column 'Joining Date' whose values contain at least
        one '-' followed by a two-digit year.
    pivot : int, default 22
        First two-digit year that is assigned to the 1900s.

    Returns
    -------
    list of str
        One four-digit year string per row, in row order.
    """
    years = []
    for parts in data['Joining Date'].str.split('-'):
        two_digit_year = parts[1]
        century = '20' if int(two_digit_year) < pivot else '19'
        years.append(century + str(two_digit_year))

    return years

In [None]:
# Overwrite 'Joining Date' in place with the four-digit year strings returned by date().
# NOTE(review): not re-runnable — date() reads index 1 of each value split on '-', and the
# converted years contain no '-', so running this cell a second time raises IndexError.
data['Joining Date']=date(data)

In [None]:
# Selected aggregate of the selected column per joining year.
app = JupyterDash(__name__)

# Defined here so the cell does not depend on a variable left over from another cell.
values = ['mean', 'median', 'sum']

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown',
                 options=[x for x in data.columns if data[x].dtype != 'O' and x != 'Age'],
                 value='Salary'),
    html.Br(),
    dcc.RadioItems(id='items',
                   options=[{'label': x.title(), 'value': x} for x in values],
                   value='mean'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'), Input('items', 'value'))
def plot(column, metric):
    """Build a line chart of `metric` applied to `column` for each 'Joining Date' value.

    Parameters
    ----------
    column : str or None
        Column chosen in the dropdown; None when it has been cleared.
    metric : str
        Aggregation name chosen in the radio buttons ('mean', 'median' or 'sum').

    Returns
    -------
    plotly.graph_objects.Figure
        Line chart ordered by 'Joining Date'.

    Raises
    ------
    PreventUpdate
        If no column is selected.
    """
    if column is None:
        raise PreventUpdate

    df = (
        data.groupby('Joining Date', as_index=False)
        .agg({column: metric})
        .sort_values('Joining Date')
    )

    fig = px.line(
        df,
        x='Joining Date',
        y=column,
        # The title names the metric and column actually plotted; it used to
        # read "Avg ... Salary" regardless of the selection.
        title=f'{metric.title()} {column} by Joining Year'
    )

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8089)

In [None]:
#Correlation

In [None]:
# Pairwise correlation of the numeric columns only: since pandas 2.0 a bare
# DataFrame.corr() raises when the frame still contains string columns.
data.select_dtypes(include='number').corr()

In [None]:
sns.pairplot(data)

In [None]:
# Scatter plot of any two numeric columns, with an optional OLS trendline.
app = JupyterDash(__name__)

# Numeric columns are selected directly; calling data.corr() on the mixed-dtype
# frame raises in pandas 2.x and computed a full correlation matrix just for names.
numeric_columns = data.select_dtypes(include='number').columns.tolist()

app.layout = html.Div([
    html.H3('Column Selection:'),
    dcc.Dropdown(id='dropdown1',
                 options=numeric_columns,
                 value='Salary'),
    html.Br(),
    dcc.Dropdown(id='dropdown2',
                 options=numeric_columns,
                 value='Projects Completed'),
    html.Br(),
    dcc.RadioItems(id='items',
                   options=['Trendline Off', 'Trendline On'],
                   value='Trendline Off'),
    dcc.Graph(id='visual')
])

@app.callback(Output('visual', 'figure'),
              Input('dropdown1', 'value'),
              Input('dropdown2', 'value'),
              Input('items', 'value'))
def plot(x, y, switch):
    """Build a scatter plot of column `y` against column `x`.

    Parameters
    ----------
    x : str or None
        Column for the x-axis; None when the first dropdown has been cleared.
    y : str or None
        Column for the y-axis; None when the second dropdown has been cleared.
    switch : str
        'Trendline Off' for no trendline; any other value requests an 'ols' trendline.

    Returns
    -------
    plotly.graph_objects.Figure
        Scatter plot coloured by 'Gender' with marker size taken from 'Salary'.

    Raises
    ------
    PreventUpdate
        If either column is unselected.
    """
    if x is None or y is None:
        raise PreventUpdate

    fig = px.scatter(
        data,
        x=x,
        y=y,
        color='Gender',
        size='Salary',
        trendline=None if switch == 'Trendline Off' else 'ols',
        title=f'{x.title()} vs. {y.title()}'
    )
    fig.update_layout(width=800, height=600)

    return fig

if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=8090)

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [None]:
df = data.copy()

In [None]:
# Label-encode every object-typed column of df in place.
for x in df.columns:
    # Test the dtype of the frame being transformed (df), not of the source frame
    # (data), so the check and the assignment always refer to the same column state.
    if df[x].dtype == 'O':
        # The encoder is refit for each column; only the encoded values are kept.
        df[x] = label_encoder.fit_transform(df[x])

In [None]:
df.corr()

In [None]:
# Annotated heatmap of the correlation matrix of the encoded frame.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [None]:
#Machine Learning 

In [None]:
df.head()

In [None]:
def display_scores(scores):
    """Print the scores, their mean and their standard deviation.

    Each of the three lines is preceded and followed by a rule of '=' characters.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        Values to summarise.

    Returns
    -------
    None
    """
    rule = '==============================================='

    print(rule)
    print(f'Scores: {scores}', rule, sep='\n')
    print(f'Mean Score: {scores.mean()}', rule, sep='\n')
    print(f'Standard Deviation of Scores: {scores.std()}', rule, sep='\n')

    return None

In [None]:
from sklearn.model_selection import train_test_split

# Target is 'Salary'; the features are every other column of the encoded frame.
y = df['Salary']
X = df.drop(columns='Salary')

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [None]:
linear = LinearRegression()

linear.fit(X_train, y_train)

In [None]:
pred = linear.predict(X_test)

In [None]:
# Test-set mean squared error of the linear model (the value is the same in either argument order).
mean_squared_error(y_true=y_test, y_pred=pred)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model on the training split.
scores = cross_val_score(
    linear,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE; flip the sign and take the root to get RMSE per fold.
rand_rmse_in_sample_estimates = np.sqrt(-scores)
display_scores(rand_rmse_in_sample_estimates)

In [None]:
# Predicted vs. actual salary on the test split for the linear model.
sns.scatterplot(
    x=y_test,
    y=pred,
    alpha=0.6
)
# The stray trailing comma after xlabel (which built an unused tuple) is removed.
plt.xlabel('Actual Salary')
plt.ylabel('Pred Salary')
plt.title('Pred Salary vs. Actual Salary');

In [None]:
# Distribution of predicted vs. actual salary for the linear model.
# `fill=` replaces the deprecated `shade=`; labels and a legend tell the two curves apart.
sns.kdeplot(x=pred, fill=True, label='Predicted')
sns.kdeplot(x=y_test, fill=True, label='Actual')
plt.xlabel('Salary')
plt.legend()
plt.title('Pred Salary vs. Actual Salary');

In [None]:
# Fixed seed so the fitted tree (and every score derived from it) is the same on
# each Restart & Run All; 42 matches the seed used for the train/test split.
tree = DecisionTreeRegressor(random_state=42)

tree.fit(X_train, y_train)

In [None]:
pred = tree.predict(X_test)

In [None]:
# Test-set mean squared error of the decision tree (the value is the same in either argument order).
mean_squared_error(y_true=y_test, y_pred=pred)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree on the training split.
scores = cross_val_score(
    tree,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE; flip the sign and take the root to get RMSE per fold.
rand_rmse_in_sample_estimates = np.sqrt(-scores)
display_scores(rand_rmse_in_sample_estimates)

In [None]:
# Predicted vs. actual salary on the test split for the decision tree.
sns.scatterplot(
    x=y_test,
    y=pred,
    alpha=0.6
)
# The stray trailing comma after xlabel (which built an unused tuple) is removed.
plt.xlabel('Actual Salary')
plt.ylabel('Pred Salary')
plt.title('Pred Salary vs. Actual Salary');

In [None]:
# Distribution of actual vs. predicted salary for the decision tree.
# `fill=` replaces the deprecated `shade=`; labels and a legend tell the two curves apart.
sns.kdeplot(x=y_test, fill=True, label='Actual')
sns.kdeplot(x=pred, fill=True, label='Predicted')
plt.xlabel('Salary')
plt.legend()
plt.title('Pred Salary vs. Actual Salary');

In [None]:
# Fixed seed so the fitted forest (and every score derived from it) is the same on
# each Restart & Run All; 42 matches the seed used for the train/test split.
forest = RandomForestRegressor(random_state=42)

forest.fit(X_train, y_train)

In [None]:
pred = forest.predict(X_test)

In [None]:
# Test-set mean squared error of the random forest (the value is the same in either argument order).
mean_squared_error(y_true=y_test, y_pred=pred)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest on the training split.
scores = cross_val_score(
    forest,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE; flip the sign and take the root to get RMSE per fold.
rand_rmse_in_sample_estimates = np.sqrt(-scores)
display_scores(rand_rmse_in_sample_estimates)

In [None]:
# Predicted vs. actual salary on the test split for the random forest.
sns.scatterplot(
    x=y_test,
    y=pred,
    alpha=0.6
)
# The stray trailing comma after xlabel (which built an unused tuple) is removed.
plt.xlabel('Actual Salary')
plt.ylabel('Pred Salary')
plt.title('Pred Salary vs. Actual Salary');

In [None]:
# Distribution of actual vs. predicted salary for the random forest.
# `fill=` replaces the deprecated `shade=`; labels and a legend tell the two curves apart.
sns.kdeplot(x=y_test, fill=True, label='Actual')
sns.kdeplot(x=pred, fill=True, label='Predicted')
plt.xlabel('Salary')
plt.legend()
plt.title('Pred Salary vs. Actual Salary');