<a href="https://colab.research.google.com/github/cbonnin88/EDA_Projects/blob/main/performance_review_polars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import polars as pl
import plotly.express as px
import gdown as gd

In [3]:
# Google Drive download link (uc?id=<file id>) for the dataset file
url = 'https://drive.google.com/uc?id=1CUZ7HDjvh99e6iM78aEFf41ib2fyvsiB'

In [4]:
# Download the CSV next to the notebook (progress output suppressed),
# then parse it into a Polars DataFrame.
gd.download(url=url, output='performance_review.csv', quiet=True)
df_performance = pl.read_csv(source='performance_review.csv')

In [5]:
# Preview the first rows to check the columns and inferred dtypes
df_performance.head()

employee_id,department,job_level,tenure_months,gender,age,performance_rating,potential_rating
i64,str,str,i64,str,i64,i64,i64
101,"""Sales""","""T4""",59,"""Female""",63,5,2
102,"""Product""","""T5""",36,"""Female""",41,4,4
103,"""Sales""","""T2""",57,"""Male""",25,2,5
104,"""Engineering""","""T3""",41,"""Female""",59,3,1
105,"""Engineering""","""T3""",22,"""Male""",48,2,2


In [14]:
# Count null entries per column (result is a one-row frame, one count per column)
missing_values = df_performance.null_count()
display(missing_values)

employee_id,department,job_level,tenure_months,gender,age,performance_rating,potential_rating
u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0


In [16]:
# DataFrame.shape is (n_rows, n_columns); unpack it so each name matches
# the value it actually holds.
num_rows, num_cols = df_performance.shape

print(f'Number of rows in the data frame: {num_rows}')
print(f'Number of columns in the data frame: {num_cols}')

Number of rows in the data frame: 1000
Number of columns in the data frame: 8


# **Exploratory Data Analysis with Polars**

In [8]:
# Descriptive statistics for the rating, tenure and age columns
print('\nDescriptive Statistics')
display(
    df_performance
    .select('performance_rating', 'tenure_months', 'age')
    .describe()
)


Descriptive Statistics


statistic,performance_rating,tenure_months,age
str,f64,f64,f64
"""count""",1000.0,1000.0,1000.0
"""null_count""",0.0,0.0,0.0
"""mean""",3.497,36.755,42.379
"""std""",1.292153,20.188922,12.481312
"""min""",1.0,1.0,21.0
"""25%""",2.0,20.0,32.0
"""50%""",4.0,37.0,42.0
"""75%""",5.0,54.0,53.0
"""max""",5.0,71.0,64.0


In [10]:
# Number of employees per performance rating, sorted by rating (ascending)
performance_distribution = (
    df_performance
    .group_by('performance_rating')
    .len()
    .sort('performance_rating')
)

print('\nPerformance Rating Distribution:')
display(performance_distribution)


Performance Rating Distribution:


performance_rating,len
i64,u32
1,69
2,197
3,207
4,222
5,305


In [17]:
# Average performance by department, highest first.
# group_by() does not guarantee the order of groups, and several departments
# share the same mean once rounded to one decimal. Rank on the exact mean
# (department name breaks exact ties) and round only afterwards, so the
# ordering is reproducible and reflects the real averages.
avg_perf_by_dept = (
    df_performance.group_by('department')
    .agg(
        pl.col('performance_rating').mean().alias('avg_performance'),
    )
    .sort(['avg_performance', 'department'], descending=[True, False])
    .with_columns(pl.col('avg_performance').round(1))
    )

print('\nAverage Performance by Department:')
display(avg_perf_by_dept)


Average Performance by Department:


department,avg_performance
str,f64
"""Engineering""",3.7
"""Leadership""",3.6
"""Product""",3.5
"""HR""",3.5
"""Marketing""",3.5
"""Data""",3.4
"""Sales""",3.3


In [18]:
# Average performance by job level, highest first.
# group_by() does not guarantee the order of groups, and several job levels
# share the same mean once rounded to one decimal. Rank on the exact mean
# (job level breaks exact ties) and round only afterwards, so the ordering
# is reproducible and reflects the real averages.
avg_perf_by_job_level = (
    df_performance.group_by('job_level')
    .agg(
        pl.col('performance_rating').mean().alias('avg_performance_job_level')
    )
    .sort(['avg_performance_job_level', 'job_level'], descending=[True, False])
    .with_columns(pl.col('avg_performance_job_level').round(1))
)
print('\nAverage Performance by Job Level')
display(avg_perf_by_job_level)


Average Performance by Job Level


job_level,avg_performance_job_level
str,f64
"""T1""",3.6
"""T2""",3.5
"""T5""",3.5
"""T3""",3.5
"""T4""",3.4


# **Visualization with Plotly**

In [19]:
# pandas copy of the Polars frame, used as the data source for the Plotly charts
df_performance_viz = df_performance.to_pandas()

In [22]:
# Overall performance distribution: histogram of employee ratings
fig_hist = px.histogram(
    df_performance_viz,
    x='performance_rating',
    title='Distribution of Employee Performance Ratings',
    # The key must match the column name exactly; a misspelled key is
    # silently ignored and the axis falls back to the raw column name.
    labels={'performance_rating':'Performance Rating (1-5)'},
    nbins=5,
    text_auto=True
)
fig_hist.update_traces(marker=dict(color='#70aa58'))
fig_hist.show()


In [24]:
# Bar chart comparing departments on their average rating
avg_perf_by_dept_pd = avg_perf_by_dept.to_pandas()

fig_bar = px.bar(
    data_frame=avg_perf_by_dept_pd,
    x='department',
    y='avg_performance',
    color='department',
    text='avg_performance',
    labels={
        'department': 'Department',
        'avg_performance': 'Average Performance Rating',
    },
    title='Average Performance Rating by Department',
)

# Print each bar's value with one decimal place, positioned outside the bar
fig_bar.update_traces(textposition='outside', texttemplate='%{text:.1f}')
fig_bar.show()

In [26]:
# Bar chart comparing job levels on their average rating
avg_perf_by_job_level_pd = avg_perf_by_job_level.to_pandas()

fig_bar_2 = px.bar(
    data_frame=avg_perf_by_job_level_pd,
    x='job_level',
    y='avg_performance_job_level',
    color='job_level',
    text='avg_performance_job_level',
    labels={
        'job_level': 'Job Level',
        'avg_performance_job_level': 'Average Performance Rating',
    },
    title='Average Performance Rating by Job Level',
)

# Print each bar's value with one decimal place, positioned inside the bar
fig_bar_2.update_traces(textposition='inside', texttemplate='%{text:.1f}')
fig_bar_2.show()

In [27]:
# Scatter of tenure against rating, one colour per job level
fig_scatter = px.scatter(
    data_frame=df_performance_viz,
    x='tenure_months',
    y='performance_rating',
    color='job_level',
    labels={
        'tenure_months': 'Tenure (Months)',
        'performance_rating': 'Performance Rating',
    },
    title='Tenure vs Performance Rating by Job Level',
)
fig_scatter.show()

In [28]:
# Investigating potential bias: box plot of the rating distribution per gender

fig_box = px.box(
    df_performance_viz,
    x='gender',
    y='performance_rating',
    color='gender',
    # Title typo fixed: "Ration" -> "Rating"
    title='Performance Rating Distribution by Gender',
    labels={'gender':'Gender','performance_rating':'Performance Rating'}
)

fig_box.show()