<a href="https://colab.research.google.com/github/cbonnin88/starfield_industries/blob/main/employee_distribution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import polars as pl
import plotly.express as px
import gdown as gd

In [None]:
# Fetch the HR extract from Google Drive into the working directory,
# then load the local copy into a Polars DataFrame.
url = 'https://drive.google.com/uc?id=1pAE3Knjo4JSSY4vVi8DwBFkIEU6vlRDT'
gd.download(url, output='starfield_HR.csv', quiet=True)
df_starfield = pl.read_csv('starfield_HR.csv')

In [None]:
# Preview the first five rows to sanity-check the load.
df_starfield.head(5)

ID,Name,Surname,Age,Tenure,Hire Date,Gender,Region,Job Title,Department,Manager,Hours,Salary Band,Salary,Performance,Satisfaction
i64,str,str,i64,i64,str,str,str,str,str,str,i64,str,i64,str,f64
4045,"""Gwendolyn""","""Turner""",18,0,"""2025-05-07""","""Female""","""ile-de-France""","""Automation Engineer""","""Engineering""","""no""",8,"""T5""",13600,"""Average""",3.0
5780,"""Jason""","""Peterson""",20,1,"""2024-09-01""","""Male""","""ile-de-France""","""Release Engineer""","""Engineering""","""no""",8,"""T5""",14688,"""Average""",5.0
9231,"""Max""","""Hopkins""",19,1,"""2024-02-03""","""Male""","""ile-de-France""","""Support Engineer""","""Engineering""","""no""",8,"""T5""",14786,"""Average""",3.0
11163,"""Vanesa""","""Saldaña""",18,0,"""2025-06-17""","""Male""","""ile-de-France""","""Quantum Engineer""","""Engineering""","""no""",8,"""T5""",15439,"""Average""",0.0
7851,"""Amador""","""Roybal""",54,2,"""2023-02-11""","""Male""","""ile-de-France""","""Financial Planning & Analysis …","""Finance""","""no""",8,"""T5""",22427,"""Average""",5.0


In [None]:
# Normalise the headers to snake_case: lower-case everything, turn spaces
# and slashes into underscores, and drop dots.
header_fixes = str.maketrans({' ': '_', '/': '_', '.': None})

original_column = df_starfield.columns
new_columns = [header.lower().translate(header_fixes) for header in original_column]
df_starfield = df_starfield.rename(dict(zip(original_column, new_columns)))

df_starfield.columns

['id',
 'name',
 'surname',
 'age',
 'tenure',
 'hire_date',
 'gender',
 'region',
 'job_title',
 'department',
 'manager',
 'hours',
 'salary_band',
 'salary',
 'performance',
 'satisfaction']

In [None]:
# Convert 'hire_date' from text to a proper Date column.
# The dtype guard makes this cell safe to re-run: once the column has been
# converted it is no longer a string column and the block is skipped.
if 'hire_date' in df_starfield.columns and df_starfield['hire_date'].dtype == pl.Utf8:
    try:
        nulls_before = df_starfield['hire_date'].null_count()
        df_starfield = df_starfield.with_columns(
            pl.col('hire_date').str.to_date('%Y-%m-%d', strict=False).alias('hire_date')
        )
        print("\nConverted 'hire_date' to Date type.")

        # strict=False turns every value that does not match the format into
        # null instead of raising, so report how many rows were affected
        # rather than letting them disappear silently.
        unparsed = df_starfield['hire_date'].null_count() - nulls_before
        if unparsed > 0:
            print(f"Warning: {unparsed} 'hire_date' value(s) did not match '%Y-%m-%d' and are now null.")
    except Exception as e:
        # Best-effort: keep the column as text and report why it failed.
        print(f"Could not convert 'hire_date' to Date: {e}")


df_starfield.dtypes


Converted 'hire_date' to Date type.


[Int64,
 String,
 String,
 Int64,
 Int64,
 Date,
 String,
 String,
 String,
 String,
 String,
 Int64,
 String,
 Int64,
 String,
 Float64]

# **Number of Employees by Department**

In [None]:
# Headcount per department, largest department first.
department_counts = (
    df_starfield
    .group_by('department')
    .agg(pl.len().alias('employee_count'))
    .sort('employee_count', descending=True)
)

department_counts

department,employee_count
str,u32
"""Engineering""",222
"""Product & Tech""",204
"""Sales""",186
"""Marketing""",88
"""Finance""",47
"""Human Resources""",30
"""Leadership""",20


In [None]:
# Bar chart of headcount per department, one colour per department.
fig = px.bar(
    department_counts,
    x='department',
    y='employee_count',
    title='Number of Employees per department',
    # Map both plotted columns to readable names; without the
    # 'employee_count' entry the y axis shows the raw column name.
    labels={'department':'Department','employee_count':'Number of Employees'},
    text='employee_count',
    color='department'
)

fig.show()

# **Employees by Gender**

In [None]:
# Headcount per gender, largest group first.
gender_counts = (
    df_starfield
    .group_by('gender')
    .agg(employee_count=pl.len())
    .sort('employee_count', descending=True)
)
gender_counts

gender,employee_count
str,u32
"""Male""",439
"""Female""",358


In [None]:
# Bar chart of headcount per gender, one colour per gender.
fig = px.bar(
    gender_counts,
    x='gender',
    y='employee_count',
    title='Number of Employees by Gender',
    # The label keys must name columns this figure actually plots; the
    # previous 'department' key matched nothing here, so the axes and
    # legend fell back to the raw column names.
    labels={'gender':'Gender','employee_count':'Number of Employees'},
    text='employee_count',
    color='gender'
)

fig.show()

# **Gender by Department**

In [None]:
# Headcount for every (gender, department) pair, largest group first.
# group_by does not guarantee the order of its groups, so sorting on the
# count alone would leave tied rows in an arbitrary order from run to run.
# 'department' and 'gender' act as tie-breakers (also descending) so the
# table is reproducible.
gender_counts_by_dept = (
    df_starfield
    .group_by(['gender','department'])
    .agg(pl.len().alias('employee_count'))
    .sort(['employee_count','department','gender'], descending=True)
)
gender_counts_by_dept

gender,department,employee_count
str,str,u32
"""Male""","""Engineering""",137
"""Male""","""Product & Tech""",106
"""Male""","""Sales""",103
"""Female""","""Product & Tech""",98
"""Female""","""Engineering""",85
…,…,…
"""Male""","""Finance""",23
"""Male""","""Human Resources""",18
"""Female""","""Leadership""",12
"""Female""","""Human Resources""",12


In [None]:
# Stacked bars: one bar per department, split into segments by gender.
fig_stacked = px.bar(
    data_frame=gender_counts_by_dept,
    title='Number of Employees by Gender in each Department',
    x='department',
    y='employee_count',
    text='employee_count',
    color='gender',
    barmode='stack',
    labels={
        'department':'Department',
        'employee_count':'Number of Employees',
    },
)

fig_stacked.show()

In [None]:
# Company-wide mean age, rounded to zero decimal places.
avg_age = df_starfield.select(
    pl.col("age").mean().round(0)
)

avg_age

age
f64
42.0


In [None]:
# Mean age within each gender group, rounded to zero decimal places.
avg_age_by_gender = (
    df_starfield
    .group_by("gender")
    .agg(age=pl.col("age").mean().round(0))
)
avg_age_by_gender

gender,age
str,f64
"""Male""",42.0
"""Female""",42.0


In [None]:
# Histogram of employee ages with the count printed on each bar.
# A small gap between bars keeps neighbouring bins visually distinct.
fig = px.histogram(
    data_frame=df_starfield,
    x='age',
    nbins=20,
    text_auto=True,
    title='Employee Age Distribution',
    labels={'age':'Age'},
    color_discrete_sequence=px.colors.qualitative.Plotly,
).update_layout(bargap=0.1)
fig.show()