# Submission Pertama: Menyelesaikan Permasalahan Human Resources

- Nama: Bayu Indra Kusuma
- Email: bayuindrakusuma05@gmail.com
- Id Dicoding: bayuik

## Persiapan

### Menyiapkan library yang dibutuhkan

In [None]:
# Data handling
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Statistical analysis
from scipy.stats import ttest_ind

# Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Modeling
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Utilities
import joblib

### Menyiapkan data yang akan digunakan

## Data Understanding

In [None]:
df = pd.read_csv('employee_data.csv')
df.head()

In [None]:
df.columns.tolist()

In [None]:
unique_counts = df.nunique().sort_values(ascending=False)
unique_counts

In [None]:
df.info()

In [None]:
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    print(f"### {col}")
    for val in df[col].unique():
        print(f"- {val}")
    print()

In [None]:
num_df = df.select_dtypes(include=['int64', 'float64'])
corr = num_df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, cbar=True)
plt.title("Korelasi Fitur Numerik", fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(6, 4))
sns.countplot(x='Attrition', data=df, palette='magma', hue=df['Attrition'])
plt.title('Distribusi Karyawan yang Resign vs Tidak')
plt.xticks([0, 1], ['Tidak Resign', 'Resign'])
plt.xlabel('Attrition')
plt.ylabel('Jumlah')
plt.show()

In [None]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols].boxplot(figsize=(15, 5), rot=90)
plt.title('Boxplot untuk Deteksi Outlier')
plt.show()


In [None]:
corr_target = df[num_cols].corr()['Attrition'].sort_values(ascending=False)
corr_target

In [None]:
df.describe().T.sort_values(by='std', ascending=False)

In [None]:

num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols].boxplot(figsize=(15, 5), rot=90)
plt.title('Boxplot untuk Deteksi Outlier')
plt.show()

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
df.duplicated().sum()

## Data Preparation / Preprocessing

In [None]:
# Menghapus missing value (jika ada)
df.dropna(inplace=True)

# Menghapus kolom yang tidak memberikan informasi (konstan atau tidak relevan)
df.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'], inplace=True)


In [None]:
# Menampilkan 5 data pertama setelah pembersihan awal
df.head()

In [None]:
# Mengubah nilai kolom 'Attrition' dari 0 dan 1 menjadi 'Yes' dan 'No'
df['Attrition'] = df['Attrition'].map({1: 'Yes', 0: 'No'})
df['Attrition'] = df['Attrition'].astype('category')
df['Attrition'].unique()


In [None]:
# Mapping untuk kolom ordinal
ordinal_map = {
    'Education': {1: 'Below College', 2: 'College', 3: 'Bachelor', 4: 'Master', 5: 'Doctor'},
    'EnvironmentSatisfaction': {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'},
    'JobInvolvement': {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'},
    'JobSatisfaction': {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'},
    'PerformanceRating': {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'},
    'RelationshipSatisfaction': {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'},
    'WorkLifeBalance': {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'}
}

# Terapkan mapping dan ubah ke kategori
for col, mapping in ordinal_map.items():
    df[col] = df[col].map(mapping).astype('category')


In [None]:
# Kolom kategorikal eksplisit
explicit_categoricals = [
    'BusinessTravel', 'Department', 'EducationField',
    'Gender', 'JobRole', 'MaritalStatus', 'OverTime'
]

# Ubah tipe data ke kategori
for col in explicit_categoricals:
    df[col] = df[col].astype('category')


In [None]:
# Deskripsi fitur kategorikal
df.describe(include='category').T

In [None]:
# Deskripsi fitur numerik
df.describe(exclude='category').T


In [None]:
# Visualisasi semua kategori dalam satu figure
plt.figure(figsize=(15, 20))
for i, col in enumerate(kategori_cols, 1):
    plt.subplot(4, 2, i)
    sns.countplot(data=df, x=col, hue='Attrition',
                palette={'Yes': '#FF6347', 'No': '#4682B4'})
    plt.title(f'Attrition by {col}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Hitung total karyawan unik
total_employees = df['EmployeeId'].nunique()
print(f"Jumlah karyawan: {total_employees}")

In [None]:
# Hitung persentase attrition
attr_by_travel = df.groupby(['BusinessTravel', 'Attrition']).size().unstack()
attr_by_travel['Attrition_Rate'] = (attr_by_travel['Yes'] /
                                  (attr_by_travel['Yes'] + attr_by_travel['No'])) * 100

fig = px.bar(
    attr_by_travel,
    x=attr_by_travel.index,
    y='Attrition_Rate',
    color_discrete_sequence=['#FF6347'],
    labels={'x': 'Business Travel', 'y': 'Attrition Rate (%)'},
    title="Persentase Attrition Berdasarkan Business Travel"
)
fig.show()

In [None]:
# Hitung persentase attrition
attr_by_travel = df.groupby(['BusinessTravel', 'Attrition']).size().unstack()
attr_by_travel['Attrition_Rate'] = (attr_by_travel['Yes'] /
                                  (attr_by_travel['Yes'] + attr_by_travel['No'])) * 100

fig = px.bar(
    attr_by_travel,
    x=attr_by_travel.index,
    y='Attrition_Rate',
    color_discrete_sequence=['#FF6347'],
    labels={'x': 'Business Travel', 'y': 'Attrition Rate (%)'},
    title="Persentase Attrition Berdasarkan Business Travel"
)
fig.show()

In [None]:
fig = px.box(
    df,
    x="Attrition",
    y="DistanceFromHome",
    color="Attrition",
    color_discrete_map={"Yes": "#FF6347", "No": "#4682B4"},
    title="Distribusi Jarak dari Rumah",
    points="all"
)
fig.update_layout(yaxis_title="Jarak dari Rumah (km)")
fig.show()

In [None]:
# Analisis gender yang lebih komprehensif
gender_analysis = (df.groupby('Gender')['Attrition']
                  .value_counts(normalize=True)
                  .mul(100)
                  .rename('Percentage')
                  .reset_index()
                  .query("Attrition == 'Yes'"))

fig = px.bar(
    gender_analysis,
    x='Gender',
    y='Percentage',
    color='Gender',
    text='Percentage',
    title='Attrition Rate by Gender',
    labels={'Percentage': 'Attrition Rate (%)'},
    color_discrete_sequence=['#FF6347', '#4682B4']
)
fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

In [None]:
# Analisis kelompok umur lebih detail
df['AgeGroup'] = pd.cut(df['Age'], bins=[17, 25, 35, 45, 55, 65],
                       labels=['18-25', '26-35', '36-45', '46-55', '56-65'])

age_analysis = (df.groupby('AgeGroup')['Attrition']
                .value_counts(normalize=True)
                .mul(100)
                .rename('Percentage')
                .reset_index()
                .query("Attrition == 'Yes'"))

fig = px.bar(
    age_analysis,
    x='AgeGroup',
    y='Percentage',
    color='AgeGroup',
    text='Percentage',
    title='Attrition Rate by Age Group',
    labels={'Percentage': 'Attrition Rate (%)'},
    color_discrete_sequence=px.colors.sequential.Reds
)
fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.show()

# Tambahkan boxplot untuk distribusi usia
fig = px.box(df, x='Attrition', y='Age', color='Attrition',
             color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
             title='Age Distribution by Attrition Status')
fig.show()

In [None]:
# Cross analysis gender dan usia
cross_analysis = (df.groupby(['Gender', 'AgeGroup'])['Attrition']
                  .value_counts(normalize=True)
                  .mul(100)
                  .rename('Percentage')
                  .reset_index()
                  .query("Attrition == 'Yes'"))

fig = px.bar(
    cross_analysis,
    x='AgeGroup',
    y='Percentage',
    color='Gender',
    barmode='group',
    title='Attrition Rate by Age Group and Gender',
    labels={'Percentage': 'Attrition Rate (%)'},
    color_discrete_sequence=['#FF6347', '#4682B4']
)
fig.show()

In [None]:
# Format output yang lebih profesional
print("\n=== ATTRITION ANALYSIS REPORT ===")
print(f"\nOverall Attrition Rate: {attrition_rate:.1f}%")

print("\nTop Risk Factors:")
for col in kategori_cols:
    if col != 'Attrition':
        rate = df.groupby(col)['Attrition'].value_counts(normalize=True).xs('Yes', level=1)
        top3 = rate.sort_values(ascending=False).head(3)
        print(f"\n- {col}:")
        for val, pct in top3.items():
            print(f"  {val}: {pct*100:.1f}%")

In [None]:
# Heatmap untuk faktor kategorikal
cat_attrition = pd.DataFrame()

for col in kategori_cols:
    if col != 'Attrition':
        rates = df.groupby(col)['Attrition'].value_counts(normalize=True).xs('Yes', level=1)*100
        cat_attrition[col] = rates

plt.figure(figsize=(12, 8))
sns.heatmap(cat_attrition.T.sort_values(by=cat_attrition.columns[0], ascending=False),
            annot=True, fmt=".1f", cmap="YlOrRd")
plt.title("Attrition Rates Across Categories (%)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# 1. Analisis Komprehensif Job Role
jobrole_analysis = (df.groupby('JobRole')['Attrition']
                    .agg(['count', lambda x: (x == 'Yes').sum()])
                    .rename(columns={'count': 'Total', '<lambda_0>': 'AttritionCount'})
                    .assign(AttritionRate=lambda x: (x['AttritionCount']/x['Total'])*100)
                    .sort_values('AttritionRate', ascending=False)
                    .reset_index())

# Visualisasi Job Role Analysis
fig = px.bar(jobrole_analysis,
             x='AttritionRate',
             y='JobRole',
             orientation='h',
             color='AttritionRate',
             color_continuous_scale='Reds',
             title='Attrition Rate by Job Role (Ordered by Highest Risk)',
             labels={'AttritionRate': 'Attrition Rate (%)', 'JobRole': 'Job Role'},
             hover_data=['Total', 'AttritionCount'])

fig.update_layout(coloraxis_showscale=False)
fig.show()

In [None]:
# 2. Analisis Komparasi Gaji
salary_comparison = (df.groupby(['JobRole', 'Attrition'])['MonthlyIncome']
                     .agg(['mean', 'median', 'count'])
                     .reset_index())

# Visualisasi Perbandingan Gaji
fig = px.bar(salary_comparison,
             x='JobRole',
             y='mean',
             color='Attrition',
             barmode='group',
             title='Salary Comparison by Job Role and Attrition Status',
             labels={'mean': 'Average Monthly Income ($)', 'JobRole': 'Job Role'},
             hover_data=['median', 'count'],
             color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'})

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()


In [None]:
# 3. Analisis Gabungan (Bubble Chart)
combined_analysis = pd.merge(
    jobrole_analysis,
    salary_comparison[salary_comparison['Attrition']=='Yes'][['JobRole', 'mean']],
    on='JobRole'
).rename(columns={'mean': 'AvgSalary'})

fig = px.scatter(combined_analysis,
                 x='AvgSalary',
                 y='AttritionRate',
                 size='Total',
                 color='JobRole',
                 hover_name='JobRole',
                 title='Attrition Risk vs Salary by Job Role',
                 labels={'AvgSalary': 'Average Salary ($)',
                        'AttritionRate': 'Attrition Rate (%)',
                        'Total': 'Total Employees'})

fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
fig.show()

In [None]:
# 4. Analisis Statistik
print("\n=== JOB ROLE INSIGHTS ===")
print(f"Highest Attrition Rate: {jobrole_analysis.iloc[0]['JobRole']} ({jobrole_analysis.iloc[0]['AttritionRate']:.1f}%)")
print(f"Lowest Attrition Rate: {jobrole_analysis.iloc[-1]['JobRole']} ({jobrole_analysis.iloc[-1]['AttritionRate']:.1f}%)")

salary_gap = (salary_comparison[salary_comparison['Attrition']=='No']['mean'].mean() -
              salary_comparison[salary_comparison['Attrition']=='Yes']['mean'].mean())
print(f"\nAverage Salary Difference (Stayers vs Leavers): ${salary_gap:.2f}")

In [None]:
# 1. Analisis Overtime yang Lebih Komprehensif
overtime_analysis = (df.groupby(['OverTime', 'Attrition'])
                     .size()
                     .unstack()
                     .assign(AttritionRate=lambda x: x['Yes']/(x['Yes']+x['No'])*100)
                     .reset_index())

# Visualisasi Overtime Analysis
fig = px.bar(overtime_analysis,
             x='OverTime',
             y='AttritionRate',
             color='OverTime',
             text='AttritionRate',
             title='Attrition Rate by Overtime Status',
             labels={'AttritionRate': 'Attrition Rate (%)', 'OverTime': 'Overtime Status'},
             color_discrete_sequence=['#FF6347', '#4682B4'])

fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(showlegend=False)
fig.show()


In [None]:
# 2. Analisis Gabungan Overtime dan Income
fig = px.box(df,
             x='OverTime',
             y='MonthlyIncome',
             color='Attrition',
             facet_col='Attrition',
             title='Income Distribution by Overtime and Attrition Status',
             labels={'MonthlyIncome': 'Monthly Income ($)'},
             color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'})

fig.update_layout(boxmode='group')
fig.show()

In [None]:
print("\n=== OVERTIME & INCOME INSIGHTS ===")

# Uji statistik perbedaan gaji
from scipy.stats import mannwhitneyu
overtime_yes = df[df['OverTime']=='Yes']
overtime_no = df[df['OverTime']=='No']

stat, p = mannwhitneyu(overtime_yes['MonthlyIncome'], overtime_no['MonthlyIncome'])
print(f"\nIncome difference (Overtime vs Non-Overtime):")
print(f"Median Overtime: ${overtime_yes['MonthlyIncome'].median():.2f}")
print(f"Median Non-Overtime: ${overtime_no['MonthlyIncome'].median():.2f}")
print(f"Significant? {'Yes' if p < 0.05 else 'No'} (p={p:.4f})")

# Analisis interaksi Overtime-Attrition-Income
interaction = (df.groupby(['OverTime', 'Attrition'])['MonthlyIncome']
               .agg(['median', 'count'])
               .reset_index()
               .sort_values(['OverTime', 'Attrition']))

print("\nIncome Medians by Overtime and Attrition:")
print(interaction.to_string(index=False))

In [None]:
# 4. Visualisasi Interaktif Gabungan
fig = px.scatter(df,
                 x='MonthlyIncome',
                 y='YearsAtCompany',
                 color='Attrition',
                 facet_col='OverTime',
                 title='Income vs Tenure by Overtime and Attrition Status',
                 labels={'MonthlyIncome': 'Monthly Income ($)',
                        'YearsAtCompany': 'Years at Company'},
                 color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
                 hover_data=['JobRole', 'Age'])

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()

In [None]:
# Enhanced Statistical Analysis
print("=== INCOME ANALYSIS ===")
print(f"Mean income (Attrition=Yes): ${income_yes.mean():.2f}")
print(f"Mean income (Attrition=No): ${income_no.mean():.2f}")
print(f"Difference: ${income_no.mean()-income_yes.mean():.2f}")
print(f"T-Statistic: {t_stat:.2f}, P-Value: {p_value:.4f}")
print("Conclusion: The income difference is", "statistically significant" if p_value < 0.05 else "not statistically significant")

# Enhanced Scatter Plot
fig = px.scatter(df,
                 x='MonthlyIncome',
                 y='HourlyRate',
                 color='Attrition',
                 symbol='Department',
                 size='YearsAtCompany',  # Added size dimension
                 hover_data=['JobRole', 'Age', 'EducationField'],  # More context
                 title="Income vs Hourly Rate by Department and Attrition Status",
                 labels={'MonthlyIncome': 'Monthly Income ($)',
                        'HourlyRate': 'Hourly Rate ($)'},
                 color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'})

fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02))
fig.show()


In [None]:
attrition_by_education = (df.groupby('EducationField')['Attrition']
                          .value_counts(normalize=True)
                          .mul(100)
                          .rename('Percentage')
                          .reset_index()
                          .query("Attrition == 'Yes'")
                          .sort_values('Percentage', ascending=False))

# Add count information
counts = df['EducationField'].value_counts().rename('TotalCount').reset_index()
attrition_by_education = pd.merge(attrition_by_education, counts, on='EducationField')

fig = px.bar(attrition_by_education,
             x='EducationField',
             y='Percentage',
             color='EducationField',
             text='Percentage',
             title='Attrition Rate by Education Field (with Sample Sizes)',
             labels={'Percentage': 'Attrition Rate (%)'},
             hover_data=['TotalCount'],
             color_discrete_sequence=px.colors.qualitative.Pastel)

fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(showlegend=False, xaxis_title=None)
fig.show()

In [None]:
# Additional Insight: Education vs Income
fig = px.box(df,
             x='EducationField',
             y='MonthlyIncome',
             color='Attrition',
             title='Income Distribution by Education Field and Attrition',
             labels={'MonthlyIncome': 'Monthly Income ($)'},
             color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'})

fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
# 1. Comprehensive Overtime Analysis
overtime_analysis = (df.groupby(['OverTime', 'Attrition'])
                     .size()
                     .unstack()
                     .assign(
                         Total=lambda x: x['Yes'] + x['No'],
                         AttritionRate=lambda x: x['Yes']/(x['Yes']+x['No'])*100
                     )
                     .reset_index())

# Enhanced Overtime Visualization
fig = px.bar(overtime_analysis,
             x='OverTime',
             y='AttritionRate',
             color='OverTime',
             text='AttritionRate',
             title='Attrition Rate by Overtime Status (with Counts)',
             labels={'AttritionRate': 'Attrition Rate (%)'},
             hover_data=['Yes', 'No', 'Total'],
             color_discrete_sequence=['#FF6347', '#4682B4'])

fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(showlegend=False, uniformtext_minsize=8)
fig.show()

In [None]:
# 2. Satisfaction Analysis (Combined Plot)
from plotly.subplots import make_subplots
import plotly.graph_objects as go

satisfaction_metrics = ['EnvironmentSatisfaction', 'JobSatisfaction']

fig = make_subplots(rows=1, cols=2,
                   subplot_titles=("Environment Satisfaction", "Job Satisfaction"))

for i, metric in enumerate(satisfaction_metrics, 1):
    satisfaction_df = (df.groupby(metric)['Attrition']
                      .value_counts(normalize=True)
                      .mul(100)
                      .rename('Percentage')
                      .reset_index()
                      .query("Attrition == 'Yes'"))

    fig.add_trace(
        go.Bar(
            x=satisfaction_df[metric],
            y=satisfaction_df['Percentage'],
            name=metric,
            marker_color='#FF6347',
            text=satisfaction_df['Percentage'].round(1)
        ),
        row=1, col=i
    )

fig.update_layout(
    title_text="Attrition Rate by Satisfaction Levels",
    showlegend=False,
    yaxis_title="Attrition Rate (%)",
    height=500
)
fig.update_traces(texttemplate='%{text}%', textposition='outside')
fig.show()

In [None]:
import plotly.express as px

# Melt the dataframe for facet plotting
melted_df = df.melt(id_vars=['Attrition'],
                   value_vars=['EnvironmentSatisfaction', 'JobSatisfaction'],
                   var_name='SatisfactionType',
                   value_name='SatisfactionLevel')

# Calculate percentages
plot_df = (melted_df.groupby(['SatisfactionType', 'SatisfactionLevel', 'Attrition'])
           .size()
           .groupby(level=[0,1])
           .apply(lambda x: 100 * x / x.sum())
           .reset_index(name='Percentage')
           .query("Attrition == 'Yes'"))

# Create facet plot
fig = px.bar(plot_df,
             x='SatisfactionLevel',
             y='Percentage',
             facet_col='SatisfactionType',
             title='Attrition Rate by Satisfaction Levels',
             labels={'Percentage': 'Attrition Rate (%)'},
             color_discrete_sequence=['#FF6347'])

fig.update_traces(texttemplate='%{y:.1f}%', textposition='outside')
fig.update_layout(showlegend=False)
fig.show()

In [None]:

# 3. Combined Overtime-Satisfaction Analysis
fig = px.sunburst(
    df,
    path=['OverTime', 'JobSatisfaction', 'Attrition'],
    color='Attrition',
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    title="Attrition Breakdown by Overtime and Job Satisfaction",
    width=700, height=700
)
fig.show()

In [None]:
# 1. Enhanced Satisfaction Analysis (Combined Plot)
from plotly.subplots import make_subplots
import plotly.graph_objects as go

satisfaction_metrics = ['EnvironmentSatisfaction', 'JobSatisfaction']

fig = make_subplots(rows=1, cols=2,
                   subplot_titles=("Environment Satisfaction", "Job Satisfaction"),
                   shared_yaxes=True)

for i, metric in enumerate(satisfaction_metrics, 1):
    # Calculate attrition rates
    satisfaction_df = (df.groupby(metric)['Attrition']
                      .value_counts(normalize=True)
                      .mul(100)
                      .rename('Percentage')
                      .reset_index()
                      .query("Attrition == 'Yes'"))

    fig.add_trace(
        go.Bar(
            x=satisfaction_df[metric],
            y=satisfaction_df['Percentage'],
            name=metric.replace('Satisfaction', ''),
            marker_color='#FF6347',
            text=satisfaction_df['Percentage'].round(1),
            textposition='outside'
        ),
        row=1, col=i
    )

fig.update_layout(
    title_text="Attrition Rates by Satisfaction Levels",
    showlegend=False,
    yaxis_title="Attrition Rate (%)",
    height=500,
    uniformtext_minsize=8
)
fig.show()

In [None]:

# 3. Additional Analysis: Satisfaction vs Experience
fig = px.scatter(
    df,
    x='YearsAtCompany',
    y='JobSatisfaction',
    color='Attrition',
    facet_col='Attrition',
    title="Job Satisfaction vs Years at Company",
    labels={'YearsAtCompany': 'Years at Company', 'JobSatisfaction': 'Job Satisfaction'},
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    trendline="lowess"
)
fig.show()

In [None]:
# 1. Enhanced Experience Distribution Analysis
fig = px.histogram(
    df,
    x='YearsAtCompany',
    color='Attrition',
    facet_col='Attrition',
    barmode='overlay',
    title="Detailed Work Experience Distribution by Attrition Status",
    labels={'YearsAtCompany': 'Years at Company'},
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    nbins=20,
    marginal='box',
    opacity=0.7,
    hover_data=['Department', 'JobRole', 'MonthlyIncome'],
    histnorm='percent'  # Show percentage instead of count
)

# Add statistical annotations
for status, color in [('Yes', '#FF6347'), ('No', '#4682B4')]:
    subset = df[df['Attrition'] == status]
    mean_val = subset['YearsAtCompany'].mean()
    median_val = subset['YearsAtCompany'].median()

    fig.add_annotation(
        x=mean_val, y=10,
        text=f"Mean: {mean_val:.1f} yrs<br>Median: {median_val:.1f} yrs",
        showarrow=True,
        arrowhead=1,
        ax=0,
        ay=-40,
        bgcolor=color,
        opacity=0.8
    )

fig.update_layout(bargap=0.1, height=500)
fig.show()

In [None]:

# 2. Enhanced Work Experience Trend Analysis
experience_metrics = ['YearsAtCompany', 'TotalWorkingYears']

fig = make_subplots(rows=1, cols=2,
                   subplot_titles=("Years at Current Company", "Total Working Years"))

for i, metric in enumerate(experience_metrics, 1):
    temp_df = df.groupby([metric, 'Attrition']).size().unstack().fillna(0)

    for status, color in [('Yes', '#FF6347'), ('No', '#4682B4')]:
        fig.add_trace(
            go.Scatter(
                x=temp_df.index,
                y=temp_df[status],
                name=f"{status} Attrition",
                line=dict(color=color),
                mode='lines+markers',
                hovertemplate=f"{metric}: %{{x}}<br>Count: %{{y}}"
            ),
            row=1, col=i
        )

fig.update_layout(
    title_text="Attrition Trends by Work Experience Metrics",
    yaxis_title="Employee Count",
    height=500,
    hovermode='x unified'
)
fig.show()

In [None]:

# 3. Enhanced Education Analysis with Attrition Rates
education_order = ['Below College', 'College', 'Bachelor', 'Master', 'Doctor']

fig = px.bar(
    df.groupby('Education')['Attrition']
      .value_counts(normalize=True)
      .mul(100)
      .rename('Percentage')
      .reset_index()
      .query("Attrition == 'Yes'")
      .sort_values('Education', key=lambda x: pd.Categorical(x, categories=education_order, ordered=True)),
    x='Education',
    y='Percentage',
    color='Education',
    title="Attrition Rate by Education Level",
    labels={'Percentage': 'Attrition Rate (%)'},
    text='Percentage',
    color_discrete_sequence=px.colors.sequential.Reds
)

fig.update_traces(
    texttemplate='%{text:.1f}%',
    textposition='outside',
    marker_line_color='black',
    marker_line_width=0.5
)

fig.update_layout(
    showlegend=False,
    xaxis_title="Education Level",
    yaxis_range=[0, df['Attrition'].eq('Yes').mean()*100*2]  # Scale to 2x overall rate
)
fig.show()

In [None]:
 #Combined Education-Experience Analysis
fig = px.box(
    df,
    x='Education',
    y='YearsAtCompany',
    color='Attrition',
    points='all',
    title="Work Experience Distribution by Education and Attrition",
    labels={'YearsAtCompany': 'Years at Company'},
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    category_orders={'Education': education_order}
)

fig.update_layout(
    boxmode='group',
    height=600,
    hovermode='closest'
)
fig.show()

In [None]:
df.to_csv('employee.csv', index=False)

## Modeling

In [None]:
df.info()

In [None]:
df.drop(columns='EmployeeId',inplace=True)
df

In [None]:
# Mapping metadata untuk mengubah kategori menjadi numerik
mapping_metadata_to_numeric = {
    "Attrition": {"No": 0, "Yes": 1},
    "Education": {"Below College": 1, "College": 2, "Bachelor": 3, "Master": 4, "Doctor": 5},
    "EnvironmentSatisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "JobInvolvement": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "JobSatisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "PerformanceRating": {"Low": 1, "Good": 2, "Excellent": 3, "Outstanding": 4},
    "RelationshipSatisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "WorkLifeBalance": {"Low": 1, "Good": 2, "Excellent": 3, "Outstanding": 4}
}

# Konversi kategori menjadi numerik
for col, mapping in mapping_metadata_to_numeric.items():
    if col in data.columns:
        data[col] = data[col].map(mapping).astype("int")

# Periksa tipe data setelah konversi
data

In [None]:
columns = ['BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime']

# Inisialisasi LabelEncoder
label_encoders = {}

# Proses encoding untuk setiap kolom
for col in columns:
    if col in data.columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le  # Simpan encoder untuk referensi di masa depan

# Periksa data setelah transformasi
data

In [None]:
x = data.drop('Attrition',axis=1)
y = data['Attrition']

In [None]:
#split data into train and test set.
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=42)

In [None]:
# Inisialisasi scaler
scaler = MinMaxScaler()

# Fit scaler pada data training dan transformasi
X_train = scaler.fit_transform(X_train)

# Transformasi data testing menggunakan scaler yang sama
X_test = scaler.transform(X_test)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
boost = AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=1), n_estimators=500, algorithm='SAMME',learning_rate=0.01)

boost.fit(X_train,y_train)

In [None]:
predictions = boost.predict(X_test)

## Evaluation

In [None]:
# Evaluasi model
accuracy = accuracy_score(y_test, predictions)
print(f"Akurasi Model: {accuracy:.2f}")

In [None]:
# Laporan klasifikasi
print("\nLaporan Klasifikasi:")
print(classification_report(y_test, predictions))

In [None]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, predictions)

In [None]:
# Visualisasi Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=set(y_test), yticklabels=set(y_test))
plt.title("Confusion Matrix")
plt.xlabel("Prediksi")
plt.ylabel("Aktual")
plt.show()

In [None]:
# Save the model
joblib.dump(boost, 'adaboost_model.pkl')

print("Model saved successfully!")