# Submission Pertama: Menyelesaikan Permasalahan Human Resources

- Nama: Bayu Indra Kusuma
- Email: bayuindrakusuma05@gmail.com
- Id Dicoding: bayuik

## Persiapan

### Menyiapkan library yang dibutuhkan

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

### Menyiapkan data yang akan digunakan

## Data Understanding

In [None]:
# Cell 1: load the raw employee dataset and preview it.
import pandas as pd

data = pd.read_csv('employee_data.csv')
data.head(n=3)  # preview only the first three rows to keep the output short

In [None]:
# Cell 2: report the dataset dimensions and the column names.
column_names = list(data.columns)  # a plain list prints more compactly than an Index

print("=" * 50, "SHAPE & COLUMNS:", sep="\n")
print(f"Shape: {data.shape}")
print("\nColumns:", column_names, sep="\n")

In [None]:
# Cell 3: number of distinct values per column, largest first.
print("=" * 50, "UNIQUE VALUES (Sorted):", sep="\n")
unique_values = data.nunique()
unique_sorted = unique_values.sort_values(ascending=False)
print(unique_sorted)

In [None]:
# Cell 4: per-column dtypes and non-null counts.
print("=" * 50, "DATA INFO:", sep="\n")
data.info(show_counts=True, verbose=True)  # force the full per-column listing

In [None]:
# Cell 5: summarise every object/category column.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns

print("=" * 50, "CATEGORICAL FEATURES SUMMARY:", sep="\n")
for col in categorical_columns:
    column_values = data[col]
    print(f"\n{col}:")
    print(f"Unique count: {column_values.nunique()}")
    # Show at most five distinct values so wide columns stay readable.
    print("Sample values:", column_values.unique()[:5])

In [None]:
# Cell 6: count missing values per column, worst column first.
print("=" * 50, "MISSING VALUES:", sep="\n")
missing_per_column = data.isnull().sum()
print(missing_per_column.sort_values(ascending=False))

In [None]:
# Cell 7: descriptive statistics, one row per numeric feature.
print("=" * 50, "NUMERICAL FEATURES SUMMARY:", sep="\n")
numeric_summary = data.describe().transpose()
print(numeric_summary.round(2))

In [None]:
# Cell 1: correlation heatmap of the numeric columns (saved to disk only).
import matplotlib.pyplot as plt
import seaborn as sns

numeric_data = data.select_dtypes(include=['number'])
# Compute the correlation matrix once and reuse it for both the plotted values
# and the mask instead of recomputing it for each argument.
correlation_matrix = numeric_data.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    mask=correlation_matrix.abs() < 0.3,  # hide cells whose |r| is below 0.3
    vmin=-1, vmax=1,
    linewidths=0.5
)
plt.title("Korelasi Numerik (|r| > 0.3)", pad=20, fontsize=14)
plt.tight_layout()
plt.savefig('numeric_correlation.png')  # stored for the dashboard
plt.close()  # prevent the notebook from auto-displaying the figure

In [None]:
# Cell 2: data quality check; results are kept in variables for later reuse.
missing_data = data.isnull().sum().sort_values(ascending=False)
duplicate_count = data.duplicated().sum()

# Only report missing values when at least one column actually has some.
columns_with_gaps = missing_data[missing_data > 0]
if not columns_with_gaps.empty:
    print("PERINGATAN: Terdapat missing values")
    print(columns_with_gaps)

duplicate_message = (
    f"\nPERINGATAN: Terdapat {duplicate_count} duplikat data"
    if duplicate_count > 0
    else "\nData quality check: Tidak ditemukan duplikat"
)
print(duplicate_message)

In [None]:
# Cell 3: share of each Attrition value plus a count plot saved to disk.
attrition_rate = data['Attrition'].value_counts(normalize=True)
print(f"Attrition Rate: {attrition_rate[1]:.1%}")

attrition_fig, attrition_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Attrition', data=data, palette='viridis', ax=attrition_ax)
attrition_ax.set_title('Distribusi Attrition')
attrition_fig.savefig('attrition_dist.png')
plt.close(attrition_fig)  # keep the notebook from rendering the figure inline

In [None]:
# Bundle the exploratory results computed in the cells above for later reuse.
eda_results = dict(
    correlation_matrix=numeric_data.corr(),
    missing_data=missing_data,
    duplicate_count=duplicate_count,
    attrition_rate=attrition_rate[1],
)

## Data Preparation / Preprocessing

In [None]:
# Cell 1: drop incomplete rows and the redundant columns.
# The original shape is kept so the effect of the cleaning can be reported.
initial_shape = data.shape
dropped_cols = ['EmployeeCount', 'Over18', 'StandardHours']

complete_rows = data.dropna()
data_clean = complete_rows.drop(columns=dropped_cols)

print(f"Data cleaned: {initial_shape} -> {data_clean.shape}")
print(f"Columns dropped: {dropped_cols}")

In [None]:
# Cell 2: turn the numeric target codes into 'Yes'/'No' category labels.
attrition_code_to_label = {1: 'Yes', 0: 'No'}
attrition_as_text = data_clean['Attrition'].map(attrition_code_to_label)
data_clean['Attrition'] = attrition_as_text.astype('category')

In [None]:
# Cell 3: replace ordinal codes with their labels as ORDERED categories.
# A plain 'category' dtype sorts its levels alphabetically (e.g. 'High' before
# 'Low'), which scrambles the natural order in grouped tables and charts.
# Declaring the categories in code order keeps Low -> ... -> Very High intact.
ordinal_mappings = {
    'Education': {1: 'Below College', 2: 'College', 3: 'Bachelor', 4: 'Master', 5: 'Doctor'},
    'EnvironmentSatisfaction': {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'},
    'JobSatisfaction': {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'},
    'PerformanceRating': {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'},
    'WorkLifeBalance': {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'}
}

for col, mapping in ordinal_mappings.items():
    # The dicts are written in ascending code order, so their values already
    # list the labels from lowest to highest level.
    ordered_dtype = pd.CategoricalDtype(categories=list(mapping.values()), ordered=True)
    data_clean[col] = data_clean[col].map(mapping).astype(ordered_dtype)

In [None]:
# Cell 4: store the nominal text columns with the 'category' dtype.
cat_cols = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]
for nominal_col in cat_cols:
    data_clean[nominal_col] = data_clean[nominal_col].astype('category')

In [None]:
# Cell 5: quick summary of the cleaned data, categorical then numerical.
categorical_overview = data_clean.describe(include='category').transpose()
print("\n=== Categorical Features ===")
print(categorical_overview)

numerical_overview = data_clean.describe().round(2)
print("\n=== Numerical Features ===")
print(numerical_overview)

In [None]:
# Cell 6: relative frequency of each target class in the cleaned data.
target_labels = data_clean['Attrition']
attrition_dist = target_labels.value_counts(normalize=True)
print(f"\nAttrition Distribution:\n{attrition_dist}")

In [None]:
# Cell 7: persist the cleaned data (without the index) for visualisation.
clean_csv_path = 'hr_data_clean.csv'
data_clean.to_csv(clean_csv_path, index=False)
print("\nData cleaned saved for visualization!")

In [None]:
# Cell 8: example label encoding for ML models (if needed).
from sklearn.preprocessing import LabelEncoder

# One encoder per column: reusing a single LabelEncoder refits it on every
# column, so only the last column's class mapping would survive and earlier
# columns could no longer be decoded with inverse_transform.
data_ml = data_clean.copy()
ml_label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    data_ml[col] = le.fit_transform(data_ml[col])
    ml_label_encoders[col] = le  # keep the fitted encoder for decoding later

In [None]:
# Cell 1: number of distinct employee ids in the raw data.
total_employees = data['EmployeeId'].nunique()
print(f"Total Employees: {total_employees:,}")

In [None]:
# Count employees per Attrition status (stayed vs. left).
# The cleaning step maps the codes 1/0 to 'Yes'/'No', which indicates that the
# raw column is numeric. Numeric values never match the 'Yes'/'No' keys of the
# colour map below, so they are converted to labels first; a column that
# already holds labels is used unchanged.
attrition_status = data['Attrition']
if pd.api.types.is_numeric_dtype(attrition_status):
    attrition_status = attrition_status.map({1: 'Yes', 0: 'No'})

attrition_comparison = attrition_status.value_counts().reset_index()
attrition_comparison.columns = ['Attrition', 'Count']

# Interactive bar chart built with Plotly.
fig = px.bar(attrition_comparison,
             x='Attrition',
             y='Count',
             color='Attrition',
             color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
             labels={'Attrition': 'Attrition Status', 'Count': 'Number of Employees'},
             title="Perbandingan Karyawan Aktif dan Keluar")

# Display the chart.
fig.show()

In [None]:
# Cell 2: Consolidated Visualization Function
def plot_attrition_comparison(df, group_col, title):
    """Plot and save a grouped bar chart of employee counts by attrition.

    Args:
        df: DataFrame containing ``group_col`` and an ``Attrition`` column.
            Numeric attrition codes (1/0) are shown as 'Yes'/'No' so that the
            discrete colour map applies; non-numeric values are used as-is.
        group_col: Name of the column placed on the x-axis.
        title: Readable name appended to "Attrition by " in the chart title.

    Returns:
        The Plotly figure. It is also written to
        ``attrition_by_<group_col in lower case>.png``.
    """
    attrition_status = df['Attrition']
    if pd.api.types.is_numeric_dtype(attrition_status):
        # Numeric codes would never match the 'Yes'/'No' colour-map keys.
        attrition_status = attrition_status.map({1: 'Yes', 0: 'No'})
    labeled = df.assign(Attrition=attrition_status)

    counts = labeled.groupby([group_col, 'Attrition']).size().reset_index(name='Count')
    fig = px.bar(
        counts,
        x=group_col,
        y='Count',
        color='Attrition',
        barmode='group',
        color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
        title=f"Attrition by {title}",
        labels={'Count': 'Number of Employees'}
    )
    fig.write_image(f"attrition_by_{group_col.lower()}.png")  # exported for Tableau
    return fig

# Build and save one comparison chart per grouping column.
# Keys are column names; values are the readable names used in chart titles.
comparison_plots = dict(
    Gender="Gender",
    BusinessTravel="Business Travel Frequency",
    Department="Department",
    JobRole="Job Role",
)

for col in comparison_plots:
    title = comparison_plots[col]
    plot_attrition_comparison(data, col, title)

In [None]:
# Cell 3: age distribution split by attrition status (saved as an image).
# The cleaning step maps the codes 1/0 to 'Yes'/'No', which indicates that the
# raw column is numeric; convert it so the 'Yes'/'No' colour map applies.
# Rows without a known status cannot be assigned to either group.
age_attrition_status = data['Attrition']
if pd.api.types.is_numeric_dtype(age_attrition_status):
    age_attrition_status = age_attrition_status.map({1: 'Yes', 0: 'No'})
age_plot_data = data.assign(Attrition=age_attrition_status).dropna(subset=['Attrition'])

px.histogram(
    age_plot_data,
    x='Age',
    color='Attrition',
    nbins=20,
    barmode='overlay',
    title='Age Distribution by Attrition Status',
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'}
).write_image("age_attrition_dist.png")  # exported for Tableau

In [None]:
# Cell 4: distance from home compared across attrition status (saved as image).
# The cleaning step maps the codes 1/0 to 'Yes'/'No', which indicates that the
# raw column is numeric; convert it so the 'Yes'/'No' colour map applies.
# Rows without a known status cannot be assigned to either box.
distance_attrition_status = data['Attrition']
if pd.api.types.is_numeric_dtype(distance_attrition_status):
    distance_attrition_status = distance_attrition_status.map({1: 'Yes', 0: 'No'})
distance_plot_data = data.assign(Attrition=distance_attrition_status).dropna(subset=['Attrition'])

px.box(
    distance_plot_data,
    x='Attrition',
    y='DistanceFromHome',
    color='Attrition',
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    title='Distance from Home vs Attrition'
).write_image("distance_attrition.png")

In [None]:
import plotly.express as px
from scipy.stats import ttest_ind

# Every step below compares the 'Yes' and 'No' attrition groups. The cleaning
# step maps the codes 1/0 to 'Yes'/'No', which indicates that the raw column is
# numeric; comparing numeric codes with 'Yes'/'No' selects nothing (empty
# t-test samples, empty rate charts). Work on a labelled copy instead; a
# column that already holds labels is used unchanged. Rows without a known
# status belong to neither group and are left out.
attrition_status = data['Attrition']
if pd.api.types.is_numeric_dtype(attrition_status):
    attrition_status = attrition_status.map({1: 'Yes', 0: 'No'})
labeled_data = data.assign(Attrition=attrition_status).dropna(subset=['Attrition'])

# Monthly income statistics per attrition status.
salary_summary = labeled_data.groupby('Attrition')['MonthlyIncome'].describe()
display(salary_summary)

# Histogram of MonthlyIncome split by attrition status.
fig = px.histogram(
    labeled_data,
    x='MonthlyIncome',
    color='Attrition',
    barmode='overlay',
    title="Monthly Income Distribution by Attrition Status",
    labels={'Attrition': 'Attrition', 'MonthlyIncome': 'Monthly Income'},
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'}
)
fig.show()

# Welch's t-test (unequal variances) on MonthlyIncome: leavers vs. stayers.
income_yes = labeled_data.loc[labeled_data['Attrition'] == 'Yes', 'MonthlyIncome']
income_no = labeled_data.loc[labeled_data['Attrition'] == 'No', 'MonthlyIncome']
t_stat, p_value = ttest_ind(income_yes, income_no, equal_var=False)

print(f"T-Statistic: {t_stat:.4f}, P-Value: {p_value:.4e}")

# Interactive scatter plot of MonthlyIncome against HourlyRate.
fig = px.scatter(
    labeled_data,
    x='MonthlyIncome',
    y='HourlyRate',
    color='Attrition',
    symbol='Department',
    title="Relationship between Monthly Income and Hourly Rate by Attrition Status and Department",
    labels={'MonthlyIncome': 'Monthly Income', 'HourlyRate': 'Hourly Rate'}
)
fig.show()

# Employee counts per education field and attrition status.
attrition_by_education = (
    labeled_data.groupby(['EducationField', 'Attrition'])
        .size()
        .reset_index(name='Count')
)

# Add the per-field total and each status' share of that total.
attrition_by_education['Total'] = attrition_by_education.groupby('EducationField')['Count'].transform('sum')
attrition_by_education['AttritionRate'] = attrition_by_education['Count'] / attrition_by_education['Total']

# Keep only the rows describing employees who left.
attrition_yes = attrition_by_education.query("Attrition == 'Yes'")

# Bar chart of the attrition rate per education field.
fig = px.bar(
    attrition_yes,
    x='EducationField',
    y='AttritionRate',
    color='EducationField',
    title='Attrition Rate by Education Field',
    labels={'AttritionRate': 'Attrition Rate', 'EducationField': 'Education Field'},
    color_discrete_map={
        'Life Sciences': '#FF6347',
        'Other': '#4682B4',
        'Medical': '#FFD700',
        'Marketing': '#32CD32',
        'Technical Degree': '#8A2BE2',
        'Human Resources': '#FF4500'
    }
)
fig.show()

# Employee counts per overtime status and attrition status.
attrition_overtime = (
    labeled_data.groupby(['OverTime', 'Attrition'])
        .size()
        .reset_index(name='Count')
)

# Add the per-overtime-group total and each status' share of that total.
attrition_overtime['Total'] = attrition_overtime.groupby('OverTime')['Count'].transform('sum')
attrition_overtime['AttritionRate'] = attrition_overtime['Count'] / attrition_overtime['Total']

# Keep only the rows describing employees who left.
attrition_yes_overtime = attrition_overtime.query("Attrition == 'Yes'")

# Bar chart of the attrition rate per overtime status.
fig = px.bar(
    attrition_yes_overtime,
    x='OverTime',
    y='AttritionRate',
    color='OverTime',
    title='Attrition Rate by Overtime Status',
    labels={'AttritionRate': 'Attrition Rate', 'OverTime': 'Overtime Status'},
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'}
)
fig.show()

# Show the full overtime table (both attrition statuses).
display(attrition_overtime)

In [None]:
# Attrition rate per work-life-balance level.
# Count first and derive the rate only after checking that a 'Yes' column
# exists: computing x['Yes'] before the check would raise a KeyError and make
# the fallback branch below unreachable.
work_life_counts = (
    data_clean.groupby(['WorkLifeBalance', 'Attrition'], observed=True)
    .size()
    .unstack(fill_value=0)  # a missing level/status combination counts as 0
)
# The unstacked column labels can inherit the categorical dtype of
# 'Attrition'; plain string labels make adding 'Total' and 'Attrition_Rate'
# safe regardless of the pandas version.
work_life_counts.columns = work_life_counts.columns.astype(str)

if 'Yes' in work_life_counts.columns:
    work_life_attrition = work_life_counts.assign(
        Total=lambda x: x.sum(axis=1),
        Attrition_Rate=lambda x: x['Yes'] / x['Total']
    )
    fig = px.bar(
        work_life_attrition.reset_index(),  # turn the index into a column
        x='WorkLifeBalance',
        y='Attrition_Rate',
        labels={'WorkLifeBalance': 'Work-Life Balance', 'Attrition_Rate': 'Attrition Rate'},
        title="Attrition Rate by Work-Life Balance",
        color='WorkLifeBalance',
        color_discrete_map={
            'Low': '#FF6347',
            'Good': '#4682B4',
            'Excellent': '#32CD32',
            'Outstanding': '#FFD700'
        },
        text_auto='.1%'  # print the percentage on each bar
    )
    fig.update_layout(yaxis_tickformat='.0%')  # format the y-axis as percentages
    fig.show()
else:
    # No employee with Attrition 'Yes': keep the raw counts and report it.
    work_life_attrition = work_life_counts
    print("Tidak ada data Attrition 'Yes' yang ditemukan")

In [None]:
# Employee counts per overtime status and attrition status.
# The cleaning step maps the codes 1/0 to 'Yes'/'No', which indicates that the
# raw column is numeric; convert it so the 'Yes'/'No' colour map applies. A
# column that already holds labels is used unchanged.
overtime_attrition_status = data['Attrition']
if pd.api.types.is_numeric_dtype(overtime_attrition_status):
    overtime_attrition_status = overtime_attrition_status.map({1: 'Yes', 0: 'No'})

overtime_attrition = (
    data.assign(Attrition=overtime_attrition_status)
    .groupby(['OverTime', 'Attrition'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart of the counts per overtime status.
fig = px.bar(
    overtime_attrition,
    x='OverTime',
    y='Count',
    color='Attrition',
    title="Attrition Based on Overtime",
    labels={'OverTime': 'Overtime', 'Attrition': 'Attrition', 'Count': 'Count'},
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    barmode='group'
)
fig.show()


In [None]:
# Share of employees who left, per Environment Satisfaction level.
env_groups = data_clean.groupby('EnvironmentSatisfaction', observed=True)
env_status_shares = env_groups['Attrition'].value_counts(normalize=True)
env_share_table = env_status_shares.unstack().fillna(0)

# Only the 'Yes' share is plotted.
env_satisfaction_attrition_rate = env_share_table['Yes']

# Bar chart of the attrition rate for each satisfaction level.
env_bar_options = dict(
    x=env_satisfaction_attrition_rate.index,
    y=env_satisfaction_attrition_rate.values,
    labels={'x': 'Environment Satisfaction', 'y': 'Attrition Rate'},
    title="Attrition Rate by Environment Satisfaction",
)
fig = px.bar(env_satisfaction_attrition_rate, **env_bar_options)
fig.show()


In [None]:
# Share of employees who left, per Job Satisfaction level.
job_groups = data_clean.groupby('JobSatisfaction', observed=True)
job_status_shares = job_groups['Attrition'].value_counts(normalize=True)
job_share_table = job_status_shares.unstack().fillna(0)

# Only the 'Yes' share is plotted.
job_satisfaction_attrition_rate = job_share_table['Yes']

# Bar chart of the attrition rate for each satisfaction level.
job_bar_options = dict(
    x=job_satisfaction_attrition_rate.index,
    y=job_satisfaction_attrition_rate.values,
    labels={'x': 'Job Satisfaction', 'y': 'Attrition Rate'},
    title="Attrition Rate by Job Satisfaction",
)
fig = px.bar(job_satisfaction_attrition_rate, **job_bar_options)
fig.show()


In [None]:
# Histogram of YearsAtCompany split by attrition status.
# The cleaning step maps the codes 1/0 to 'Yes'/'No', which indicates that the
# raw column is numeric; convert it so the 'Yes'/'No' colour map applies.
# Rows without a known status cannot be assigned to either group.
tenure_attrition_status = data['Attrition']
if pd.api.types.is_numeric_dtype(tenure_attrition_status):
    tenure_attrition_status = tenure_attrition_status.map({1: 'Yes', 0: 'No'})
tenure_plot_data = data.assign(Attrition=tenure_attrition_status).dropna(subset=['Attrition'])

fig = px.histogram(
    tenure_plot_data,
    x='YearsAtCompany',
    color='Attrition',
    barmode='overlay',
    title="Distribution of Work Experience by Attrition",
    labels={'YearsAtCompany': 'Years at Company', 'Attrition': 'Attrition'},
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    nbins=15,
    histfunc='count'
)
fig.show()


In [None]:
# Employee counts per TotalWorkingYears value and attrition status.
counts_by_experience = data_clean.groupby(
    ['TotalWorkingYears', 'Attrition'], observed=True
).size()
# One column per attrition status; combinations that never occur count as 0.
attrition_comparison = counts_by_experience.unstack().fillna(0)

# Line plot with one line per attrition status.
experience_axis_labels = {
    'TotalWorkingYears': 'Total Working Years',
    'value': 'Count',
    'variable': 'Attrition',
}
fig = px.line(
    attrition_comparison,
    x=attrition_comparison.index,
    y=['Yes', 'No'],
    labels=experience_axis_labels,
    title="Attrition Comparison by Work Experience (Total Working Years)",
    markers=True,
)
fig.show()


In [None]:
# Employee counts per Education level and attrition status.
education_groups = data_clean.groupby('Education', observed=True)
education_status_counts = education_groups['Attrition'].value_counts()
# One column per attrition status; combinations that never occur count as 0.
attrition_counts_education = education_status_counts.unstack().fillna(0)

# Grouped bar chart with one bar per attrition status.
education_axis_labels = {
    'Education': 'Education Level',
    'value': 'Count',
    'variable': 'Attrition',
}
fig = px.bar(
    attrition_counts_education,
    x=attrition_counts_education.index,
    y=['Yes', 'No'],
    labels=education_axis_labels,
    title="Attrition Count by Education Level",
    color='variable',
    color_discrete_map={'Yes': '#FF6347', 'No': '#4682B4'},
    barmode='group',
)
fig.show()


In [None]:
# Write the current state of the raw frame to disk, without the index column.
raw_export_path = 'employee.csv'
data.to_csv(raw_export_path, index=False)

## Modeling

In [None]:
# Print column dtypes and non-null counts of the frame used for modelling.
data.info()

In [None]:
# Remove the identifier column before modelling.
# errors='ignore' keeps the cell re-runnable: the drop is in place, so a
# second run would otherwise fail with a KeyError because the column is gone.
data.drop(columns='EmployeeId', inplace=True, errors='ignore')
data

In [None]:
# Metadata mapping category labels to their numeric codes.
mapping_metadata_to_numeric = {
    "Attrition": {"No": 0, "Yes": 1},
    "Education": {"Below College": 1, "College": 2, "Bachelor": 3, "Master": 4, "Doctor": 5},
    "EnvironmentSatisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "JobInvolvement": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "JobSatisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "PerformanceRating": {"Low": 1, "Good": 2, "Excellent": 3, "Outstanding": 4},
    "RelationshipSatisfaction": {"Low": 1, "Medium": 2, "High": 3, "Very High": 4},
    "WorkLifeBalance": {"Low": 1, "Good": 2, "Excellent": 3, "Outstanding": 4}
}

# Convert label columns to numeric codes.
for col, mapping in mapping_metadata_to_numeric.items():
    if col not in data.columns:
        continue
    if pd.api.types.is_numeric_dtype(data[col]):
        # Already numeric codes: pushing them through a label-keyed dict would
        # turn every value into NaN and make the integer cast below fail.
        continue
    encoded = data[col].map(mapping)
    # A value that was present but has no entry in the mapping is a data error;
    # fail loudly instead of silently turning it into a missing value.
    unmapped = data.loc[data[col].notna() & encoded.isna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped labels in column '{col}': {list(unmapped)}")
    data[col] = encoded

# Integer columns cannot hold NaN, so rows with a missing value in any of the
# mapped columns are removed first (the cleaning step drops incomplete rows too).
mapped_cols = [c for c in mapping_metadata_to_numeric if c in data.columns]
rows_before = len(data)
data.dropna(subset=mapped_cols, inplace=True)
if len(data) != rows_before:
    print(f"Dropped {rows_before - len(data)} rows with missing values in {mapped_cols}")
for col in mapped_cols:
    data[col] = data[col].astype("int")

# Inspect the frame after the conversion.
data

In [None]:
# Nominal columns that are converted to integer codes.
columns = [
    'BusinessTravel', 'Department', 'EducationField', 'Gender',
    'JobRole', 'MaritalStatus', 'OverTime',
]

# Fitted encoders, keyed by column name.
label_encoders = {}

# Encode each listed column that is present in the frame.
for col in columns:
    if col not in data.columns:
        continue
    le = LabelEncoder().fit(data[col])
    data[col] = le.transform(data[col])
    label_encoders[col] = le  # kept for future reference

# Inspect the frame after the transformation.
data

In [None]:
# Separate the target column from the feature columns.
target_column = 'Attrition'
x = data.drop(columns=target_column)
y = data[target_column]

In [None]:
# Split the data into an 80% training set and a 20% test set.
# stratify=y keeps the class proportions of the target the same in both
# splits; with an imbalanced target a purely random split can leave the test
# set with a noticeably different share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Learn the min-max scaling from the training data only.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the same fitted scaling to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Show the dimensions of the training split, then of the test split.
for feature_split in (X_train, X_test):
    print(feature_split.shape)

In [None]:
# AdaBoost over depth-1 random forests.
import inspect

# scikit-learn renamed AdaBoost's `base_estimator` argument to `estimator`;
# newer releases reject the old name and older ones do not know the new one.
# Pick whichever name the installed version accepts.
_ada_parameters = inspect.signature(AdaBoostClassifier).parameters
_estimator_kwarg = 'estimator' if 'estimator' in _ada_parameters else 'base_estimator'

_boost_kwargs = {
    _estimator_kwarg: RandomForestClassifier(max_depth=1, random_state=42),
    'n_estimators': 500,
    'learning_rate': 0.01,
    # Fixed seed (same value as the train/test split) so that refitting gives
    # the same model.
    'random_state': 42,
}
# NOTE(review): the `algorithm` argument appears to be deprecated in recent
# scikit-learn releases; only pass it while the installed version accepts it.
if 'algorithm' in _ada_parameters:
    _boost_kwargs['algorithm'] = 'SAMME'

boost = AdaBoostClassifier(**_boost_kwargs)

boost.fit(X_train, y_train)

In [None]:
# Predict the target for every row of the held-out test features.
predictions = boost.predict(X_test)

## Evaluation

In [None]:
# Model evaluation: fraction of test rows that were predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Akurasi Model: {accuracy:.2f}")

In [None]:
# Classification report for the test-set predictions.
report_text = classification_report(y_test, predictions)
print("\nLaporan Klasifikasi:", report_text, sep="\n")

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
conf_matrix = confusion_matrix(y_test, predictions)

In [None]:
# Visualise the confusion matrix.
# Tick labels must follow the row/column order of the matrix, which is the
# sorted union of the labels found in y_test and in the predictions. A bare
# set(y_test) has no guaranteed order and misses classes that only appear in
# the predictions, so the ticks could be mislabelled.
class_labels = sorted(set(y_test) | set(predictions))

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_labels, yticklabels=class_labels)
plt.title("Confusion Matrix")
plt.xlabel("Prediksi")
plt.ylabel("Aktual")
plt.show()

In [None]:
# Persist the fitted classifier to disk, then confirm.
model_path = 'adaboost_model.pkl'
joblib.dump(boost, model_path)

print("Model saved successfully!")