In [2]:
"""
Employee Attrition Analysis
This script implements the execution plan for analyzing employee attrition data and building a predictive model.
"""

# Import required libraries
import pandas as pd
import numpy as np
import altair as alt
from pycaret.classification import *
import os

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Configure Altair
alt.data_transformers.enable('default')
alt.theme.enable('default')

ThemeRegistry.enable('default')

## 1. Setup and Directory Creation

In [3]:
def create_directories():
    """Create necessary directories if they don't exist."""
    directories = ['models', 'plots']
    for directory in directories:
        os.makedirs(directory, exist_ok=True)

# Create directories
create_directories()

## 2. Data Loading and Validation

In [5]:
# Load data
df = pd.read_csv('data/employee_data.csv')

In [16]:
df.shape

(1470, 35)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   EmployeeId                1470 non-null   int64  
 1   Age                       1470 non-null   int64  
 2   Attrition                 1058 non-null   float64
 3   BusinessTravel            1470 non-null   object 
 4   DailyRate                 1470 non-null   int64  
 5   Department                1470 non-null   object 
 6   DistanceFromHome          1470 non-null   int64  
 7   Education                 1470 non-null   int64  
 8   EducationField            1470 non-null   object 
 9   EmployeeCount             1470 non-null   int64  
 10  EnvironmentSatisfaction   1470 non-null   int64  
 11  Gender                    1470 non-null   object 
 12  HourlyRate                1470 non-null   int64  
 13  JobInvolvement            1470 non-null   int64  
 14  JobLevel

In [7]:
df.isna().sum()

EmployeeId                    0
Age                           0
Attrition                   412
BusinessTravel                0
DailyRate                     0
Department                    0
DistanceFromHome              0
Education                     0
EducationField                0
EmployeeCount                 0
EnvironmentSatisfaction       0
Gender                        0
HourlyRate                    0
JobInvolvement                0
JobLevel                      0
JobRole                       0
JobSatisfaction               0
MaritalStatus                 0
MonthlyIncome                 0
MonthlyRate                   0
NumCompaniesWorked            0
Over18                        0
OverTime                      0
PercentSalaryHike             0
PerformanceRating             0
RelationshipSatisfaction      0
StandardHours                 0
StockOptionLevel              0
TotalWorkingYears             0
TrainingTimesLastYear         0
WorkLifeBalance               0
YearsAtC

In [11]:
# Check for duplicate rows
duplicates = df.duplicated()
duplicate_counts = duplicates.value_counts()

# Check for duplicate values in each column and store in a Series
duplicate_by_column = pd.Series({
    column: df[column].duplicated().sum() 
    for column in df.columns 
    if df[column].duplicated().sum() > 0
})

# Display results using display()
display(pd.DataFrame({
    'Duplicate Rows': duplicate_counts,
    'Duplicate Values by Column': duplicate_by_column
}))

Unnamed: 0,Duplicate Rows,Duplicate Values by Column
False,1470.0,
Age,,1427.0
Attrition,,1467.0
BusinessTravel,,1467.0
DailyRate,,584.0
Department,,1467.0
DistanceFromHome,,1441.0
Education,,1465.0
EducationField,,1464.0
EmployeeCount,,1469.0


## 3. Data Cleaning

In [24]:
def clean_data(df):
    """Clean the dataset by handling missing values and adjusting data types."""
    print("\nCleaning data...")
    df_clean = df.copy()
    
    # Handle duplicates
    print("\nChecking for duplicates...")
    # Check for duplicate rows
    duplicate_rows = df_clean.duplicated().sum()
    if duplicate_rows > 0:
        print(f"Found {duplicate_rows} duplicate rows. Removing them...")
        df_clean = df_clean.drop_duplicates()
    
    # Check for duplicate values in each column
    duplicate_by_column = pd.Series({
        column: df_clean[column].duplicated().sum() 
        for column in df_clean.columns 
        if df_clean[column].duplicated().sum() > 0
    })
    if not duplicate_by_column.empty:
        print("\nDuplicate values by column:")
        print(duplicate_by_column)
    
    # Handle Attrition column
    print("\nAttrition values before cleaning:")
    print(df_clean['Attrition'].value_counts(dropna=False))
    
    # If Attrition is not already numeric, convert it
    if df_clean['Attrition'].dtype == 'object':
        df_clean['Attrition'] = df_clean['Attrition'].map({'Yes': 1, 'No': 0})
    
    # Fill missing values with 0 (assuming No as default)
    df_clean['Attrition'] = df_clean['Attrition'].fillna(0)
    
    print("\nAttrition values after cleaning:")
    print(df_clean['Attrition'].value_counts(dropna=False))
    
    # Convert categorical columns to appropriate type
    categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 
                       'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
    for col in categorical_cols:
        df_clean[col] = df_clean[col].astype('category')
    
    # Convert ordinal columns to appropriate type (1-4 or 1-5 scale)
    ordinal_cols = {
        'Education': 'int8',  # 1-Below College to 5-Doctor
        'EnvironmentSatisfaction': 'int8',  # 1-Low to 4-Very High
        'JobInvolvement': 'int8',  # 1-Low to 4-Very High
        'JobLevel': 'int8',  # 1 to 5
        'JobSatisfaction': 'int8',  # 1-Low to 4-Very High
        'PerformanceRating': 'int8',  # 1-Low to 4-Outstanding
        'RelationshipSatisfaction': 'int8',  # 1-Low to 4-Very High
        'StockOptionLevel': 'int8',  # 0 to 3
        'WorkLifeBalance': 'int8'  # 1-Low to 4-Outstanding
    }
    for col, dtype in ordinal_cols.items():
        df_clean[col] = df_clean[col].astype(dtype)
    
    # Convert numeric columns to appropriate types
    df_clean['Age'] = df_clean['Age'].astype('int8')
    df_clean['DailyRate'] = df_clean['DailyRate'].astype('int16')
    df_clean['HourlyRate'] = df_clean['HourlyRate'].astype('int16')
    df_clean['MonthlyIncome'] = df_clean['MonthlyIncome'].astype('int32')
    df_clean['MonthlyRate'] = df_clean['MonthlyRate'].astype('int32')
    df_clean['PercentSalaryHike'] = df_clean['PercentSalaryHike'].astype('int8')
    df_clean['StandardHours'] = df_clean['StandardHours'].astype('int8')
    df_clean['TrainingTimesLastYear'] = df_clean['TrainingTimesLastYear'].astype('int8')
    
    # Convert years-related columns to appropriate types
    years_cols = ['NumCompaniesWorked', 'TotalWorkingYears', 'YearsAtCompany',
                 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
    for col in years_cols:
        df_clean[col] = df_clean[col].astype('int8')
    
    return df_clean

# Clean data
df_clean = clean_data(df)


Cleaning data...

Checking for duplicates...

Duplicate values by column:
Age                         1427
Attrition                   1467
BusinessTravel              1467
DailyRate                    584
Department                  1467
DistanceFromHome            1441
Education                   1465
EducationField              1464
EmployeeCount               1469
EnvironmentSatisfaction     1466
Gender                      1468
HourlyRate                  1399
JobInvolvement              1466
JobLevel                    1465
JobRole                     1461
JobSatisfaction             1466
MaritalStatus               1467
MonthlyIncome                121
MonthlyRate                   43
NumCompaniesWorked          1460
Over18                      1469
OverTime                    1468
PercentSalaryHike           1455
PerformanceRating           1468
RelationshipSatisfaction    1466
StandardHours               1469
StockOptionLevel            1466
TotalWorkingYears           1430
T

In [26]:
df_clean.head(10)

Unnamed: 0,EmployeeId,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1,38,0.0,Travel_Frequently,1444,Human Resources,1,4,Other,1,4,Male,88,3,1,Human Resources,2,Married,2991,5224,0,Y,Yes,11,3,2,80,1,7,2,3,6,2,1,2
1,2,37,1.0,Travel_Rarely,1141,Research & Development,11,2,Medical,1,1,Female,61,1,2,Healthcare Representative,2,Married,4777,14382,5,Y,No,15,3,1,80,0,15,2,1,1,0,0,0
2,3,51,1.0,Travel_Rarely,1323,Research & Development,4,4,Life Sciences,1,1,Male,34,3,1,Research Scientist,3,Married,2461,10332,9,Y,Yes,12,3,3,80,3,18,2,4,10,0,2,7
3,4,42,0.0,Travel_Frequently,555,Sales,26,3,Marketing,1,3,Female,77,3,4,Sales Executive,2,Married,13525,14864,5,Y,No,14,3,4,80,1,23,2,4,20,4,4,8
4,5,40,0.0,Travel_Rarely,1194,Research & Development,2,4,Medical,1,3,Female,98,3,1,Research Scientist,3,Married,2001,12549,2,Y,No,14,3,2,80,3,20,2,3,5,3,0,2
5,6,29,0.0,Travel_Rarely,352,Human Resources,6,1,Medical,1,4,Male,87,2,1,Human Resources,2,Married,2804,15434,1,Y,No,11,3,4,80,0,1,3,3,1,0,0,0
6,7,40,0.0,Travel_Rarely,1124,Sales,1,2,Medical,1,2,Male,57,1,2,Sales Executive,4,Married,7457,13273,2,Y,Yes,22,4,3,80,3,6,2,2,4,3,0,2
7,8,55,1.0,Travel_Rarely,725,Research & Development,2,3,Medical,1,4,Male,78,3,5,Manager,1,Married,19859,21199,5,Y,Yes,13,3,4,80,1,24,2,3,5,2,1,4
8,9,36,0.0,Travel_Frequently,635,Research & Development,18,1,Medical,1,2,Female,73,3,1,Laboratory Technician,4,Single,2153,7703,1,Y,No,13,3,1,80,0,8,2,3,8,1,1,7
9,10,32,0.0,Travel_Rarely,1018,Research & Development,3,2,Life Sciences,1,3,Female,39,3,3,Research Director,4,Single,11159,19373,3,Y,No,15,3,4,80,0,10,6,3,7,7,7,7


## 4. Feature Engineering

In [27]:
def engineer_features(df):
    """Create new features from existing data."""
    print("\nEngineering features...")
    df_fe = df.copy()
    
    # Create age groups
    df_fe['AgeGroup'] = pd.cut(df_fe['Age'], 
                              bins=[0, 25, 35, 45, 55, 100],
                              labels=['18-25', '26-35', '36-45', '46-55', '55+'])
    
    # Calculate tenure ratios (handle division by zero)
    df_fe['TenureRatio'] = df_fe.apply(
        lambda x: x['YearsAtCompany'] / x['TotalWorkingYears'] if x['TotalWorkingYears'] > 0 else 0,
        axis=1
    )
    
    # Create satisfaction index
    satisfaction_cols = ['EnvironmentSatisfaction', 'JobSatisfaction', 
                        'RelationshipSatisfaction', 'WorkLifeBalance']
    df_fe['OverallSatisfaction'] = df_fe[satisfaction_cols].mean(axis=1)
    
    # Create salary-related features
    df_fe['SalaryToAgeRatio'] = df_fe['MonthlyIncome'] / df_fe['Age']
    df_fe['SalaryToTenureRatio'] = df_fe['MonthlyIncome'] / df_fe['YearsAtCompany'].replace(0, 1)
    
    # Create career progression features
    df_fe['PromotionRate'] = df_fe['YearsAtCompany'] / df_fe['YearsSinceLastPromotion'].replace(0, 1)
    df_fe['RoleStability'] = df_fe['YearsInCurrentRole'] / df_fe['YearsAtCompany'].replace(0, 1)
    
    # Create travel impact feature
    df_fe['TravelImpact'] = df_fe['BusinessTravel'].map({
        'Non-Travel': 0,
        'Travel_Rarely': 1,
        'Travel_Frequently': 2
    })
    
    print("\nNew features created:")
    print(df_fe[['AgeGroup', 'TenureRatio', 'OverallSatisfaction', 
                'SalaryToAgeRatio', 'PromotionRate', 'RoleStability']].head())
    
    return df_fe

# Engineer features
df_fe = engineer_features(df_clean)


Engineering features...

New features created:
  AgeGroup  TenureRatio  OverallSatisfaction  SalaryToAgeRatio  PromotionRate  \
0    36-45     0.857143                 2.75         78.710526            6.0   
1    36-45     0.066667                 1.25        129.108108            1.0   
2    46-55     0.555556                 2.75         48.254902            5.0   
3    36-45     0.869565                 3.25        322.023810            5.0   
4    36-45     0.250000                 2.75         50.025000            5.0   

   RoleStability  
0       0.333333  
1       0.000000  
2       0.000000  
3       0.200000  
4       0.600000  


In [28]:
df_fe.head(10)

Unnamed: 0,EmployeeId,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,AgeGroup,TenureRatio,OverallSatisfaction,SalaryToAgeRatio,SalaryToTenureRatio,PromotionRate,RoleStability,TravelImpact
0,1,38,0.0,Travel_Frequently,1444,Human Resources,1,4,Other,1,4,Male,88,3,1,Human Resources,2,Married,2991,5224,0,Y,Yes,11,3,2,80,1,7,2,3,6,2,1,2,36-45,0.857143,2.75,78.710526,498.5,6.0,0.333333,2
1,2,37,1.0,Travel_Rarely,1141,Research & Development,11,2,Medical,1,1,Female,61,1,2,Healthcare Representative,2,Married,4777,14382,5,Y,No,15,3,1,80,0,15,2,1,1,0,0,0,36-45,0.066667,1.25,129.108108,4777.0,1.0,0.0,1
2,3,51,1.0,Travel_Rarely,1323,Research & Development,4,4,Life Sciences,1,1,Male,34,3,1,Research Scientist,3,Married,2461,10332,9,Y,Yes,12,3,3,80,3,18,2,4,10,0,2,7,46-55,0.555556,2.75,48.254902,246.1,5.0,0.0,1
3,4,42,0.0,Travel_Frequently,555,Sales,26,3,Marketing,1,3,Female,77,3,4,Sales Executive,2,Married,13525,14864,5,Y,No,14,3,4,80,1,23,2,4,20,4,4,8,36-45,0.869565,3.25,322.02381,676.25,5.0,0.2,2
4,5,40,0.0,Travel_Rarely,1194,Research & Development,2,4,Medical,1,3,Female,98,3,1,Research Scientist,3,Married,2001,12549,2,Y,No,14,3,2,80,3,20,2,3,5,3,0,2,36-45,0.25,2.75,50.025,400.2,5.0,0.6,1
5,6,29,0.0,Travel_Rarely,352,Human Resources,6,1,Medical,1,4,Male,87,2,1,Human Resources,2,Married,2804,15434,1,Y,No,11,3,4,80,0,1,3,3,1,0,0,0,26-35,1.0,3.25,96.689655,2804.0,1.0,0.0,1
6,7,40,0.0,Travel_Rarely,1124,Sales,1,2,Medical,1,2,Male,57,1,2,Sales Executive,4,Married,7457,13273,2,Y,Yes,22,4,3,80,3,6,2,2,4,3,0,2,36-45,0.666667,2.75,186.425,1864.25,4.0,0.75,1
7,8,55,1.0,Travel_Rarely,725,Research & Development,2,3,Medical,1,4,Male,78,3,5,Manager,1,Married,19859,21199,5,Y,Yes,13,3,4,80,1,24,2,3,5,2,1,4,46-55,0.208333,3.0,361.072727,3971.8,5.0,0.4,1
8,9,36,0.0,Travel_Frequently,635,Research & Development,18,1,Medical,1,2,Female,73,3,1,Laboratory Technician,4,Single,2153,7703,1,Y,No,13,3,1,80,0,8,2,3,8,1,1,7,36-45,1.0,2.5,59.805556,269.125,8.0,0.125,2
9,10,32,0.0,Travel_Rarely,1018,Research & Development,3,2,Life Sciences,1,3,Female,39,3,3,Research Director,4,Single,11159,19373,3,Y,No,15,3,4,80,0,10,6,3,7,7,7,7,26-35,0.7,3.5,348.71875,1594.142857,1.0,1.0,1


## 5. Exploratory Data Analysis

In [29]:
def plot_attrition_by_category(df, column):
    """Plot attrition by category using Altair.
    
    Args:
        df (pd.DataFrame): Input dataframe
        column (str): Column name to plot
    
    Returns:
        alt.Chart: Altair chart object
    """
    try:
        # Validate input
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found in dataframe")
        
        # Create the chart
        chart = alt.Chart(df).mark_bar().encode(
            x=alt.X(f'{column}:N', title=column, sort='-y'),
            y=alt.Y('count()', title='Count'),
            color='Attrition:N',
            tooltip=['Attrition', 'count()']
        ).properties(
            title=f'Attrition by {column}',
            width=600,
            height=400
        ).interactive()
        
        return chart
        
    except Exception as e:
        print(f"Error creating plot: {str(e)}")
        return None

def create_correlation_heatmap(df):
    """Create correlation heatmap using Altair.
    
    Args:
        df (pd.DataFrame): Input dataframe
    
    Returns:
        alt.Chart: Altair chart object
    """
    try:
        # Get numeric columns
        numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
        if len(numeric_cols) == 0:
            raise ValueError("No numeric columns found in dataframe")
        
        # Calculate correlation matrix
        corr_matrix = df[numeric_cols].corr().reset_index().melt('index')
        
        # Create heatmap
        chart = alt.Chart(corr_matrix).mark_rect().encode(
            x='index:N',
            y='variable:N',
            color=alt.Color('value:Q', scale=alt.Scale(scheme='redblue')),
            tooltip=['index', 'variable', 'value']
        ).properties(
            title='Correlation Matrix',
            width=800,
            height=800
        ).interactive()
        
        return chart
        
    except Exception as e:
        print(f"Error creating correlation heatmap: {str(e)}")
        return None

def plot_satisfaction_distribution(df):
    """Create satisfaction distribution plot using Altair.
    
    Args:
        df (pd.DataFrame): Input dataframe
    
    Returns:
        alt.Chart: Altair chart object
    """
    try:
        # Validate required columns
        required_cols = ['OverallSatisfaction', 'Attrition']
        if not all(col in df.columns for col in required_cols):
            raise ValueError(f"Missing required columns: {required_cols}")
        
        # Create satisfaction distribution
        chart = alt.Chart(df).mark_boxplot().encode(
            y=alt.Y('OverallSatisfaction:Q', title='Overall Satisfaction'),
            x=alt.X('Attrition:N', title='Attrition Status'),
            color='Attrition:N',
            tooltip=['OverallSatisfaction', 'Attrition']
        ).properties(
            title='Satisfaction Distribution by Attrition Status',
            width=400,
            height=300
        ).interactive()
        
        return chart
        
    except Exception as e:
        print(f"Error creating satisfaction distribution plot: {str(e)}")
        return None

def perform_eda(df):
    """Perform exploratory data analysis using Altair.
    
    Args:
        df (pd.DataFrame): Input dataframe
    
    Returns:
        dict: Dictionary containing all created charts
    """
    print("\nPerforming exploratory data analysis...")
    
    # Dictionary to store all charts
    charts = {}
    
    # Plot attrition by various categories
    categories = ['Department', 'JobRole', 'AgeGroup']
    for category in categories:
        chart = plot_attrition_by_category(df, category)
        if chart:
            charts[f'attrition_by_{category.lower()}'] = chart
            display(chart)  # Display chart in notebook
    
    # Create correlation heatmap
    corr_chart = create_correlation_heatmap(df)
    if corr_chart:
        charts['correlation_matrix'] = corr_chart
        display(corr_chart)  # Display chart in notebook
    
    # Create satisfaction distribution
    sat_chart = plot_satisfaction_distribution(df)
    if sat_chart:
        charts['satisfaction_distribution'] = sat_chart
        display(sat_chart)  # Display chart in notebook
    
    return charts

In [31]:
df_fe['Department'].value_counts()

Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64

In [30]:
display(plot_attrition_by_category(df_fe, 'Department'))

In [32]:
display(create_correlation_heatmap(df_fe))

In [33]:
display(plot_satisfaction_distribution(df_fe))

## 6. Model Development

In [None]:
clf = setup(data=df_fe, 
            target='Attrition',
            numeric_features=['Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate', 
                            'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
                            'PercentSalaryHike', 'StandardHours', 'TotalWorkingYears',
                            'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
                            'YearsSinceLastPromotion', 'YearsWithCurrManager',
                            'TenureRatio', 'OverallSatisfaction', 'SalaryToAgeRatio',
                            'SalaryToTenureRatio', 'PromotionRate', 'RoleStability',
                            'TravelImpact'],
            categorical_features=['BusinessTravel', 'Department', 'EducationField',
                                'Gender', 'JobRole', 'MaritalStatus', 'Over18',
                                'OverTime', 'AgeGroup'],
            ordinal_features={
                'Education': [1, 2, 3, 4, 5],
                'EnvironmentSatisfaction': [1, 2, 3, 4],
                'JobInvolvement': [1, 2, 3, 4],
                'JobLevel': [1, 2, 3, 4, 5],
                'JobSatisfaction': [1, 2, 3, 4],
                'PerformanceRating': [1, 2, 3, 4],
                'RelationshipSatisfaction': [1, 2, 3, 4],
                'StockOptionLevel': [0, 1, 2, 3],
                'WorkLifeBalance': [1, 2, 3, 4]
            },
            normalize=True,
            feature_selection=True,
            silent=True,
            session_id=123,
            # Data splitting parameters
            train_size=0.7,  # 70% for training
            test_size=0.15,  # 15% for test (remaining 15% for validation)
            data_split_shuffle=True,  # Randomize the split
            data_split_stratify=True,  # Maintain target distribution in splits
            # Class imbalance handling
            fix_imbalance=True,  # Enable automatic handling of class imbalance
            fix_imbalance_method='smote')  # Use SMOTE for oversampling

In [None]:
# Compare models
best_model = compare_models()

In [None]:
# Tune the best model
tuned_model = tune_model(best_model)

# Finalize the model
final_model = finalize_model(tuned_model)

# Save the model
save_model(final_model, 'models/attrition_model')

# Generate predictions
predictions = predict_model(final_model)

# Create prediction distribution plot
pred_chart = alt.Chart(predictions).mark_bar().encode(
    x=alt.X('prediction_label:N', title='Predicted Attrition'),
    y=alt.Y('count()', title='Count'),
    color='Attrition:N',
    tooltip=['Attrition', 'prediction_label', 'count()']
).properties(
    title='Prediction Distribution',
    width=400,
    height=300
).interactive()

# Display the prediction distribution
display(pred_chart)

# Display model metrics
print("\nModel Performance Metrics:")
print(pull())

## 7. Model Evaluation

In [None]:
def evaluate_model(model, df):
    """Evaluate the model and generate plots."""
    print("\nEvaluating model...")
    
    # Generate predictions
    predictions = predict_model(model)
    
    # Create prediction distribution plot
    pred_chart = alt.Chart(predictions).mark_bar().encode(
        x=alt.X('prediction_label:N', title='Predicted Attrition'),
        y=alt.Y('count()', title='Count'),
        color='Attrition:N',
        tooltip=['Attrition', 'prediction_label', 'count()']
    ).properties(
        title='Prediction Distribution',
        width=400,
        height=300
    ).interactive()
    
    pred_chart.save('plots/prediction_distribution.html')
    
    # Display model metrics
    print("\nModel Performance Metrics:")
    print(pull())

# Evaluate model
evaluate_model(model, df_fe) 