In [None]:
# EDA.ipynb

# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ----- CONTROL FLAG -----
# Set to True if you want to overwrite the processed CSVs
# Set to False if you want to keep the current X_train/X_test split unchanged
# NOTE(review): nothing in this section reads the flag; presumably a later
# preprocessing/saving step consumes it — confirm before relying on it.
overwrite_files = True  # <<< Currently True (overwrite enabled); switch to False unless you intend to re-run preprocessing

# Load the dataset into the DataFrame that the rest of the analysis reads.
df = pd.read_csv('full summary.csv')

# Basic overview: first rows, column dtypes / non-null counts, summary stats.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# Class balance: number of rows for each value of the target column.
sns.countplot(data=df, x='ever_terminated_flag').set_title(
    'Target Variable Distribution'
)
plt.show()

# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap on a dedicated 12x10 figure.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
).set_title('Correlation Matrix')
plt.show()

# Additional EDA plots (example)

# Histogram of the age column (30 bins) with a KDE curve overlaid.
sns.histplot(df['age'], kde=True, bins=30).set_title('Age Distribution')
plt.show()

# Spread of base_salary for each value of ever_terminated_flag.
sns.boxplot(
    data=df,
    x='ever_terminated_flag',
    y='base_salary',
).set_title('Base Salary vs Termination Status')
plt.show()

# Row counts per department, split by ever_terminated_flag.
sns.countplot(
    data=df,
    x='department',
    hue='ever_terminated_flag',
).set_title('Department vs Termination Status')
plt.xticks(rotation=45)  # tilt the x tick labels by 45 degrees
plt.show()
