# In [None]:
# HR Attrition Analysis Project

# 📌 Step 1: Import Libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 📌 Step 2: Load Dataset
# Read the HR employee-attrition CSV; the path is relative to the current
# working directory.
df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')

# A bare `df.shape, df.columns` expression is silently discarded unless it is
# the last statement of a notebook cell, so print the overview explicitly.
print("Shape:", df.shape)
print("Columns:", list(df.columns))

# 📌 Step 3: Basic Data Cleaning
# Remove columns that are not used as model features.
# NOTE(review): presumably these are identifiers / constant-valued columns —
# confirm against the data before relying on that.
df = df.drop(columns=['EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours'])

# Report missing values per column; a bare `df.isnull().sum()` expression in the
# middle of a cell/script is evaluated and thrown away.
print("Missing values per column:\n", df.isnull().sum())

# 📌 Step 4: EDA - Visualize Attrition Counts
# Bar chart with one bar per distinct value of the Attrition column.
_, attrition_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Attrition', ax=attrition_ax)
attrition_ax.set_title('Attrition Count')
plt.show()

# 📌 Step 5: EDA - Attrition by Department
# Grouped bar chart: row counts per Department, split by Attrition value.
_, department_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Department', hue='Attrition', ax=department_ax)
department_ax.set_title('Attrition by Department')
plt.show()

# 📌 Step 6: Convert Categorical to Numeric
# Target: label-encode Attrition. LabelEncoder assigns integer codes in sorted
# order of the class labels; `le.classes_` keeps the code -> label mapping so
# predictions can be translated back with `le.inverse_transform`.
le = LabelEncoder()
df['Attrition'] = le.fit_transform(df['Attrition'])

# Features: one-hot encode the remaining text columns instead of label-encoding
# them. Integer codes on nominal columns impose an arbitrary ordering that a
# linear model would treat as meaningful. `drop_first=True` removes one
# redundant indicator per column; `dtype=int` keeps the frame purely numeric.
categorical_cols = df.select_dtypes(include='object').columns.tolist()
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

# 📌 Step 7: Define Features and Target
# y is the Attrition column; X is every other column of df.
y = df['Attrition']
X = df.drop(columns=['Attrition'])

# 📌 Step 8: Train-Test Split
# Hold out 20% of the rows for evaluation with a fixed seed for reproducibility.
# `stratify=y` keeps the class proportions of the target the same in both
# splits, so the test metrics are not skewed by an unlucky draw of the
# minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 📌 Step 9: Train Model (Logistic Regression)
# Standardise the features before fitting: the default L2-regularised solver
# converges poorly and penalises features unevenly when columns live on very
# different scales. The scaler is fit on the training split only, so no
# test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# `model` stays a plain LogisticRegression so `model.coef_` remains available.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# 📌 Step 10: Evaluate Model
# Compute the hold-out metrics first, then report them in one place.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

# 📌 Step 11: Feature Importance (Optional)
# One coefficient per feature column, taken from the first row of `coef_`.
coeff_df = pd.DataFrame(model.coef_[0], index=X.columns, columns=['Coefficient'])

# Rank by absolute value: sorting by the signed coefficient would list only the
# strongest positive drivers and hide equally strong negative ones.
# NOTE: magnitudes are only comparable across features when the model was fit
# on features of similar scale.
top_features = coeff_df['Coefficient'].abs().nlargest(10).index
print(coeff_df.loc[top_features])
