In [5]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HR Analytics: Exploratory Data Analysis\n",
    "\n",
    "This notebook explores the HR dataset to identify patterns and factors related to employee attrition."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": int,
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Set plot style\n",
    "plt.style.use('ggplot')\n",
    "sns.set(font_scale=1.2)\n",
    "\n",
    "# Display all columns\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": int,
   "metadata": {},
   "source": [
    "# Load the dataset\n",
    "df = pd.read_csv('../data/hr_data.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": int,
   "metadata": {},
   "source": [
    "# Basic information about the dataset\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "print(f\"\\nTotal employees: {len(df)}\")\n",
    "print(f\"Attrition rate: {df['Attrition'].value_counts(normalize=True)['Yes']*100:.2f}%\")\n",
    "\n",
    "# Display data types and missing values\n",
    "print(\"\\nData types and missing values:\")\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": int,
   "metadata": {},
   "source": [
    "# Summary statistics for numeric columns\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Attrition Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": int,
   "metadata": {},
   "source": [
    "# Visualize attrition distribution\n",
    "plt.figure(figsize=(8, 6))\n",
    "attrition_counts = df['Attrition'].value_counts()\n",
    "plt.pie(attrition_counts, labels=attrition_counts.index, autopct='%1.1f%%', startangle=90, colors=['#ff9999','#66b3ff'])\n",
    "plt.title('Employee Attrition Distribution', fontsize=16)\n",
    "plt.axis('equal')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": int,
   "metadata": {},
   "source": [
    "# Attrition by Department\n",
    "plt.figure(figsize=(12, 6))\n",
    "dept_attrition = pd.crosstab(df['Department'], df['Attrition'])\n",
    "dept_attrition.plot(kind='bar', stacked=True)\n",
    "plt.title('Attrition by Department', fontsize=16)\n",
    "plt.xlabel('Department')\n",
    "plt.ylabel('Count')\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(title='Attrition')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Attrition by Job Role\n",
    "plt.figure(figsize=(14, 6))\n",
    "role_attrition = pd.crosstab(df['JobRole'], df['Attrition'])\n",
    "role_attrition.plot(kind='bar', stacked=True)\n",
    "plt.title('Attrition by Job Role', fontsize=16)\n",
    "plt.xlabel('Job Role')\n",
    "plt.ylabel('Count')\n",
    "plt.xticks(rotation=45)\n",
    "plt.legend(title='Attrition')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Factors Affecting Attrition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Age distribution by attrition\n",
    "plt.figure(figsize=(12, 6))\n",
    "sns.histplot(data=df, x='Age', hue='Attrition', bins=20, multiple='stack')\n",
    "plt.title('Age Distribution by Attrition Status', fontsize=16)\n",
    "plt.xlabel('Age')\n",
    "plt.ylabel('Count')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Job satisfaction by attrition\n",
    "plt.figure(figsize=(10, 6))\n",
    "sns.countplot(data=df, x='JobSatisfaction', hue='Attrition')\n",
    "plt.title('Job Satisfaction by Attrition Status', fontsize=16)\n",
    "plt.xlabel('Job Satisfaction (1-5 scale)')\n",
    "plt.ylabel('Count')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Monthly income by attrition\n",
    "plt.figure(figsize=(10, 6))\n",
    "sns.boxplot(data=df, x='Attrition', y='MonthlyIncome')\n",
    "plt.title('Monthly Income by Attrition Status', fontsize=16)\n",
    "plt.xlabel('Attrition')\n",
    "plt.ylabel('Monthly Income')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Overtime by attrition\n",
    "plt.figure(figsize=(10, 6))\n",
    "overtime_attrition = pd.crosstab(df['OverTime'], df['Attrition'], normalize='index') * 100\n",
    "overtime_attrition.plot(kind='bar', stacked=True)\n",
    "plt.title('Overtime vs Attrition Rate', fontsize=16)\n",
    "plt.xlabel('Overtime')\n",
    "plt.ylabel('Percentage')\n",
    "plt.legend(title='Attrition')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Convert categorical variables to numeric for correlation analysis\n",
    "df_corr = df.copy()\n",
    "df_corr['Attrition'] = df_corr['Attrition'].map({'Yes': 1, 'No': 0})\n",
    "df_corr['OverTime'] = df_corr['OverTime'].map({'Yes': 1, 'No': 0})\n",
    "\n",
    "# Select numeric columns\n",
    "numeric_cols = ['Age', 'JobLevel', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',\n",
    "                'MonthlyIncome', 'PercentSalaryHike', 'JobSatisfaction', 'WorkLifeBalance',\n",
    "                'DistanceFromHome', 'OverTime', 'Attrition']\n",
    "\n",
    "# Calculate correlation matrix\n",
    "correlation = df_corr[numeric_cols].corr()\n",
    "\n",
    "# Plot correlation heatmap\n",
    "plt.figure(figsize=(14, 10))\n",
    "sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')\n",
    "plt.title('Correlation Between Key Factors', fontsize=16)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Key Findings\n",
    "\n",
    "1. **Attrition Rate**: The overall attrition rate is approximately 30%, which is relatively high.\n",
    "\n",
    "2. **Department Impact**: Sales and HR departments show higher attrition rates compared to other departments.\n",
    "\n",
    "3. **Key Factors**: Several factors show strong correlation with attrition:\n",
    "   - Lower job satisfaction is associated with higher attrition\n",
    "   - Lower monthly income correlates with higher attrition\n",
    "   - Employees working overtime have significantly higher attrition rates\n",
    "   - Younger employees tend to leave more frequently\n",
    "   - Employees with longer periods since last promotion show higher attrition\n",
    "\n",
    "4. **Work-Life Balance**: Poor work-life balance appears to be a significant contributor to employee turnover.\n",
    "\n",
    "These insights will inform our predictive modeling approach in the next notebook."
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}



{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# HR Analytics: Exploratory Data Analysis\n',
    '\n',
    'This notebook explores the HR dataset to identify patterns and factors related to employee attrition.']},
  {'cell_type': 'code',
   'execution_count': int,
   'metadata': {},
   'source': ['import pandas as pd\n',
    'import numpy as np\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    '\n',
    '# Set plot style\n',
    "plt.style.use('ggplot')\n",
    'sns.set(font_scale=1.2)\n',
    '\n',
    '# Display all columns\n',
    "pd.set_option('display.max_columns', None)"]},
  {'cell_type': 'code',
   'execution_count': int,
   'metadata': {},
   'source': ['# Load the dataset\n',
    "df = pd.read_csv('../data/hr_data.csv')\n",
    'df.head()']},
  {'cell_type': 'markdown', 'metadata': {}, 'source': ['## Dataset Overview']},
  {'cell_type': 'code',
   'execution_count': int,
   'metadata': {},
   'source': ['# Basic informati