# 01_EDA - T-IQ Talent Intelligence System
Purpose: Exploratory Data Analysis (EDA) notebook for all HR datasets:
- Employee Attrition
- Glassdoor Reviews
- HRMS Synthetic Summary
- Job Descriptions
- Resumes
- Interview Transcripts

Outputs (plots, CSV summaries) will be saved to `outputs/eda/`.
Run cells in order. Cells are numbered for easy debugging.


In [26]:
# Cell 1
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12
sns.set()


In [27]:
# Cell 2
# Replace these paths with your project structure
DATA_DIR = Path(r"C:\Users\abanu\Documents\t_iq_hr\data\raw")
OUT_DIR = Path(r"C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Data directory:", DATA_DIR.resolve())
print("Output directory:", OUT_DIR.resolve())


Data directory: C:\Users\abanu\Documents\t_iq_hr\data\raw
Output directory: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda


In [28]:
# Cell 3
def save_fig(fig, name):
    path = OUT_DIR / f"{name}.png"
    fig.savefig(path, bbox_inches='tight')
    print("Saved:", path)

def print_missing(df, name):
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if not missing.empty:
        print(f"Missing values in {name}:")
        display(missing)
        missing.to_csv(OUT_DIR / f"missing_{name}.csv")
        print(f"Saved missing_{name}.csv")
    else:
        print(f"No missing values in {name}.")


In [29]:
# Cell 4
attrition_file = DATA_DIR / "WA_Fn-UseC_-HR-Employee-Attrition.csv"
df_attrition = pd.read_csv(attrition_file)
print("Attrition dataset shape:", df_attrition.shape)
df_attrition.head()


Attrition dataset shape: (1470, 35)


Unnamed: 0,age,attrition_flag,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_count,employee_id,...,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [30]:
# Cell 5
print("Columns:", df_attrition.columns.tolist())
display(df_attrition.info())
display(df_attrition.describe().T)
display(df_attrition.describe(include=['object', 'category']).T)

# Missing values
print_missing(df_attrition, "attrition")


Columns: ['age', 'attrition_flag', 'business_travel', 'daily_rate', 'department', 'distance_from_home', 'education', 'education_field', 'employee_count', 'employee_id', 'environment_satisfaction', 'gender', 'hourly_rate', 'job_involvement', 'job_level', 'job_role', 'job_satisfaction', 'marital_status', 'monthly_income', 'monthly_rate', 'num_companies_worked', 'over_18', 'over_time', 'percent_salary_hike', 'performance_score', 'relationship_satisfaction', 'standard_hours', 'stock_option_level', 'total_working_years', 'training_times_last_year', 'work_life_balance', 'years_at_company', 'years_in_current_role', 'years_since_last_promotion', 'years_with_curr_manager']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   age                         1470 non-null   int64 
 1   attrition_flag              1470 non-null   obj

None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
daily_rate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
distance_from_home,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
employee_count,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
employee_id,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
environment_satisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
hourly_rate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
job_involvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
job_level,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


Unnamed: 0,count,unique,top,freq
attrition_flag,1470,2,No,1233
business_travel,1470,3,Travel_Rarely,1043
department,1470,3,Research & Development,961
education_field,1470,6,Life Sciences,606
gender,1470,2,Male,882
job_role,1470,9,Sales Executive,326
marital_status,1470,3,Married,673
over_18,1470,1,Y,1470
over_time,1470,2,No,1054


No missing values in attrition.


In [33]:
# Debug column names
print("Columns in Attrition CSV:")
for c in df_attrition.columns:
    print(f"'{c}'")


Columns in Attrition CSV:
'age'
'attrition_flag'
'business_travel'
'daily_rate'
'department'
'distance_from_home'
'education'
'education_field'
'employee_count'
'employee_id'
'environment_satisfaction'
'gender'
'hourly_rate'
'job_involvement'
'job_level'
'job_role'
'job_satisfaction'
'marital_status'
'monthly_income'
'monthly_rate'
'num_companies_worked'
'over_18'
'over_time'
'percent_salary_hike'
'performance_score'
'relationship_satisfaction'
'standard_hours'
'stock_option_level'
'total_working_years'
'training_times_last_year'
'work_life_balance'
'years_at_company'
'years_in_current_role'
'years_since_last_promotion'
'years_with_curr_manager'


In [35]:
# Cell 6 — Convert Yes/No to 0/1
df_attrition['attrition_flag'] = df_attrition['attrition_flag'].str.strip().str.lower().map({'yes': 1, 'no': 0})

print("Attrition_flag counts:")
print(df_attrition['attrition_flag'].value_counts())


Attrition_flag counts:
attrition_flag
0    1233
1     237
Name: count, dtype: int64


In [36]:
# Cell 7
numeric_cols = df_attrition.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", numeric_cols)

for col in numeric_cols:
    fig, ax = plt.subplots()
    df_attrition[col].hist(bins=30, ax=ax)
    ax.set_title(f"Distribution: {col}")
    save_fig(fig, f"attrition_dist_{col}")
    plt.close(fig)


Numeric columns: ['age', 'attrition_flag', 'daily_rate', 'distance_from_home', 'education', 'employee_count', 'employee_id', 'environment_satisfaction', 'hourly_rate', 'job_involvement', 'job_level', 'job_satisfaction', 'monthly_income', 'monthly_rate', 'num_companies_worked', 'percent_salary_hike', 'performance_score', 'relationship_satisfaction', 'standard_hours', 'stock_option_level', 'total_working_years', 'training_times_last_year', 'work_life_balance', 'years_at_company', 'years_in_current_role', 'years_since_last_promotion', 'years_with_curr_manager']
Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\attrition_dist_age.png
Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\attrition_dist_attrition_flag.png
Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\attrition_dist_daily_rate.png
Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\attrition_dist_distance_from_home.png
Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\

In [37]:
# Cell 8
box_cols = ['MonthlyIncome','TotalWorkingYears','YearsAtCompany','Age']
box_cols = [c for c in box_cols if c in df_attrition.columns]
for col in box_cols:
    fig, ax = plt.subplots()
    df_attrition.boxplot(column=[col], ax=ax)
    ax.set_title(f"Boxplot: {col}")
    save_fig(fig, f"attrition_box_{col}")
    plt.close(fig)


In [38]:
# Cell 9
if len(numeric_cols) >= 2:
    corr = df_attrition[numeric_cols].corr()
    fig, ax = plt.subplots(figsize=(10,8))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", ax=ax)
    ax.set_title("Correlation matrix")
    save_fig(fig, "attrition_correlation")
    plt.close(fig)


Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\attrition_correlation.png


In [39]:
# Cell 10
cat_cols = df_attrition.select_dtypes(include=['object','category']).columns.tolist()
print("Categorical columns:", cat_cols)

for col in cat_cols:
    vc = df_attrition[col].value_counts()
    vc.to_csv(OUT_DIR / f"attrition_counts_{col}.csv")
    print(f"Saved counts for {col}")


Categorical columns: ['business_travel', 'department', 'education_field', 'gender', 'job_role', 'marital_status', 'over_18', 'over_time']
Saved counts for business_travel
Saved counts for department
Saved counts for education_field
Saved counts for gender
Saved counts for job_role
Saved counts for marital_status
Saved counts for over_18
Saved counts for over_time


In [40]:
# Cell 11
for col in cat_cols:
    try:
        attr_rate = df_attrition.groupby(col)['Attrition_flag'].mean().sort_values(ascending=False)
        attr_rate.to_csv(OUT_DIR / f"attrition_rate_{col}.csv")
        fig, ax = plt.subplots(figsize=(8,4))
        attr_rate.head(20).plot(kind='bar', ax=ax)
        ax.set_title(f"Attrition rate by {col}")
        ax.set_ylabel("Attrition rate")
        save_fig(fig, f"attrition_rate_{col}")
        plt.close(fig)
    except Exception as e:
        print(f"Skipped {col} due to error:", e)


Skipped business_travel due to error: 'Column not found: Attrition_flag'
Skipped department due to error: 'Column not found: Attrition_flag'
Skipped education_field due to error: 'Column not found: Attrition_flag'
Skipped gender due to error: 'Column not found: Attrition_flag'
Skipped job_role due to error: 'Column not found: Attrition_flag'
Skipped marital_status due to error: 'Column not found: Attrition_flag'
Skipped over_18 due to error: 'Column not found: Attrition_flag'
Skipped over_time due to error: 'Column not found: Attrition_flag'


In [41]:
# Cell 12
reviews_file = DATA_DIR / "glassdoor_reviews.csv"
df_reviews = pd.read_csv(reviews_file)
print("Glassdoor reviews shape:", df_reviews.shape)
df_reviews.head()

print_missing(df_reviews, "reviews")


Glassdoor reviews shape: (10000, 10)
No missing values in reviews.


In [42]:
# Cell 13
if 'rating' in df_reviews.columns:
    fig, ax = plt.subplots()
    df_reviews['rating'].hist(bins=10, ax=ax)
    ax.set_title("Rating distribution")
    save_fig(fig, "reviews_rating_dist")
    plt.close(fig)


Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\reviews_rating_dist.png


In [43]:
# Cell 14
hrms_file = DATA_DIR / "hrms_synth_summary.csv"
df_hrms = pd.read_csv(hrms_file)
print("HRMS synthetic summary shape:", df_hrms.shape)
df_hrms.head()

print_missing(df_hrms, "hrms")


HRMS synthetic summary shape: (10000, 11)
No missing values in hrms.


In [44]:
# Cell 15
numeric_hrms = df_hrms.select_dtypes(include=[np.number]).columns.tolist()
cat_hrms = df_hrms.select_dtypes(include=['object','category']).columns.tolist()
print("HRMS numeric columns:", numeric_hrms)
print("HRMS categorical columns:", cat_hrms)

# Save summaries
df_hrms[numeric_hrms].describe().T.to_csv(OUT_DIR / "hrms_numeric_summary.csv")
df_hrms[cat_hrms].describe(include='all').T.to_csv(OUT_DIR / "hrms_categorical_summary.csv")


HRMS numeric columns: ['current_salary', 'satisfaction_score', 'engagement_score', 'num_skills', 'years_at_company', 'trainings_count']
HRMS categorical columns: ['employee_id', 'name', 'department', 'job_role', 'location']


In [45]:
# Cell 16
jobs_file = DATA_DIR / "job_descriptions.csv"
df_jobs = pd.read_csv(jobs_file)
print("Job descriptions shape:", df_jobs.shape)
df_jobs.head()

print_missing(df_jobs, "jobs")


Job descriptions shape: (10000, 12)
No missing values in jobs.


In [46]:
# Cell 17
for col in ['salary_min','salary_max']:
    if col in df_jobs.columns:
        fig, ax = plt.subplots()
        df_jobs[col].dropna().hist(bins=30, ax=ax)
        ax.set_title(f"Distribution: {col}")
        save_fig(fig, f"job_{col}_dist")
        plt.close(fig)


Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\job_salary_min_dist.png
Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\job_salary_max_dist.png


In [49]:
# Safe reading of all resume CSVs
import glob

resume_files = glob.glob(str(DATA_DIR / "resumes" / "**" / "*.csv"), recursive=True)
print("Found resume files:", resume_files)

dfs = []
for f in resume_files:
    try:
        df = pd.read_csv(f, encoding='utf-8', on_bad_lines='skip')
        dfs.append(df)
        print(f"Loaded {f} with shape {df.shape}")
    except Exception as e:
        print(f"Error loading {f}: {e}")

df_resumes = pd.concat(dfs, ignore_index=True)
print("Combined resumes shape:", df_resumes.shape)
df_resumes.head()


Found resume files: ['C:\\Users\\abanu\\Documents\\t_iq_hr\\data\\raw\\resumes\\Resume\\Resume.csv', 'C:\\Users\\abanu\\Documents\\t_iq_hr\\data\\raw\\resumes\\Resume\\Resume_cleaned.csv', 'C:\\Users\\abanu\\Documents\\t_iq_hr\\data\\raw\\resumes\\Resume\\Resume_sample.csv', 'C:\\Users\\abanu\\Documents\\t_iq_hr\\data\\raw\\resumes\\Resume\\Resume_sample_safe.csv']
Loaded C:\Users\abanu\Documents\t_iq_hr\data\raw\resumes\Resume\Resume.csv with shape (2710, 169)
Loaded C:\Users\abanu\Documents\t_iq_hr\data\raw\resumes\Resume\Resume_cleaned.csv with shape (2710, 170)
Error loading C:\Users\abanu\Documents\t_iq_hr\data\raw\resumes\Resume\Resume_sample.csv: Error tokenizing data. C error: EOF inside string starting at row 181
Error loading C:\Users\abanu\Documents\t_iq_hr\data\raw\resumes\Resume\Resume_sample_safe.csv: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Combined resumes shape: (5420, 170)


Unnamed: 0,employee_id,resume_text,resume_html,resume_category,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 160,Unnamed: 161,Unnamed: 162,Unnamed: 163,Unnamed: 164,Unnamed: 165,Unnamed: 166,Unnamed: 167,Unnamed: 168,ID
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,,,,,,,...,,,,,,,,,,


In [50]:
# Keep only relevant columns
useful_cols = ['employee_id', 'resume_text', 'resume_html', 'resume_category', 'ID']
df_resumes = df_resumes[useful_cols]

print("Cleaned resumes shape:", df_resumes.shape)
df_resumes.head()


Cleaned resumes shape: (5420, 5)


Unnamed: 0,employee_id,resume_text,resume_html,resume_category,ID
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,


In [52]:
# Option 1: Rename column to match notebook expectation
df_resumes.rename(columns={
    'resume_text': 'Resume_str',
    'resume_html': 'Resume_html',
    'resume_category': 'Category'
}, inplace=True)

# Now create resume length
df_resumes['resume_len'] = df_resumes['Resume_str'].astype(str).apply(len)

# Plot histogram
fig, ax = plt.subplots()
df_resumes['resume_len'].hist(bins=30, ax=ax)
ax.set_title("Resume length distribution")
save_fig(fig, "resume_len_dist")
plt.close(fig)


Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\resume_len_dist.png


In [53]:
df_resumes


Unnamed: 0,employee_id,Resume_str,Resume_html,Category,ID,resume_len
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,,5442
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,,5572
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,,7720
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,,2855
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,,9172
...,...,...,...,...,...,...
5415,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,2706.0,5533
5416,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION,2707.0,7108
5417,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,2708.0,2020
5418,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,2709.0,5074


In [54]:
# Cell 19
df_resumes['resume_len'] = df_resumes['Resume_str'].astype(str).apply(len)
fig, ax = plt.subplots()
df_resumes['resume_len'].hist(bins=30, ax=ax)
ax.set_title("Resume length distribution")
save_fig(fig, "resume_len_dist")
plt.close(fig)


Saved: C:\Users\abanu\Documents\t_iq_hr\notebooks\outputs\eda\resume_len_dist.png


In [55]:
# Cell 20
transcripts_file = DATA_DIR / "interview_transcripts_100.ndjson"
df_transcripts = pd.read_json(transcripts_file, lines=True)
print("Interview transcripts shape:", df_transcripts.shape)
df_transcripts.head()

print_missing(df_transcripts, "interview_transcripts")


Interview transcripts shape: (100, 10)
No missing values in interview_transcripts.


In [56]:
# Cell 21
json_file = DATA_DIR / "hrms_synth.json"
with open(json_file, 'r', encoding='utf-8') as f:
    hrms_json = json.load(f)

# Flatten JSON
df_hrms_json = pd.json_normalize(hrms_json)
print("Flattened HRMS JSON shape:", df_hrms_json.shape)
df_hrms_json.head()

print_missing(df_hrms_json, "hrms_json")


Flattened HRMS JSON shape: (10000, 26)
Missing values in hrms_json:


manager_id    1561
dtype: int64

Saved missing_hrms_json.csv


In [57]:
# Cell 22
df_attrition.to_csv(OUT_DIR / "snapshot_attrition.csv", index=False)
df_reviews.to_csv(OUT_DIR / "snapshot_reviews.csv", index=False)
df_hrms.to_csv(OUT_DIR / "snapshot_hrms.csv", index=False)
df_jobs.to_csv(OUT_DIR / "snapshot_jobs.csv", index=False)
df_resumes.to_csv(OUT_DIR / "snapshot_resumes.csv", index=False)
df_transcripts.to_csv(OUT_DIR / "snapshot_interviews.csv", index=False)
df_hrms_json.to_csv(OUT_DIR / "snapshot_hrms_json.csv", index=False)

print("All dataset snapshots saved to outputs/eda/")


All dataset snapshots saved to outputs/eda/
