# 01 — Data Ingestion & Overview
Notes:
- Objective: locally stage raw datasets, inspect columns and sample rows, basic cleaning utilities.
- Outputs: processed master table saved to data/processed/master_table.parquet


In [1]:
# Notebook 01: Data Loading & Initial Cleaning (Polished)
# T-IQ Project

import json
import os
import re
from pathlib import Path

import pandas as pd

# Set paths
# The raw-data folder defaults to the original local checkout, but it can be
# redirected without editing the notebook by setting the TIQ_RAW_DIR
# environment variable (an empty value falls back to the default).
DEFAULT_RAW_DIR = r"C:\Users\abanu\Documents\T-IQ\data\raw"
RAW = Path(os.environ.get("TIQ_RAW_DIR") or DEFAULT_RAW_DIR)

# Cleaned outputs live in a sibling "processed" folder next to the raw one;
# it is created on first run and reused afterwards.
PROCESSED = RAW.parent / "processed"
PROCESSED.mkdir(exist_ok=True, parents=True)

print("Raw data path:", RAW)
print("Processed data path:", PROCESSED)


Raw data path: C:\Users\abanu\Documents\T-IQ\data\raw
Processed data path: C:\Users\abanu\Documents\T-IQ\data\processed


In [2]:
attrition_path = RAW / "WA_Fn-UseC_-HR-Employee-Attrition.csv"

# Fail fast when the file is missing.  Merely printing a message would let the
# notebook continue into the cleaning cell, which then either raises a
# confusing NameError or silently reuses a stale `attrition` left in the kernel.
if not attrition_path.exists():
    raise FileNotFoundError(f"Attrition dataset not found: {attrition_path}")

attrition = pd.read_csv(attrition_path)
print("Attrition dataset loaded, shape:", attrition.shape)


Attrition dataset loaded, shape: (1470, 35)


In [3]:
# Basic info: column dtypes / non-null counts, then numeric summary statistics.
attrition.info()
# A bare expression is rendered only when it is the last line of a cell, so the
# summary statistics must be printed explicitly to show up in the output.
print(attrition.describe())

# Missing values
print("Missing values per column:\n", attrition.isnull().sum())

# Drop duplicates
attrition = attrition.drop_duplicates()

# Map Attrition to 0/1.  The guard keeps the cell safe to re-run: applying the
# Yes/No mapping to an already-encoded column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(attrition['Attrition']):
    attrition['Attrition'] = attrition['Attrition'].map({'Yes': 1, 'No': 0})

# Preview unique categorical values
for col in ['Department', 'JobRole', 'EducationField', 'Gender', 'MaritalStatus']:
    if col in attrition.columns:
        print(f"{col} unique values: {attrition[col].unique()}")

# Save processed
attrition.to_csv(PROCESSED / "attrition_clean.csv", index=False)

# Optional: save sample for quick testing.
# The seed makes the sample file reproducible across runs, and the size is
# capped at the number of available rows so a small input does not raise.
SAMPLE_ROWS = 100
SAMPLE_SEED = 42
attrition.sample(
    n=min(SAMPLE_ROWS, len(attrition)),
    random_state=SAMPLE_SEED,
).to_csv(PROCESSED / "attrition_sample.csv", index=False)
print("Attrition cleaned and saved.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [4]:
hrms_summary_path = RAW / "hrms_synth_summary.csv"

# Fail fast when the file is missing.  Merely printing a message would let the
# notebook continue into the cleaning cell, which then either raises a
# confusing NameError or silently reuses a stale `hrms_summary` from the kernel.
if not hrms_summary_path.exists():
    raise FileNotFoundError(f"HRMS summary dataset not found: {hrms_summary_path}")

hrms_summary = pd.read_csv(hrms_summary_path)
print("HRMS summary loaded, shape:", hrms_summary.shape)


HRMS summary loaded, shape: (10000, 11)


In [5]:
# Structural overview: dtypes, non-null counts and memory footprint.
hrms_summary.info()

# Number of missing entries in each column.
missing_per_column = hrms_summary.isna().sum()
print("Missing values:\n", missing_per_column)

# Discard rows that are exact duplicates of an earlier row.
hrms_summary = hrms_summary.drop_duplicates(keep="first")

# Show the first five rows of the de-duplicated table.
preview_rows = hrms_summary.head(5)
print(preview_rows)

# Write the result next to the other processed files, without the index.
hrms_summary_out = PROCESSED / "hrms_summary_clean.csv"
hrms_summary.to_csv(hrms_summary_out, index=False)
print("HRMS summary cleaned and saved.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   employee_id         10000 non-null  object 
 1   name                10000 non-null  object 
 2   department          10000 non-null  object 
 3   job_role            10000 non-null  object 
 4   location            10000 non-null  object 
 5   current_salary      10000 non-null  int64  
 6   satisfaction_score  10000 non-null  float64
 7   engagement_score    10000 non-null  float64
 8   num_skills          10000 non-null  int64  
 9   years_at_company    10000 non-null  int64  
 10  trainings_count     10000 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 859.5+ KB
Missing values:
 employee_id           0
name                  0
department            0
job_role              0
location              0
current_salary        0
satisfaction_score    0
engage

In [6]:
jobs_path = RAW / "job_descriptions.csv"

# Fail fast when the file is missing.  Merely printing a message would let the
# notebook continue into the cleaning cell, which then either raises a
# confusing NameError or silently reuses a stale `jobs` left in the kernel.
if not jobs_path.exists():
    raise FileNotFoundError(f"Job Descriptions dataset not found: {jobs_path}")

jobs = pd.read_csv(jobs_path)
print("Job Descriptions loaded, shape:", jobs.shape)


Job Descriptions loaded, shape: (10000, 12)


In [7]:
# Normalise the free-text job description when the column is present:
#   1. lower-case;
#   2. drop every character that is not a-z, 0-9, comma, period or whitespace;
#   3. collapse whitespace runs to a single space and trim both ends.
# Punctuation is removed *before* whitespace is collapsed so that a dropped
# symbol standing between two spaces (e.g. "pay & benefits") does not leave a
# double space behind.
if 'job_text' in jobs.columns:
    jobs['job_text'] = (
        jobs['job_text']
        .str.lower()
        .str.replace(r'[^a-z0-9\s,.]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

# The frame is written out whether or not `job_text` was present.
jobs.to_csv(PROCESSED / "job_descriptions_clean.csv", index=False)
print("Job descriptions cleaned and saved.")


Job descriptions cleaned and saved.


In [8]:
reviews_path = RAW / "glassdoor_reviews.csv"

# Fail fast when the file is missing.  Merely printing a message would let the
# notebook continue into the cleaning cell, which then either raises a
# confusing NameError or silently reuses a stale `reviews` left in the kernel.
if not reviews_path.exists():
    raise FileNotFoundError(f"Glassdoor reviews dataset not found: {reviews_path}")

reviews = pd.read_csv(reviews_path)
print("Glassdoor reviews loaded, shape:", reviews.shape)


Glassdoor reviews loaded, shape: (10000, 10)


In [9]:
# Free-text review columns to normalise; any that are absent are skipped.
text_cols = ['review_text', 'pros', 'cons']
for col in text_cols:
    if col not in reviews.columns:
        continue

    # Remember which entries are genuinely missing: astype(str) would otherwise
    # turn them into the literal text "nan" and carry that into the output.
    is_missing = reviews[col].isna()

    # Lower-case, drop everything except a-z, 0-9, comma, period and
    # whitespace, then collapse whitespace runs and trim.  Removing punctuation
    # before collapsing avoids the double spaces a dropped symbol leaves behind.
    cleaned = (
        reviews[col]
        .astype(str)
        .str.lower()
        .str.replace(r'[^a-z0-9\s,.]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Restore missing values so downstream code can still detect them.
    reviews[col] = cleaned.mask(is_missing)

reviews.to_csv(PROCESSED / "glassdoor_reviews_clean.csv", index=False)
print("Glassdoor reviews cleaned and saved.")


Glassdoor reviews cleaned and saved.


In [10]:
resumes_path = RAW / "resumes/Resume/Resume.csv"

# Fail fast when the file is missing.  Merely printing a message would let the
# notebook continue into the cleaning cell, which then either raises a
# confusing NameError or silently reuses a stale `resumes` left in the kernel.
if not resumes_path.exists():
    raise FileNotFoundError(f"Resumes dataset not found: {resumes_path}")

resumes = pd.read_csv(resumes_path)
print("Resumes metadata loaded, shape:", resumes.shape)


Resumes metadata loaded, shape: (2484, 4)


In [11]:
# Normalise the `skills` column when it is present; otherwise the frame is
# written out unchanged.
# NOTE(review): confirm that Resume.csv really exposes a `skills` column -- if
# it does not, this cell is a pass-through despite the "cleaned" message.
if 'skills' in resumes.columns:
    # Remember which entries are genuinely missing: astype(str) would otherwise
    # turn them into the literal text "nan" and carry that into the output.
    is_missing = resumes['skills'].isna()

    # Lower-case, keep only a-z, 0-9, comma and whitespace, then collapse
    # whitespace runs and trim.  Removing punctuation before collapsing avoids
    # the double spaces a dropped symbol leaves behind.
    # NOTE(review): the character filter also strips '+', '#' and '.', so
    # "c++" and "c#" both become "c" and "node.js" becomes "nodejs" -- verify
    # this is acceptable for downstream skill matching.
    cleaned = (
        resumes['skills']
        .astype(str)
        .str.lower()
        .str.replace(r'[^a-z0-9\s,]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Restore missing values so downstream code can still detect them.
    resumes['skills'] = cleaned.mask(is_missing)

resumes.to_csv(PROCESSED / "resumes_clean.csv", index=False)
print("Resumes metadata cleaned and saved.")


Resumes metadata cleaned and saved.


In [12]:
hrms_json_path = RAW / "hrms_synth.json"

# This dataset is optional for the rest of the notebook (nothing below reads
# `hrms_json` or `hrms_flat`), so a missing file is reported rather than raised.
if hrms_json_path.exists():
    # Read as UTF-8 explicitly: without `encoding`, open() uses the platform
    # locale (e.g. cp1252 on Windows), which can fail on or silently corrupt
    # non-ASCII text in the records.
    with open(hrms_json_path, "r", encoding="utf-8") as f:
        hrms_json = json.load(f)
    print("HRMS JSON loaded, total records:", len(hrms_json))

    # Flatten nested objects into dot-separated columns, one row per record.
    hrms_flat = pd.json_normalize(hrms_json)
    print("Flattened HRMS shape:", hrms_flat.shape)
    hrms_flat.to_csv(PROCESSED / "hrms_json_flattened.csv", index=False)
    print("HRMS JSON flattened and saved.")
else:
    print("HRMS JSON dataset not found!")


HRMS JSON loaded, total records: 10000
Flattened HRMS shape: (10000, 26)
HRMS JSON flattened and saved.


In [13]:
# Closing status line: report the folder that now holds every processed file.
completion_message = "All datasets loaded, cleaned, sanity-checked, and saved in:"
print(completion_message, PROCESSED)


All datasets loaded, cleaned, sanity-checked, and saved in: C:\Users\abanu\Documents\T-IQ\data\processed
