<a id='import'></a>
## 1. Import Libraries

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

# Utilities
import warnings
import sys
import os

# Add src to path
sys.path.append('../src')

# Import custom modules
from data_preprocessing import clean_text, handle_missing_values, clean_dataframe

# Notebook-wide display and logging settings
pd.set_option('display.max_columns', None)  # never truncate columns when a frame is displayed
warnings.filterwarnings('ignore')           # blanket-silences every warning for the rest of the session

print("Libraries imported successfully!")

Libraries imported successfully!


<a id='load'></a>
## 2. Load Original Data

In [2]:
# Read the raw train/test splits in one pass (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/fake_job_postings_train.csv',
        '../data/fake_job_postings_test.csv',
    )
)

# Report the size of each split
print("Original Data Shapes:")
for split_label, split_frame in (("Training", train_df), ("Test", test_df)):
    print(f"{split_label}: {split_frame.shape}")
print("\n" + "="*80)

# Peek at the first rows; kept as the final expression so the frame renders richly
print("\nSample of raw data:")
train_df.head(3)

Original Data Shapes:
Training: (9999, 18)
Test: (7881, 17)


Sample of raw data:


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,00100eff19844c0190df4c5e11f4c066,Office Manager / HR,"GB, LND,",,,,"Based in the heart of the City, this exception...",You will have previous experience in running a...,Excellent Salaries Company Benefit PackagesInt...,0,1,0,Full-time,,Bachelor's Degree,Retail,,0
1,0012cf527c5740999d1c6545dce5cd95,Director of Account Management,"US, CA, San Francisco",,,,The Director of Account Management brings stro...,8+ years experience in an account management a...,,0,0,1,,,,,,0
2,0013e2bf9f6548c5b214853f9134b005,Brand Manager [Contract position],"US, , Boise",Account team,,Since 1978Our goal has been to create engaging...,Drake Cooper Brand Managers help clients grow ...,Education: BS/BA degree in advertising / mar...,ABOUT DRAKE COOPERWe are a Northwest U.S. adve...,0,1,0,Contract,Associate,Bachelor's Degree,Marketing and Advertising,Advertising,0


<a id='text-clean'></a>
## 3. Text Cleaning

Clean text columns by:
- Removing HTML tags
- Removing URLs
- Removing extra whitespace
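- Converting to lowercase (note: the sample output below still shows mixed-case text such as "Office Manager / HR", so verify that `clean_text` actually applies this step)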

In [3]:
# Free-text columns that are passed through clean_text
text_columns = [
    'title', 'location', 'department', 'salary_range',
    'company_profile', 'description', 'requirements', 'benefits',
]

print("Cleaning text columns...\n")

# Clean each column in BOTH splits (train first, then test), overwriting in place
for col in text_columns:
    print(f"Cleaning: {col}")
    for frame in (train_df, test_df):
        frame[col] = frame[col].apply(clean_text)

print("\nText cleaning completed!")

Cleaning text columns...

Cleaning: title
Cleaning: location
Cleaning: department
Cleaning: salary_range
Cleaning: company_profile
Cleaning: description
Cleaning: requirements
Cleaning: benefits

Text cleaning completed!


In [4]:
# Spot-check one cleaned row (only the cleaned values are shown; the raw
# text was overwritten by the cleaning cell, so there is no "before" to print)
sample_idx = 0
sample_title = train_df['title'].iloc[sample_idx]
sample_description = train_df['description'].iloc[sample_idx]

print("Example of cleaned text:")
print("="*80)
print(f"\nTitle: {sample_title}")
print(f"\nDescription (first 200 chars):\n{sample_description[:200]}...")

Example of cleaned text:

Title: Office Manager / HR

Description (first 200 chars):
Based in the heart of the City, this exceptional opportunity as Office Manager / HR in a fast-growing tech start-up is a challenging and varied position requiring a highly motivated professional, idea...


<a id='missing'></a>
## 4. Handle Missing Values

Strategy:
- Text columns: Fill with empty string
- Categorical columns: Fill with 'Unknown'
- Binary columns: Fill with 0

In [5]:
# Count nulls per column once, then derive both the per-column view and the total
missing_before = train_df.isnull().sum()
columns_with_gaps = missing_before[missing_before > 0]
total_missing_before = missing_before.sum()

print("Missing values BEFORE handling:")
print("="*80)
print(columns_with_gaps)
print(f"\nTotal missing values: {total_missing_before:,}")

Missing values BEFORE handling:
employment_type        1992
required_experience    3960
required_education     4548
industry               2781
function               3654
dtype: int64

Total missing values: 16,935


In [6]:
# Handle missing values
# Both splits go through the same project helper so they get the same treatment.
# NOTE(review): handle_missing_values is imported from data_preprocessing and
# is not visible in this notebook — presumably it applies the fill strategy
# listed in the markdown above; confirm against the module.
print("Handling missing values...\n")

# Rebind each name to the helper's return value; the pre-fill frames are not kept.
train_df = handle_missing_values(train_df)
test_df = handle_missing_values(test_df)

print("\nMissing values handled!")

Handling missing values...


Missing values handled!


In [7]:
# Re-count nulls in the training split after filling (the test split is
# checked separately in the validation section)
missing_after = train_df.isnull().sum()
total_missing_after = missing_after.sum()

print("Missing values AFTER handling:")
print("="*80)
print(f"Total missing values: {total_missing_after}")

if total_missing_after != 0:
    # List only the columns that still contain nulls
    print("\nSome missing values remain:")
    print(missing_after[missing_after > 0])
else:
    print("\nAll missing values successfully handled!")

Missing values AFTER handling:
Total missing values: 0

All missing values successfully handled!


<a id='validate'></a>
## 5. Data Validation

In [8]:
# Column dtypes of the training split
print("Data Types:")
print("="*80)
print(train_df.dtypes)
print("\n" + "="*80)

# Run the same sanity checks on both splits: (short name, display name, frame)
splits = (("train", "Training", train_df), ("test", "Test", test_df))

print("\nData Quality Checks:")
for _, display_name, frame in splits:
    print(f"✓ {display_name} set shape: {frame.shape}")
for short_name, _, frame in splits:
    print(f"✓ Missing values in {short_name}: {frame.isnull().sum().sum()}")
# NOTE(review): duplicated() compares entire rows, job_id included. If job_id
# is unique per posting, this cannot flag postings with identical content —
# consider duplicated(subset=...) on the content columns. TODO confirm.
for short_name, _, frame in splits:
    print(f"✓ Duplicate rows in {short_name}: {frame.duplicated().sum()}")

Data Types:
job_id                 object
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object


Data Quality Checks:
✓ Training set shape: (9999, 18)
✓ Test set shape: (7881, 17)
✓ Missing values in train: 0


✓ Missing values in test: 0
✓ Duplicate rows in train: 0
✓ Duplicate rows in test: 0


In [9]:
# Show the first five cleaned rows; the bare final expression gives the rich table display
print("Cleaned Data Sample:")
print("="*80)
train_df.head(5)

Cleaned Data Sample:


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,00100eff19844c0190df4c5e11f4c066,Office Manager / HR,"GB, LND,",,,,"Based in the heart of the City, this exception...",You will have previous experience in running a...,Excellent Salaries Company Benefit PackagesInt...,0,1,0,Full-time,Unknown,Bachelor's Degree,Retail,Unknown,0
1,0012cf527c5740999d1c6545dce5cd95,Director of Account Management,"US, CA, San Francisco",,,,The Director of Account Management brings stro...,8+ years experience in an account management a...,,0,0,1,Unknown,Unknown,Unknown,Unknown,Unknown,0
2,0013e2bf9f6548c5b214853f9134b005,Brand Manager [Contract position],"US, , Boise",Account team,,Since 1978Our goal has been to create engaging...,Drake Cooper Brand Managers help clients grow ...,Education: BS/BA degree in advertising / marke...,ABOUT DRAKE COOPERWe are a Northwest U.S. adve...,0,1,0,Contract,Associate,Bachelor's Degree,Marketing and Advertising,Advertising,0
3,001493eaf7684ba693d9c1294e12787b,Web Developer,"GR, ,",,,We are a technology focused company with an es...,This position is for a talented PHP/MYSQL/JQue...,"Solid object-oriented design, programming, and...",,0,1,0,Full-time,Mid-Senior level,Unknown,E-Learning,Unknown,0
4,0014e3ba40af422981a8d30e96a34cb1,Performance Improvement Director,"US, ,",,,,Oversees implementation and operation of Perfo...,Quality-oriented consulting company with a foc...,Full-time position.Quality benefits package of...,0,0,0,Full-time,Director,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


<a id='save'></a>
## 6. Save Cleaned Data

In [10]:
# Make sure the output directory exists before writing
os.makedirs('../data/processed', exist_ok=True)

# Destination path -> frame to persist (train first, then test)
# NOTE(review): text columns filled with empty strings are written as empty
# CSV fields, which pd.read_csv parses back as NaN by default. Downstream
# notebooks may need keep_default_na=False or a fillna('') — TODO confirm.
outputs = {
    '../data/processed/train_cleaned.csv': train_df,
    '../data/processed/test_cleaned.csv': test_df,
}
for out_path, frame in outputs.items():
    frame.to_csv(out_path, index=False)

print("Cleaned datasets saved successfully!")
print("\nSaved files:")
for out_path in outputs:
    print(f"  - {out_path}")

Cleaned datasets saved successfully!

Saved files:
  - ../data/processed/train_cleaned.csv
  - ../data/processed/test_cleaned.csv


---

## Summary

### Completed Tasks:
✅ Loaded raw datasets
✅ Cleaned text data (HTML removal, URL removal, whitespace normalization)
✅ Handled missing values appropriately
✅ Validated data quality
✅ Saved cleaned datasets

### Next Steps:
- Feature Engineering (Notebook 03)
- Create text features (TF-IDF, word embeddings)
- Encode categorical variables
- Create additional engineered features