In [10]:
import pandas as pd
# Load the CSV at '../fake_job_postings.csv' (resolved relative to the current
# working directory) into `df`; the later cells all read from this frame.
df = pd.read_csv('../fake_job_postings.csv')


0                                         Marketing Intern
1                Customer Service - Cloud Video Production
2                  Commissioning Machinery Assistant (CMA)
3                        Account Executive - Washington DC
4                                      Bill Review Manager
                               ...                        
17875                     Account Director - Distribution 
17876                                   Payroll Accountant
17877    Project Cost Control Staff Engineer - Cost Con...
17878                                     Graphic Designer
17879                           Web Application Developers
Name: title, Length: 17880, dtype: object

In [76]:
# Names of the columns that hold exactly two distinct non-null values
# (`nunique()` ignores nulls by default), followed by a display of the frame.
binary_cols = [
    name
    for name in df.columns
    if df[name].nunique() == 2
]
df

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [3]:
# Check for null values: percentage of missing entries per column, split by
# the `fraudulent` label.
# The boolean null-mask is grouped by the label Series and averaged, instead of
# using `groupby('fraudulent').apply(...)`: this stays vectorised and does not
# rely on `apply` being handed the grouping column (deprecated in recent
# pandas). Because the key is passed as a Series of the original frame, the
# `fraudulent` column itself is still reported in the result.
grouped_null_percentage = df.isnull().groupby(df['fraudulent']).mean() * 100
# Transpose so that each row is a column of `df` and each column is a label value.
grouped_null_percentage = grouped_null_percentage.T.round(2)
print(grouped_null_percentage)


fraudulent               0      1
job_id                0.00   0.00
title                 0.00   0.00
location              1.92   2.19
department           64.75  61.32
salary_range         84.45  74.25
company_profile      15.99  67.78
description           0.00   0.12
requirements         14.94  17.78
benefits             40.25  42.03
telecommuting         0.00   0.00
has_company_logo      0.00   0.00
has_questions         0.00   0.00
employment_type      18.98  27.83
required_experience  38.88  50.23
required_education   44.99  52.08
industry             27.20  31.76
function             35.96  38.91
fraudulent            0.00   0.00


In [4]:
# Deduplicate on every column except 'job_id': of each set of rows that agree
# on all remaining columns, only the first occurrence is kept.
non_id_cols = [name for name in df.columns if name != 'job_id']
df_dedup = df.drop_duplicates(subset=non_id_cols, keep='first')
print(f"Original shape: {df.shape}, Deduplicated shape: {df_dedup.shape}")

Original shape: (17880, 18), Deduplicated shape: (17599, 18)


In [26]:
# Columns to consider for identifying duplicates (ignore 'job_id' and 'fraudulent')
cols_to_check = df.columns.difference(['job_id', 'fraudulent'])

# Identify duplicates based on all columns except 'job_id' and 'fraudulent'.
# keep=False flags every member of a duplicate group, not just the repeats.
dup_mask = df.duplicated(subset=cols_to_check, keep=False)
df_duplicates = df[dup_mask].sort_values(by=list(cols_to_check))
print(f"Found {df_duplicates.shape[0]} duplicated rows (ignoring job_id and fraudulent).")

# Check if duplicated rows have consistent 'fraudulent' labels.
# dropna=False is essential here: by default `groupby` discards every row that
# has a null in ANY key column, and most rows do (e.g. `salary_range` is null
# in the large majority of postings), so the default would silently check only
# a small fraction of the data. `duplicated` above already treats nulls as
# equal, so this keeps both checks consistent with each other.
inconsistent_labels = (
    df.groupby(list(cols_to_check), dropna=False)['fraudulent'].nunique()
)
inconsistent_count = (inconsistent_labels > 1).sum()
print(f"Number of groups with inconsistent 'fraudulent' labels: {inconsistent_count}")


Found 527 duplicated rows (ignoring job_id and fraudulent).
Number of groups with inconsistent 'fraudulent' labels: 0


Duplicated rows are present when `job_id` and the target label are ignored; all rows within a duplicate group have the same target label.

## Values for each row

### Identified columns to compare for uniqueness
- 'title'
- 'description'
- 'requirements'
- 'employment_type'
- 'location'

In [None]:
# Per-column overview: the five most frequent values, the number of nulls and
# the number of distinct values, with a separator line after each column.
for column in df.columns:
    series = df[column]
    top_values = series.value_counts().head()
    null_total = series.isnull().sum()
    distinct_total = series.nunique()

    print(f"\n{column}:")
    print(top_values)
    print(f"Null values: {null_total}")
    print(f"Unique values: {distinct_total}")
    print("-" * 50)
    

In [None]:
# Further analyze interested columns.
# Show the full value_counts (not truncated). `option_context` lifts the row
# limit only inside the `with` block and restores the previous setting on exit,
# even if one of the displays raises; it also restores whatever value was
# configured before rather than forcing the pandas default.
with pd.option_context('display.max_rows', None):
    for col in ['title', 'employment_type', 'location']:
        print(f"\n--- {col} ---")
        display(df[col].value_counts(dropna=False))

# Show unique values for each column to identify "unknown" types.
# NOTE: this rebinds `cols_to_check`, which the duplicate-check cell defines as
# the Index of all columns except 'job_id' and 'fraudulent'; re-run that cell
# before reusing its results after this one.
cols_to_check = ['required_experience', 'required_education', 'industry', 'function']

for col in cols_to_check:
    print(f"\nUnique values in '{col}':")
    print(df[col].value_counts(dropna=False))

## Data Cleaning

NA values -> 'unknown'
Certain columns contain other NA-like values (e.g. "Not Applicable", "NaN", "Unspecified"); these have all been replaced with 'unknown'.

### Columns
The columns `location`, `employment_type`, `title`, `description` and `requirements` are the ones compared to find duplicates.

In [113]:
# Cleaning ahead of deduplication: build a working copy of `df` in which every
# missing or placeholder value is represented by the single token 'unknown'.
df_cleaning = df.copy()

# Categorical columns whose "not specified" spellings are collapsed, and the
# spellings themselves (`replace` matches these values exactly).
cols_to_clean = ['required_experience', 'required_education', 'industry', 'function']
unspecified_values = [
    'Not Applicable', 'NaN', 'not applicable', 'Unspecified',
    'Other', 'Others', 'none', 'na', 'n/a', '', None,
]

# Pass 1: placeholder spellings and nulls -> 'unknown' in the categorical columns.
for col in cols_to_clean:
    collapsed = df_cleaning[col].replace(unspecified_values, 'unknown')
    df_cleaning[col] = collapsed.fillna('unknown')

# Pass 2: any null remaining in any column -> 'unknown'.
for col in df_cleaning.columns:
    filled = df_cleaning[col].fillna('unknown')
    df_cleaning[col] = filled

def simplify_employment_type(x):
    """Collapse a raw employment-type value into a coarse category.

    Missing values map to 'unknown'. Any other value is stripped of
    surrounding whitespace and lower-cased, then:

    * 'full-time' and 'part-time' are returned as they are (kept separate),
    * 'contract' and 'temporary' both become 'non-permanent',
    * everything else (including 'other', 'unknown' and '') becomes 'unknown'.
    """
    if pd.isna(x):
        return 'unknown'

    categories = {
        'full-time': 'full-time',
        'part-time': 'part-time',
        'contract': 'non-permanent',
        'temporary': 'non-permanent',
    }
    normalised = x.strip().lower()
    return categories.get(normalised, 'unknown')
# Add the coarse employment category as a new column next to the raw value.
employment_type_raw = df_cleaning['employment_type']
df_cleaning['employment_type_clean'] = employment_type_raw.apply(simplify_employment_type)

def comparison_key(row):
    """Return the tuple used to decide whether two postings are duplicates.

    The key is (location, title, description, requirements, employment type),
    where the employment type is taken from 'employment_type_clean' and is
    replaced by None when it equals 'unknown'.
    """
    employment = row['employment_type_clean']
    if employment == 'unknown':
        employment = None

    text_fields = ('location', 'title', 'description', 'requirements')
    return tuple(row[field] for field in text_fields) + (employment,)

# One hashable key per posting, built row by row from the comparison columns.
df_cleaning['dedup_key'] = df_cleaning.apply(comparison_key, axis=1)

# Keep the first posting for each key; later rows with the same key are dropped.
is_repeat = df_cleaning.duplicated(subset=['dedup_key'], keep='first')
df_nodup = df_cleaning[~is_repeat]

df_nodup



Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,employment_type_clean,dedup_key
0,1,Marketing Intern,"US, NY, New York",Marketing,unknown,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,unknown,0,1,0,Other,Internship,unknown,unknown,Marketing,0,unknown,"(US, NY, New York, Marketing Intern, Food52, a..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,unknown,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,unknown,unknown,Marketing and Advertising,Customer Service,0,full-time,"(NZ, , Auckland, Customer Service - Cloud Vide..."
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",unknown,unknown,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,unknown,0,1,0,unknown,unknown,unknown,unknown,unknown,0,unknown,"(US, IA, Wever, Commissioning Machinery Assist..."
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,unknown,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,full-time,"(US, DC, Washington, Account Executive - Washi..."
4,5,Bill Review Manager,"US, FL, Fort Worth",unknown,unknown,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,full-time,"(US, FL, Fort Worth, Bill Review Manager, JOB ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,unknown,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,unknown,Computer Software,Sales,0,full-time,"(CA, ON, Toronto, Account Director - Distribut..."
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,unknown,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,full-time,"(US, PA, Philadelphia, Payroll Accountant, The..."
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",unknown,unknown,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,unknown,0,0,0,Full-time,unknown,unknown,unknown,unknown,0,full-time,"(US, TX, Houston, Project Cost Control Staff E..."
17878,17879,Graphic Designer,"NG, LA, Lagos",unknown,unknown,unknown,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,unknown,Professional,Graphic Design,Design,0,non-permanent,"(NG, LA, Lagos, Graphic Designer, Nemsia Studi..."


In [79]:
# Alternative deduplication on the raw frame: rows count as repeats when they
# agree on the four columns below; the first occurrence is kept and the index
# is renumbered from zero.
dup_cols = ['description', 'requirements', 'company_profile', 'required_experience']
dup_mask = df[dup_cols].duplicated(keep='first')
first_occurrences = df[~dup_mask]
df_cleaned = first_occurrences.reset_index(drop=True)
print(f"Original shape: {df.shape}, Cleaned shape: {df_cleaned.shape}")

Original shape: (17880, 18), Cleaned shape: (15423, 18)


In [6]:
# Binary check: pairwise correlation between the columns that take exactly two
# distinct values.
def _as_flag(column):
    """Map 'Yes'/'No'/True/False to 1/0 for object columns; return other columns unchanged."""
    if column.dtypes == 'object':
        return column.map({'Yes': 1, 'No': 0, True: 1, False: 0})
    return column


binary_cols = [name for name in df.columns if df[name].nunique() == 2]
df_binary = df[binary_cols].apply(_as_flag)
corr_matrix = df_binary.corr()
print(corr_matrix)

                  telecommuting  has_company_logo  has_questions  fraudulent
telecommuting          1.000000         -0.019836       0.020345    0.034523
has_company_logo      -0.019836          1.000000       0.233932   -0.261971
has_questions          0.020345          0.233932       1.000000   -0.091627
fraudulent             0.034523         -0.261971      -0.091627    1.000000


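`has_company_logo` and `fraudulent` are negatively correlated (r ≈ -0.26): postings without a company logo are more likely to be fraudulent.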

In [7]:
# Perform TF-IDF on the job title and correlate every term with the
# `fraudulent` column.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
# Null titles are vectorised as empty strings so every row yields a TF-IDF vector.
tfidf_matrix = tfidf.fit_transform(df['title'].fillna(''))
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
# `.values` assigns the labels by position; `tfidf_df` has a fresh default index.
tfidf_df['fraudulent'] = df['fraudulent'].values
# Only the correlations against `fraudulent` are needed, so correlate each
# column with that single Series instead of building the full
# term-by-term correlation matrix and discarding all but one column.
# The label's self-correlation (1.0) is still included, and the result keeps
# the name 'fraudulent' as before.
correlations = (
    tfidf_df.corrwith(tfidf_df['fraudulent'])
    .rename('fraudulent')
    .sort_values(ascending=False)
)

In [8]:
# Title terms ranked by their correlation with the `fraudulent` column,
# highest first. This ranks strength of association, not how common a word is.
print(correlations)

fraudulent    1.000000
earn          0.183713
payroll       0.183350
entry         0.182795
daily         0.181072
                ...   
designer     -0.036949
abroad       -0.044762
teacher      -0.047209
english      -0.047576
developer    -0.058592
Name: fraudulent, Length: 4567, dtype: float64
