In [1]:
import pandas as pd
# Load the raw job-postings CSV; the file is expected one directory above the working directory.
RAW_CSV_PATH = '../fake_job_postings.csv'
df = pd.read_csv(RAW_CSV_PATH)

In [2]:
# Names of the columns that hold exactly two distinct non-null values.
distinct_counts = df.nunique()
binary_cols = distinct_counts.index[distinct_counts.to_numpy() == 2].tolist()

In [2]:
# Check for null values: percentage of missing entries in every column,
# computed separately for each value of the 'fraudulent' label.
#
# The boolean null mask is built once and averaged per label. Grouping the mask
# by the external `fraudulent` Series replaces groupby(...).apply(lambda ...):
# it avoids a Python-level call per group and does not rely on apply() handing
# the grouping column to the function, which recent pandas versions deprecate.
# The 'fraudulent' column itself still appears in the result.
null_mask = df.isnull()
grouped_null_percentage = null_mask.groupby(df['fraudulent']).mean() * 100
# Transpose so each row is a column of `df` and each label value gets its own column.
grouped_null_percentage = grouped_null_percentage.T.round(2)
print(grouped_null_percentage)


fraudulent               0      1
job_id                0.00   0.00
title                 0.00   0.00
location              1.92   2.19
department           64.75  61.32
salary_range         84.45  74.25
company_profile      15.99  67.78
description           0.00   0.12
requirements         14.94  17.78
benefits             40.25  42.03
telecommuting         0.00   0.00
has_company_logo      0.00   0.00
has_questions         0.00   0.00
employment_type      18.98  27.83
required_experience  38.88  50.23
required_education   44.99  52.08
industry             27.20  31.76
function             35.96  38.91
fraudulent            0.00   0.00


In [3]:
# Remove rows that repeat an earlier row in every column other than 'job_id';
# the first occurrence of each repeated row is the one that is kept.
non_id_columns = df.columns.difference(['job_id'], sort=False).tolist()
df_dedup = df.drop_duplicates(subset=non_id_columns)
print(f"Original shape: {df.shape}, Deduplicated shape: {df_dedup.shape}")

Original shape: (17880, 18), Deduplicated shape: (17599, 18)


In [4]:
# Columns to consider for identifying duplicates (ignore 'job_id' and 'fraudulent')
cols_to_check = df.columns.difference(['job_id', 'fraudulent'])
key_cols = list(cols_to_check)

# Identify duplicates based on all columns except 'job_id' and 'fraudulent'.
# keep=False flags every member of a duplicate set, not only the later repeats.
dup_mask = df.duplicated(subset=cols_to_check, keep=False)
df_duplicates = df[dup_mask].sort_values(by=key_cols)
print(f"Found {df_duplicates.shape[0]} duplicated rows (ignoring job_id and fraudulent).")

# Check if duplicated rows have consistent 'fraudulent' labels.
# dropna=False is required here: duplicated() treats NaN as equal to NaN, but
# groupby() by default drops every group whose key contains a NaN. Several key
# columns are mostly null, so without dropna=False most duplicate groups would
# be silently excluded from this check and the count below would be misleading.
inconsistent_labels = (
    df_duplicates
    .groupby(key_cols, dropna=False)['fraudulent']
    .nunique()
)
inconsistent_count = (inconsistent_labels > 1).sum()
print(f"Number of groups with inconsistent 'fraudulent' labels: {inconsistent_count}")


Found 527 duplicated rows (ignoring job_id and fraudulent).
Number of groups with inconsistent 'fraudulent' labels: 0


Presence of duplicated rows when ignoring job_id and the target label; within each set of duplicated rows, all rows have the same target label.

fraudulent               0      1
job_id                0.00   0.00
title                 0.00   0.00
location              1.92   2.19
department           64.75  61.32
salary_range         84.45  74.25
company_profile      15.99  67.78
description           0.00   0.12
requirements         14.94  17.78
benefits             40.25  42.03
telecommuting         0.00   0.00
has_company_logo      0.00   0.00
has_questions         0.00   0.00
employment_type      18.98  27.83
required_experience  38.88  50.23
required_education   44.99  52.08
industry             27.20  31.76
function             35.96  38.91
fraudulent            0.00   0.00


In [6]:
# Binary check: pairwise correlation between the two-valued columns.
binary_cols = df.columns[(df.nunique() == 2).to_numpy()].tolist()

# Numeric codes for textual / boolean flags; values not listed here map to NaN.
flag_codes = {'Yes': 1, 'No': 0, True: 1, False: 0}


def _encode_flags(series):
    """Map an object-dtype column through `flag_codes`; return other columns unchanged."""
    if series.dtypes == 'object':
        return series.map(flag_codes)
    return series


df_binary = df[binary_cols].apply(_encode_flags)
corr_matrix = df_binary.corr()
print(corr_matrix)

                  telecommuting  has_company_logo  has_questions  fraudulent
telecommuting          1.000000         -0.019836       0.020345    0.034523
has_company_logo      -0.019836          1.000000       0.233932   -0.261971
has_questions          0.020345          0.233932       1.000000   -0.091627
fraudulent             0.034523         -0.261971      -0.091627    1.000000


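has_company_logo and fraudulent are inversely related (correlation ≈ -0.26): postings without a company logo are more likely to be fraudulent.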

In [3]:
# Perform TF-IDF on the job title and correlate every term weight with the fraudulent column
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
# Missing titles are treated as empty strings so the vectorizer receives only str input.
tfidf_matrix = tfidf.fit_transform(df['title'].fillna(''))
# NOTE: toarray() densifies the TF-IDF matrix (rows x vocabulary size); this can be large in memory.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
# .values bypasses index alignment so labels are attached positionally.
tfidf_df['fraudulent'] = df['fraudulent'].values
# Only the correlations against the label are needed. corrwith() computes exactly
# those, instead of building the full term-by-term matrix with corr() and keeping
# a single column. The label's correlation with itself (1.0) is retained and the
# series is named 'fraudulent', matching what corr()['fraudulent'] produced.
correlations = (
    tfidf_df
    .corrwith(tfidf_df['fraudulent'])
    .rename('fraudulent')
    .sort_values(ascending=False)
)

In [4]:
# Title terms ranked by their correlation with the fraudulent label
# (strongest positive correlation first, strongest negative last).
print(correlations)

fraudulent    1.000000
earn          0.183713
payroll       0.183350
entry         0.182795
daily         0.181072
                ...   
designer     -0.036949
abroad       -0.044762
teacher      -0.047209
english      -0.047576
developer    -0.058592
Name: fraudulent, Length: 4567, dtype: float64
