In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import numpy as np
import pandas as pd

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer



In [5]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
pd.set_option('display.max_colwidth', None)

In [8]:
train_df = pd.read_csv('/content/drive/MyDrive/Anvesh_June_Hackathon/train_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Anvesh_June_Hackathon/test_data.csv')

# EDA and Data Wrangling

In [9]:
train_df.shape

(14304, 18)

In [10]:
test_df.shape

(3576, 17)

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14304 entries, 0 to 14303
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               14304 non-null  int64 
 1   title                14304 non-null  object
 2   location             14024 non-null  object
 3   department           5029 non-null   object
 4   salary_range         2283 non-null   object
 5   company_profile      11632 non-null  object
 6   description          14303 non-null  object
 7   requirements         12172 non-null  object
 8   benefits             8501 non-null   object
 9   telecommuting        14304 non-null  int64 
 10  has_company_logo     14304 non-null  int64 
 11  has_questions        14304 non-null  int64 
 12  employment_type      11547 non-null  object
 13  required_experience  8629 non-null   object
 14  required_education   7805 non-null   object
 15  industry             10378 non-null  object
 16  func

In [12]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3576 entries, 0 to 3575
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               3576 non-null   int64 
 1   title                3576 non-null   object
 2   location             3510 non-null   object
 3   department           1304 non-null   object
 4   salary_range         585 non-null    object
 5   company_profile      2940 non-null   object
 6   description          3576 non-null   object
 7   requirements         3012 non-null   object
 8   benefits             2167 non-null   object
 9   telecommuting        3576 non-null   int64 
 10  has_company_logo     3576 non-null   int64 
 11  has_questions        3576 non-null   int64 
 12  employment_type      2862 non-null   object
 13  required_experience  2201 non-null   object
 14  required_education   1970 non-null   object
 15  industry             2599 non-null   object
 16  functi

In [13]:
train_df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [14]:
train_df.isna().sum()

Unnamed: 0,0
job_id,0
title,0
location,280
department,9275
salary_range,12021
company_profile,2672
description,1
requirements,2132
benefits,5803
telecommuting,0


missing data might itself be a signal of fraud. So, I am going to preserve the missingness.




In [15]:
# Text columns — fill with empty string
text_cols = ['department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits']
train_df[text_cols] = train_df[text_cols].fillna('')

# Categorical columns — fill with 'missing'
cat_cols = ['location', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']
train_df[cat_cols] = train_df[cat_cols].fillna('missing')

# Optional: Add missing indicators (useful for tree-based models like XGBoost)
for col in text_cols + cat_cols:
    train_df[f'{col}_missing'] = train_df[col].apply(lambda x: 1 if x in ['', 'missing'] else 0)


In [16]:
train_df.isna().sum()


Unnamed: 0,0
job_id,0
title,0
location,0
department,0
salary_range,0
company_profile,0
description,0
requirements,0
benefits,0
telecommuting,0


In [17]:
train_df.shape

(14304, 30)

In [18]:
train_df.head(2)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,company_profile_missing,description_missing,requirements_missing,benefits_missing,location_missing,employment_type_missing,required_experience_missing,required_education_missing,industry_missing,function_missing
0,7531,Contact Center Representatives,"US, VA, Virginia Beach",,,"Tidewater Finance Co. was established in 1992 for the initial purpose of purchasing, and servicing retail installment contracts. There are two divisions: Tidewater Credit Services, providing indirect consumer retail finance options and Tidewater Motor Credit, providing indirect consumer auto financing. We remain committed to offering a partnership with the dealers and consumers to create a WIN-WIN-WIN situation. Our success relies solely on the success of our dealers and our consumers.Full time positions include the following benefits:40 vacation hours after 6 months of employment, 80 vacation hours after 1 year of employment6 paid holidays as well as an anniversary holiday benefitPaid personal and sick leave after 90 days of employmentFull benefits to include health, dental, life and disability insuranceA 401k plan with a company match after 6 months of employment based upon a quarterly entry dateIncentive bonuses for individual and team goals (certain positions)Bilingual Spanish eligible for differential pay","Tidewater Finance Company, located in Virginia Beach, VA has full and part-time positions available for Contact Center Representatives. We provide diverse lending solutions for our dealer network while promoting employee integrity, teamwork and an uncompromised level of customer service. The position requires the following qualifications: A minimum of 1 year in a Call Center environment or equivalent customer service experience Collections experience a plus!Ability to communicate effectively and professionally both verbally and in writingProficient typing skillsMust be able to work late nights and Saturdays as scheduled by ManagementAutomatic dialer experience a plusStrong negotiation skillsBilingual in Spanish is a plus! Primary responsibilities include, but are not limited to the following: Make and receive calls through automated dialerTake payments on past due accountsHandle customer service related issuesProcess related paperworkAdherence to company policies and procedures in addition to compliance of state and federal regulations We offer a competitive salary based on experience and a comprehensive benefits package. Interested candidates may apply in person at:6520 Indian River RoadVirginia Beach, VA 23464 If you prefer you may submit your resume via e-mail to #EMAIL_169ac3804e2da6e0514e5ef76c29f157f41d80451b486889d9aa#PHONE_4dbd33c1dede3cec472e02df8f201e27aa330a9a201578720111c840de9d8117## or fax your resume to the Human Resources Department at #PHONE_8f86665c8a76d925f761287bb38d6bb5f440845f2a5fa712361f255943a8b21b#.Tidewater Finance Company is an equal opportunity employer in all aspects of employment without regard to race, age, sex, marital status, religion, disability, military status or any other characteristic or status protected by law. Tidewater Finance Company includes Tidewater Motor Credit and Tidewater Credit Services.",The position requires the following qualifications: A minimum of 1 year in a Call Center environment or equivalent customer service experience Collections experience a plus!Ability to communicate effectively and professionally both verbally and in writingProficient typing skillsMust be able to work late nights and Saturdays as scheduled by ManagementAutomatic dialer experience a plusStrong negotiation skillsBilingual in Spanish is a plus!,"Our company offers a competitive salary plus BONUSES as well as a comprehensive benefits package to our full-time employees including:40 vacation hours after 6 months of employment, 80 vacation hours after 1 year of employment6 paid holidays as well as an anniversary holiday benefitPaid personal and sick leave after 90 days of employmentHealth, dental, life, and disability insurance as well as AFLAC supplemental insuranceA 401K plan with a company match after six months of employment, however, we have quarterly enrollment periods.",0,...,0,0,0,0,0,0,0,0,0,0
1,130,Customer Service Associate,"US, TX, Dallas",,,"Novitex Enterprise Solutions, formerly Pitney Bowes Management Services, delivers innovative document and communications management solutions that help companies around the world drive business process efficiencies, increase productivity, reduce costs and improve customer satisfaction. For almost 30 years, clients have turned to us to integrate and optimize their enterprise-wide business processes to empower employees, increase productivity and maximize results. As a trusted partner, we continually focus on delivering secure, technology-enabled document and communications solutions that improve our clients' work processes, enhance their customer interactions and drive growth.","The Customer Service Associate will be based in Dallas, TX. The right candidate will be an integral part of our talented team, supporting our continued growth.Responsibilities:Maintain highest level of customer care while demonstrating a friendly and cooperative attitudeEffectively address and resolve client and customer concerns and/or complaintsDemonstrate flexibility in satisfying customer in high demand environmentEnsure that deadlines are met; prioritize workload assignmentsProvide set-up of audio visual equipment in conference rooms as needed prior to scheduled meeting times and removal of audio visual equipment after meetings are completed.Assist clients as needed with guidance on operating equipment provided by conference management.Conduct quarterly equipment inventory and provide a copy of inventory to designated representative as requested. Arrange for audio visual equipment repair.Post daily conference room schedules in designated locations as information for attendees.Distribute keys to conference rooms and audio visual equipment.Notify appropriate entity of any needed maintenance after each inspection.Refer catering to designated food services supplier as necessary.Provide and maintain appropriate records of all charges to Company, supply usage, repair information, rental information, etc.Check that meeting rooms are set-up correctly, with the proper equipment and amenities, prior to the start of meetingsOperate and manage the conference rooms in a manner to meet or exceed the standards required by the client.Maintaining all logs and reporting documentation with attention to detailProvide special project and administrative support on an ad hoc basis.Participate in Novitex and customer mandated trainingParticipate in cross-trainingAdhering to all safety proceduresConsistently adhering to business practice guidelines and policiesProvide back up support to mail services functions as neededTake direction from team leader or service delivery manager","QualificationsMinimum of 1 year customer service related experience required.Minimum of 6 months conference room related work experienceExceptional Customer Service Skills &amp; ProfessionalismProvide a professional manner and appearance when on duty.Knowledge of audio/visual equipment requiredFlexible schedule / Stay as needed to cover meeting needsOvertime &amp; Weekend Coverage as neededStrong planning, prioritization and organizational skillsAbility to multi task and manage multiple priorities and deadlines is criticalSelf-motivated and possess a strong sense of responsibilityStrong attention to detail and follow throughExcellent communication skills both verbal and writtenComputer proficiency in email environments, Microsoft Office Suite or similar programsWork effectively with a diverse range of individuals and groupsWillingness to cross-training for other job functionsAbility to effectively work individually or within a team in a fast paced environmentAbility to lift and/or move items up to 50 pounds or maximum allowed by current State Law with or without accommodationsAbility to sit, stand and/or walk for long periods of time with or without reasonable accommodationAbility to meet employer's attendance policySubmit to a pre-employment drug screening and criminal background checkHigh school diploma or equivalent (GED) required",,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
train_df['description'][:5]

Unnamed: 0,description
0,"Tidewater Finance Company, located in Virginia Beach, VA has full and part-time positions available for Contact Center Representatives. We provide diverse lending solutions for our dealer network while promoting employee integrity, teamwork and an uncompromised level of customer service. The position requires the following qualifications: A minimum of 1 year in a Call Center environment or equivalent customer service experience Collections experience a plus!Ability to communicate effectively and professionally both verbally and in writingProficient typing skillsMust be able to work late nights and Saturdays as scheduled by ManagementAutomatic dialer experience a plusStrong negotiation skillsBilingual in Spanish is a plus! Primary responsibilities include, but are not limited to the following: Make and receive calls through automated dialerTake payments on past due accountsHandle customer service related issuesProcess related paperworkAdherence to company policies and procedures in addition to compliance of state and federal regulations We offer a competitive salary based on experience and a comprehensive benefits package. Interested candidates may apply in person at:6520 Indian River RoadVirginia Beach, VA 23464 If you prefer you may submit your resume via e-mail to #EMAIL_169ac3804e2da6e0514e5ef76c29f157f41d80451b486889d9aa#PHONE_4dbd33c1dede3cec472e02df8f201e27aa330a9a201578720111c840de9d8117## or fax your resume to the Human Resources Department at #PHONE_8f86665c8a76d925f761287bb38d6bb5f440845f2a5fa712361f255943a8b21b#.Tidewater Finance Company is an equal opportunity employer in all aspects of employment without regard to race, age, sex, marital status, religion, disability, military status or any other characteristic or status protected by law. Tidewater Finance Company includes Tidewater Motor Credit and Tidewater Credit Services."
1,"The Customer Service Associate will be based in Dallas, TX. The right candidate will be an integral part of our talented team, supporting our continued growth.Responsibilities:Maintain highest level of customer care while demonstrating a friendly and cooperative attitudeEffectively address and resolve client and customer concerns and/or complaintsDemonstrate flexibility in satisfying customer in high demand environmentEnsure that deadlines are met; prioritize workload assignmentsProvide set-up of audio visual equipment in conference rooms as needed prior to scheduled meeting times and removal of audio visual equipment after meetings are completed.Assist clients as needed with guidance on operating equipment provided by conference management.Conduct quarterly equipment inventory and provide a copy of inventory to designated representative as requested. Arrange for audio visual equipment repair.Post daily conference room schedules in designated locations as information for attendees.Distribute keys to conference rooms and audio visual equipment.Notify appropriate entity of any needed maintenance after each inspection.Refer catering to designated food services supplier as necessary.Provide and maintain appropriate records of all charges to Company, supply usage, repair information, rental information, etc.Check that meeting rooms are set-up correctly, with the proper equipment and amenities, prior to the start of meetingsOperate and manage the conference rooms in a manner to meet or exceed the standards required by the client.Maintaining all logs and reporting documentation with attention to detailProvide special project and administrative support on an ad hoc basis.Participate in Novitex and customer mandated trainingParticipate in cross-trainingAdhering to all safety proceduresConsistently adhering to business practice guidelines and policiesProvide back up support to mail services functions as neededTake direction from team leader or service delivery manager"
2,"We are looking for a dedicated and passionate Software Test Analyst who is a team players with high personal standards and have a strong eye for detail, to join our team in Auckland.As Test Analyst, you will be responsible for planning and implementing test-scripts for our automated and manual test tools, as well as the continuous improvement of testing practices within the team. You will be working with our delivery teams and clients to figure out test-related requirements and translating them into our testing processes.We want those who:have at least 4+ years experience with functional testing in an Agile environmenthave a proven testing background with experience in web application developmenthave experience in developing test plans (manual and automated) for functional testinghave experience in both blackbox and whitebox testinghave excellent attention to detail and a quality driven passion for good softwarehave the ability to communicate effectively both with clients and colleagues alikecan deal with ambiguity and effectively cope with changeare quick learnershave a strong customer focus and are dedicated to meeting the expectations of internal and external customersIf this is something that interests you, we’d love to hear from you. Please apply below."
3,"As a Sales Representative, you will provide assistance to our customers as they purchase the materials and tools they need for a wide variety of roofing, siding, and window/door replacement projects. From the moment you greet customers until their sales have been finalized, you will provide them with the best in customer service and exterior building supply expertise.Your specific duties as a Sales Representative may include:Determining customers’ needs and recommending appropriate products and solutionsFollowing ABC’s product/supply checklist for each customer’s specific job and upselling additional products and supplies for that jobAnswering customer questions and offering product adviceOrdering products from other ABC Supply branches when necessaryAccepting payment and applying it to the appropriate customer accountArranging with the warehouse for customer product pickupFollowing-up on each delivery to ensure that shipment arrived on time with all items accounted forBalancing out cash drawers and preparing bank deposits at the end of each dayArranging product displays and layouts to maximize effectivenessReordering products to keep the store and warehouse shelves well stockedGiving out comment cards to customersAddressing and resolving customer complaints when necessary"
4,"MeUndies is a lifestyle brand that is transforming the way people perceive and purchase their basics. As a vertically integrated, direct-to-consumer company we do it all: from design and manufacturing, to marketing and web design &amp; development, to fulfillment and shipping, MeUndies delivers The World’s Most Comfortable Basics with a relentless emphasis on quality and service. Our commitment is to deliver a memorable customer experience, and our Customer Experience team spearheads this effort by overseeing strategy, technology implementation, and execution. Highlights:Founded by entrepreneurial Founders passionate about transforming ecommerce.Has experienced double digit year-over-year growth the last several years.Raised over $2M in funding. THE OPPORTUNITYOur commitment is to share MeUndies with the rest of the world, and our Marketing team spearheads this effort by crafting and telling our brand story through all relevant channels. We’re looking for a rockstar Content Marketing/SEO Manager to create, execute, and analyze our content marketing strategy.Specifically…Oversee and successfully execute MeUndies Content/SEO strategy.Grow MeUndies organic traffic through white-hat techniques.Analyze and track organic search results and movement across all branded and category terms.Collaborate with the Product Development, Web Development, Creative, and Content teams to create keyword-rich content that is authentic and compelling.Build content partnerships with external creators and communities to create positive earned media and increase content sharing.Analyze data to diagnose performance issues, uncover new SEO strategies, and optimize MeUndies content marketing strategy."


In [20]:
print(train_df.iloc[1]['description'])


The Customer Service Associate will be based in Dallas, TX. The right candidate will be an integral part of our talented team, supporting our continued growth.Responsibilities:Maintain highest level of customer care while demonstrating a friendly and cooperative attitudeEffectively address and resolve client and customer concerns and/or complaintsDemonstrate flexibility in satisfying customer in high demand environmentEnsure that deadlines are met; prioritize workload assignmentsProvide set-up of audio visual equipment in conference rooms as needed prior to scheduled meeting times and removal of audio visual equipment after meetings are completed.Assist clients as needed with guidance on operating equipment provided by conference management.Conduct quarterly equipment inventory and provide a copy of inventory to designated representative as requested. Arrange for audio visual equipment repair.Post daily conference room schedules in designated locations as information for attendees.Dist

In [21]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = tokenizer.tokenize(text)
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned)


In [22]:
# train_df['clean_title'] = train_df['title'].fillna('').apply(preprocess)
train_df['clean_description'] = train_df['description'].fillna('').apply(preprocess)
train_df['clean_requirements'] = train_df['requirements'].fillna('').apply(preprocess)


In [23]:
# Add preprocessing for other text fields
train_df['clean_company_profile'] = train_df['company_profile'].fillna('').apply(preprocess)
train_df['clean_benefits'] = train_df['benefits'].fillna('').apply(preprocess)
# train_df['clean_department'] = train_df['department'].fillna('').apply(preprocess)


For making the salary information more meaningful, I am going to split the salary data

In [24]:
def parse_salary(s):
    try:
        low, high = s.split('-')
        return float(low.strip()), float(high.strip())
    except:
        return np.nan, np.nan

salary_split = train_df['salary_range'].fillna('').apply(parse_salary)
train_df['salary_min'] = salary_split.apply(lambda x: x[0])
train_df['salary_max'] = salary_split.apply(lambda x: x[1])


In [25]:
# Replace NaNs in salary_min and salary_max with -1
train_df['salary_min'] = train_df['salary_min'].fillna(-1)
train_df['salary_max'] = train_df['salary_max'].fillna(-1)

In [26]:
train_df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'department_missing', 'salary_range_missing',
       'company_profile_missing', 'description_missing',
       'requirements_missing', 'benefits_missing', 'location_missing',
       'employment_type_missing', 'required_experience_missing',
       'required_education_missing', 'industry_missing', 'function_missing',
       'clean_description', 'clean_requirements', 'clean_company_profile',
       'clean_benefits', 'salary_min', 'salary_max'],
      dtype='object')

To decide what kind of encoding suits better for categorical columns, let me observe the values of such columns....

In [27]:
train_df['employment_type'].unique()

array(['Full-time', 'missing', 'Contract', 'Temporary', 'Part-time',
       'Other'], dtype=object)

In [28]:
train_df['has_company_logo'].unique()

array([1, 0])

In [29]:
train_df['industry'].unique()

array(['Financial Services', 'Telecommunications',
       'Information Technology and Services', 'Building Materials',
       'Internet', 'missing', 'Computer Games', 'Consumer Services',
       'Marketing and Advertising', 'Environmental Services',
       'Medical Practice', 'Education Management',
       'Hospital & Health Care', 'Computer Software', 'Executive Office',
       'Events Services', 'Mechanical or Industrial Engineering',
       'Retail', 'Apparel & Fashion', 'Health, Wellness and Fitness',
       'Cosmetics', 'E-Learning', 'Food & Beverages',
       'Electrical/Electronic Manufacturing', 'Real Estate', 'Design',
       'Online Media', 'Facilities Services', 'Aviation & Aerospace',
       'Legal Services', 'Hospitality', 'Banking', 'Automotive',
       'Chemicals', 'Broadcast Media', 'Oil & Energy',
       'Computer Hardware', 'Computer Networking',
       'Nonprofit Organization Management', 'Entertainment',
       'Human Resources', 'Venture Capital & Private Equity',


In [30]:
train_df['function'].unique()

array(['Customer Service', 'missing', 'Sales', 'Marketing', 'Management',
       'Information Technology', 'Administrative', 'Consulting',
       'Education', 'Health Care Provider', 'Accounting/Auditing',
       'Legal', 'Engineering', 'Design', 'Business Development',
       'Project Management', 'Business Analyst', 'Other',
       'Human Resources', 'Writing/Editing', 'Data Analyst', 'Finance',
       'Product Management', 'Research', 'Manufacturing',
       'Public Relations', 'Training', 'Art/Creative',
       'Quality Assurance', 'Advertising', 'Distribution',
       'Strategy/Planning', 'General Business', 'Production',
       'Financial Analyst', 'Supply Chain', 'Science', 'Purchasing'],
      dtype=object)

In [31]:
train_df['required_education'].unique()

array(['Unspecified', 'High School or equivalent', 'missing',
       "Bachelor's Degree", "Master's Degree",
       'Some College Coursework Completed', 'Associate Degree',
       'Professional', 'Certification', 'Vocational - HS Diploma',
       'Vocational', 'Doctorate', 'Some High School Coursework',
       'Vocational - Degree'], dtype=object)

In [32]:
train_df['telecommuting'].unique()

array([0, 1])

In [33]:
train_df['required_experience'].unique()

array(['Entry level', 'Mid-Senior level', 'missing', 'Associate',
       'Not Applicable', 'Executive', 'Director', 'Internship'],
      dtype=object)

In [34]:
train_df.shape

(14304, 36)

For the descriptive text fields, gonna use SBERT to preserve the semantic meaning, so that my model won't be dumb ;) ..

In [35]:
!pip install -q sentence-transformers xgboost


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [36]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, classification_report
import xgboost as xgb


## Encoding the features..

In [37]:
import torch
# MiniLM is fast and accurate
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Ensure GPU usage if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sbert_model = sbert_model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [38]:
# Define long text columns for semantic embedding
long_text_cols = ['clean_description', 'clean_requirements', 'clean_company_profile',\
                  'clean_benefits'
                  ]

# Apply SBERT embedding to each column and stack
def sbert_encode(df, col):
    texts = df[col].fillna('').tolist()
    return sbert_model.encode(texts, show_progress_bar=True)

# Create SBERT embeddings and concatenate them
sbert_embeddings = np.hstack([sbert_encode(train_df, col) for col in long_text_cols])


Batches:   0%|          | 0/447 [00:00<?, ?it/s]

Batches:   0%|          | 0/447 [00:00<?, ?it/s]

Batches:   0%|          | 0/447 [00:00<?, ?it/s]

Batches:   0%|          | 0/447 [00:00<?, ?it/s]

In [39]:
sbert_embeddings

array([[ 0.0154575 , -0.17069474,  0.08136722, ..., -0.6541781 ,
         0.0893632 ,  0.22160357],
       [-0.14306755, -0.14279917,  0.00204702, ..., -0.09824523,
         0.42682087,  0.2178519 ],
       [ 0.12826586, -0.03878329, -0.20056129, ..., -0.09824523,
         0.42682087,  0.2178519 ],
       ...,
       [ 0.37198782,  0.23163038, -0.5056609 , ..., -0.09824535,
         0.42682058,  0.217852  ],
       [-0.3483018 , -0.14415611, -0.45495147, ..., -0.09824535,
         0.42682058,  0.217852  ],
       [ 0.10086162,  0.21416064,  0.05010221, ..., -0.09824535,
         0.42682058,  0.217852  ]], dtype=float32)

In [40]:
#Normalize SBERT embeddings
scaler = StandardScaler()
sbert_embeddings_scaled = scaler.fit_transform(sbert_embeddings)

In [41]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['location', 'employment_type']

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = train_df[col].astype(str)
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le

In [42]:
le_title = LabelEncoder()
train_df['title_le'] = le_title.fit_transform(train_df['title'].fillna('missing'))

le_department = LabelEncoder()
train_df['department_le'] = le_department.fit_transform(train_df['department'].fillna('missing'))

In [43]:
!pip install category_encoders

# from category_encoders import TargetEncoder

# target_encoder = TargetEncoder()
# train_df['industry_te'] = target_encoder.fit_transform(train_df['industry'], train_df['fraudulent'])


Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [44]:
from category_encoders import TargetEncoder

# Fitting on train_df
target_encoder_industry = TargetEncoder()
train_df['industry_te'] = target_encoder_industry.fit_transform(train_df['industry'], train_df['fraudulent'])

target_encoder_function = TargetEncoder()
train_df['function_te'] = target_encoder_function.fit_transform(train_df['function'], train_df['fraudulent'])

target_encoder_education = TargetEncoder()
train_df['required_education_te'] = target_encoder_education.fit_transform(train_df['required_education'], train_df['fraudulent'])

target_encoder_req_exp = TargetEncoder()
train_df['required_experience'] = target_encoder_req_exp.fit_transform(train_df['required_experience'], train_df['fraudulent'])

In [45]:
# train_df['function_te'] = target_encoder.fit_transform(
#     train_df['function'],
#     train_df['fraudulent']
# )

In [46]:
# education_encoder = TargetEncoder()
# train_df['required_education_te'] = education_encoder.fit_transform(
#     train_df['required_education'],
#     train_df['fraudulent']
# )

In [47]:
# train_df['required_experience'] = target_encoder.fit_transform(train_df['required_experience'], train_df['fraudulent'])

In [48]:
train_df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'department_missing', 'salary_range_missing',
       'company_profile_missing', 'description_missing',
       'requirements_missing', 'benefits_missing', 'location_missing',
       'employment_type_missing', 'required_experience_missing',
       'required_education_missing', 'industry_missing', 'function_missing',
       'clean_description', 'clean_requirements', 'clean_company_profile',
       'clean_benefits', 'salary_min', 'salary_max', 'title_le',
       'department_le', 'industry_te', 'function_te', 'required_education_te'],
      dtype='object')

In [49]:
train_df.shape

(14304, 41)

In [50]:
structured_cols = [
    'telecommuting', 'has_company_logo', 'has_questions',
    'salary_min', 'salary_max',
    'industry_te', 'function_te', 'required_education_te',
    'title_le', 'department_le', 'location', 'employment_type'
] + [col for col in train_df.columns if col.endswith('_missing')]


In [51]:
structured_data = train_df[structured_cols].values

In [52]:
train_df[structured_cols].dtypes


Unnamed: 0,0
telecommuting,int64
has_company_logo,int64
has_questions,int64
salary_min,float64
salary_max,float64
industry_te,float64
function_te,float64
required_education_te,float64
title_le,int64
department_le,int64


In [53]:
# Normalize structured data
structured_scaler = StandardScaler()
structured_data_scaled = structured_scaler.fit_transform(structured_data)


In [54]:
train_df.columns


Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'department_missing', 'salary_range_missing',
       'company_profile_missing', 'description_missing',
       'requirements_missing', 'benefits_missing', 'location_missing',
       'employment_type_missing', 'required_experience_missing',
       'required_education_missing', 'industry_missing', 'function_missing',
       'clean_description', 'clean_requirements', 'clean_company_profile',
       'clean_benefits', 'salary_min', 'salary_max', 'title_le',
       'department_le', 'industry_te', 'function_te', 'required_education_te'],
      dtype='object')

In [55]:
train_df = train_df.drop(columns=['job_id'])

# Final training input
X = np.hstack([sbert_embeddings_scaled, structured_data_scaled])

# Target variable
y = train_df['fraudulent'].values

## Understanding the target variable

In [56]:
train_df['fraudulent'].unique()

array([0, 1])

In [57]:
train_df['fraudulent'].value_counts()


Unnamed: 0_level_0,count
fraudulent,Unnamed: 1_level_1
0,13611
1,693


In [58]:
train_df.isna().sum()

Unnamed: 0,0
title,0
location,0
department,0
salary_range,0
company_profile,0
description,0
requirements,0
benefits,0
telecommuting,0
has_company_logo,0


Oh, this dataset is highly imbalanced. SO I am going to use SMOTE to oversample fraud class in training set

In [59]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


## now working on the Test dataset

In [60]:
# Text columns — fill with empty string
text_cols = ['department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits']
test_df[text_cols] = test_df[text_cols].fillna('')

# Categorical columns — fill with 'missing'
cat_cols = ['location', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']
test_df[cat_cols] = test_df[cat_cols].fillna('missing')

# Add missing indicators
for col in text_cols + cat_cols:
    test_df[f'{col}_missing'] = test_df[col].apply(lambda x: 1 if x in ['', 'missing'] else 0)


In [61]:
test_df['clean_description'] = test_df['description'].fillna('').apply(preprocess)
test_df['clean_requirements'] = test_df['requirements'].fillna('').apply(preprocess)
test_df['clean_company_profile'] = test_df['company_profile'].fillna('').apply(preprocess)
test_df['clean_benefits'] = test_df['benefits'].fillna('').apply(preprocess)


In [62]:
# Using the same long_text_cols as train
long_text_cols = ['clean_description', 'clean_requirements', 'clean_company_profile', 'clean_benefits']

# Apply SBERT encoding
def sbert_encode(df, col):
    texts = df[col].fillna('').tolist()
    return sbert_model.encode(texts, show_progress_bar=True)

In [63]:
sbert_embeddings_test = np.hstack([sbert_encode(test_df, col) for col in long_text_cols])


Batches:   0%|          | 0/112 [00:00<?, ?it/s]

Batches:   0%|          | 0/112 [00:00<?, ?it/s]

Batches:   0%|          | 0/112 [00:00<?, ?it/s]

Batches:   0%|          | 0/112 [00:00<?, ?it/s]

In [88]:
sbert_embeddings_test_scaled = scaler.transform(sbert_embeddings_test)

In [89]:
for col in ['location', 'employment_type']:
    le = label_encoders[col]  # Use fitted encoder from train
    known_classes = set(le.classes_)

    # Handle unseen labels by mapping them to -1
    test_df[col] = test_df[col].astype(str).apply(lambda x: le.transform([x])[0] if x in known_classes else -1)


In [90]:
# Transform on test_df using already fitted encoders
test_df['industry_te'] = target_encoder_industry.transform(test_df['industry'])
test_df['function_te'] = target_encoder_function.transform(test_df['function'])
test_df['required_education_te'] = target_encoder_education.transform(test_df['required_education'])
test_df['required_experience_te'] = target_encoder_req_exp.transform(test_df['required_experience'])

In [91]:
le_title = LabelEncoder()
test_df['title_le'] = le_title.fit_transform(test_df['title'].fillna('missing'))

le_department = LabelEncoder()
test_df['department_le'] = le_department.fit_transform(test_df['department'].fillna('missing'))

In [92]:
salary_split = test_df['salary_range'].fillna('').apply(parse_salary)
test_df['salary_min'] = salary_split.apply(lambda x: x[0])
test_df['salary_max'] = salary_split.apply(lambda x: x[1])


# Fill salary min/max with -1 just like train
test_df['salary_min'] = test_df['salary_min'].fillna(-1)
test_df['salary_max'] = test_df['salary_max'].fillna(-1)


In [69]:
test_df = test_df.drop(columns=['job_id'])


In [93]:
structured_test = test_df[structured_cols].values
structured_test_scaled = structured_scaler.transform(structured_test)  # same scaler


In [94]:
X_test = np.hstack([sbert_embeddings_test_scaled, structured_test_scaled])


In [95]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Train the XGBoost model
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

# xgb_model.fit(X_resampled, y_resampled)


In [73]:
# Predict on the test set
# y_pred = xgb_model.predict(X_test)

In [74]:
# y_pred

In [96]:
from sklearn.model_selection import train_test_split

# Split after SMOTE
X_train, X_val, y_train, y_val = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)


In [97]:
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_val)


Parameters: { "use_label_encoder" } are not used.



In [98]:
y_pred

array([1, 0, 1, ..., 1, 0, 0])

In [99]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("Accuracy Score:", accuracy_score(y_val, y_pred))


Confusion Matrix:
 [[2699   24]
 [   6 2716]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      2723
           1       0.99      1.00      0.99      2722

    accuracy                           0.99      5445
   macro avg       0.99      0.99      0.99      5445
weighted avg       0.99      0.99      0.99      5445

Accuracy Score: 0.9944903581267218


F1 Score is 0.99

# Testing on test_df

In [85]:
test_df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'department_missing', 'salary_range_missing', 'company_profile_missing',
       'description_missing', 'requirements_missing', 'benefits_missing',
       'location_missing', 'employment_type_missing',
       'required_experience_missing', 'required_education_missing',
       'industry_missing', 'function_missing', 'clean_description',
       'clean_requirements', 'clean_company_profile', 'clean_benefits',
       'industry_te', 'function_te', 'required_education_te',
       'required_experience_te', 'title_le', 'department_le', 'salary_min',
       'salary_max'],
      dtype='object')

In [101]:
test_probabilities = xgb_model.predict_proba(X_test)[:, 1]
test_predictions = xgb_model.predict(X_test)


In [102]:
test_df['fraud_probability'] = test_probabilities
test_df['predicted_label'] = test_predictions


In [103]:
test_df[['title', 'predicted_label', 'fraud_probability']].to_csv('test_predictions.csv', index=False)


# Saving the models

In [83]:
!pip install joblib



In [104]:
import joblib

# Save XGBoost model
joblib.dump(xgb_model, 'xgb_model.pkl')


['xgb_model.pkl']

In [105]:
# Save your fitted scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [81]:
#sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
sbert_model.save('sbert_encoder')  # This saves the model directory
