In [229]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [230]:
#Loading dataset
df = pd.read_csv(r"fake_job_postings.csv")
df

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [231]:
#Understanding data quality issue that might need imputation (missing values)
df.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [232]:
#Checking distribution of target variable to check for class imbalance and decide techniques for model evaluation 
df['fraudulent'].value_counts(normalize=True)

fraudulent
0    0.951566
1    0.048434
Name: proportion, dtype: float64

In [233]:
# Encodes department as binary flag (has_salary) and 
# fraud risk (salary_fraud_risk), drops original column to handle missingness and preserve fraud patterns.
# Keeping salaray_range as it is a good indicator for fake job posting

df['has_salary'] = df['salary_range'].notna().astype(int)
dept_fraud_rates = df.groupby('salary_range')['fraudulent'].mean().to_dict()
df['salary_fraud_risk'] = df['salary_range'].map(dept_fraud_rates)
df['salary_fraud_risk']= df['salary_fraud_risk'].fillna(0)  # Impute missing as 0% risk
df = df.drop(columns=['salary_range'])

In [234]:
# Dropping these coulms as job_id is an identifier and department has many missing values
df = df.drop(columns=['job_id','department'])
df

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,has_salary,salary_fraud_risk
0,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,0,0.0
1,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,0,0.0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,0,0.0
3,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,0,0.0
4,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,"CA, ON, Toronto",Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0,0,0.0
17876,Payroll Accountant,"US, PA, Philadelphia",WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,0,0.0
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0,0,0.0
17878,Graphic Designer,"NG, LA, Lagos",,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,0,0.0


In [235]:
# description is an critical feature for fraud detection (they contain key patterns/scam phrases). 
# So only including job posting with description not missing

df = df[df['description'].notna()]
df

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,has_salary,salary_fraud_risk
0,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,0,0.0
1,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,0,0.0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,0,0.0
3,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,0,0.0
4,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,"CA, ON, Toronto",Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0,0,0.0
17876,Payroll Accountant,"US, PA, Philadelphia",WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,0,0.0
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0,0,0.0
17878,Graphic Designer,"NG, LA, Lagos",,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,0,0.0


In [236]:
df.columns

Index(['title', 'location', 'company_profile', 'description', 'requirements',
       'benefits', 'telecommuting', 'has_company_logo', 'has_questions',
       'employment_type', 'required_experience', 'required_education',
       'industry', 'function', 'fraudulent', 'has_salary',
       'salary_fraud_risk'],
      dtype='object')

In [237]:
df = df.copy()

# Fill categorical columns with 'Unknown' or 'Not specified' for maintaining consistency for one-hot encoding
# and avoiding dropping rows for minor missing categorical data
df['employment_type'] = df['employment_type'].fillna('Unknown')
df['required_experience'] = df['required_experience'].fillna('Not specified')
df['required_education'] = df['required_education'].fillna('Not specified')
df['industry'] = df['industry'].fillna('Unknown')
df['function'] = df['function'].fillna('Unknown')
df['location'] = df['location'].fillna('Unknown')

# Fill text-based columns with empty string
df['company_profile'] = df['company_profile'].fillna('')
df['requirements'] = df['requirements'].fillna('')
df['benefits'] = df['benefits'].fillna('')


In [238]:
for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'location']:
    print(f"{col}: {df[col].nunique()} unique values")


employment_type: 6 unique values
required_experience: 8 unique values
required_education: 14 unique values
industry: 132 unique values
function: 38 unique values
location: 3106 unique values


In [239]:
# One hot encoding
# One-hot encode specified categorical columns as less unique values
df = pd.get_dummies(df, columns=['employment_type', 'required_experience', 'required_education'], drop_first=True)
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

# Label Encoding for features having many unique values
le = LabelEncoder()

for col in ['industry', 'function', 'location']:
    df[col] = le.fit_transform(df[col])

In [240]:
df

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,industry,...,required_education_High School or equivalent,required_education_Master's Degree,required_education_Not specified,required_education_Professional,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma
0,Marketing Intern,2535,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,123,...,0,0,1,0,0,0,0,0,0,0
1,Customer Service - Cloud Video Production,1073,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,75,...,0,0,1,0,0,0,0,0,0,0
2,Commissioning Machinery Assistant (CMA),1867,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,123,...,0,0,1,0,0,0,0,0,0,0
3,Account Executive - Washington DC,1703,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,22,...,0,0,0,0,0,0,0,0,0,0
4,Bill Review Manager,1741,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,51,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,166,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,22,...,0,0,1,0,0,0,0,0,0,0
17876,Payroll Accountant,2734,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,61,...,0,0,0,0,0,0,0,0,0,0
17877,Project Cost Control Staff Engineer - Cost Con...,2875,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,123,...,0,0,1,0,0,0,0,0,0,0
17878,Graphic Designer,1045,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,48,...,0,0,0,1,0,0,0,0,0,0


In [241]:
df.dtypes

title                                                    object
location                                                  int32
company_profile                                          object
description                                              object
requirements                                             object
benefits                                                 object
telecommuting                                             int64
has_company_logo                                          int64
has_questions                                             int64
industry                                                  int32
function                                                  int32
fraudulent                                                int64
has_salary                                                int32
salary_fraud_risk                                       float64
employment_type_Full-time                                 int32
employment_type_Other                   

In [242]:
#Text Processing

df['description'] = df['description'].str.lower()
df['requirements'] = df['requirements'].str.lower()
df['benefits'] = df['benefits'].str.lower()
df['company_profile'] = df['company_profile'].str.lower()

def clean_text(text):
    return re.sub(r'[^a-z\s]', '', text)

df['description'] = df['description'].apply(clean_text)
df['requirements'] = df['requirements'].apply(clean_text)
df['benefits'] = df['benefits'].apply(clean_text)
df['company_profile'] = df['company_profile'].apply(clean_text)


In [243]:
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

df['description'] = df['description'].apply(remove_stopwords)
df['requirements'] = df['requirements'].apply(remove_stopwords)
df['benefits'] = df['benefits'].apply(remove_stopwords)
df['company_profile'] = df['company_profile'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [244]:
df

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,industry,...,required_education_High School or equivalent,required_education_Master's Degree,required_education_Not specified,required_education_Professional,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma
0,Marketing Intern,2535,food weve created groundbreaking awardwinning ...,food fastgrowing james beard awardwinning onli...,experience content management systems major pl...,,0,1,0,123,...,0,0,1,0,0,0,0,0,0,0
1,Customer Service - Cloud Video Production,1073,seconds worlds cloud video production service ...,organised focused vibrant awesomedo passion cu...,expect youyour key responsibility communicate ...,get usthrough part seconds team gainexperience...,0,1,0,75,...,0,0,1,0,0,0,0,0,0,0
2,Commissioning Machinery Assistant (CMA),1867,valor services provides workforce solutions me...,client located houston actively seeking experi...,implement precommissioning commissioning proce...,,0,1,0,123,...,0,0,1,0,0,0,0,0,0,0
3,Account Executive - Washington DC,1703,passion improving quality life geography heart...,company esri environmental systems research in...,education bachelors masters gis business admin...,culture anything corporatewe collaborative cre...,0,1,0,22,...,0,0,0,0,0,0,0,0,0,0
4,Bill Review Manager,1741,spotsource solutions llc global human capital ...,job title itemization review managerlocation f...,qualificationsrn license state texasdiploma ba...,full benefits offered,0,1,1,51,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,166,vend looking awesome new talent come join us y...,case first time youve visited website vend awa...,ace role youwill eat comprehensive statements ...,expect uswe open culture openly share results ...,0,1,1,22,...,0,0,1,0,0,0,0,0,0,0
17876,Payroll Accountant,2734,weblinc ecommerce platform services provider f...,payroll accountant focus primarily payroll fun...,ba bs accounting desire fun love genuine passi...,health amp wellnessmedical planprescription dr...,0,1,1,61,...,0,0,0,0,0,0,0,0,0,0
17877,Project Cost Control Staff Engineer - Cost Con...,2875,provide full time permanent positions many med...,experienced project cost control staff enginee...,least years professional experienceability wor...,,0,0,0,123,...,0,0,1,0,0,0,0,0,0,0
17878,Graphic Designer,1045,,nemsia studios looking experienced visualgraph...,must fluent latest versions corel amp adobe cc...,competitive salary compensation based experien...,0,0,1,48,...,0,0,0,1,0,0,0,0,0,0


In [245]:
df['text'] = df['description'] + ' ' + df['requirements'] + ' ' + df['benefits'] + ' ' + df['company_profile']
df

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,industry,...,required_education_Master's Degree,required_education_Not specified,required_education_Professional,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma,text
0,Marketing Intern,2535,food weve created groundbreaking awardwinning ...,food fastgrowing james beard awardwinning onli...,experience content management systems major pl...,,0,1,0,123,...,0,1,0,0,0,0,0,0,0,food fastgrowing james beard awardwinning onli...
1,Customer Service - Cloud Video Production,1073,seconds worlds cloud video production service ...,organised focused vibrant awesomedo passion cu...,expect youyour key responsibility communicate ...,get usthrough part seconds team gainexperience...,0,1,0,75,...,0,1,0,0,0,0,0,0,0,organised focused vibrant awesomedo passion cu...
2,Commissioning Machinery Assistant (CMA),1867,valor services provides workforce solutions me...,client located houston actively seeking experi...,implement precommissioning commissioning proce...,,0,1,0,123,...,0,1,0,0,0,0,0,0,0,client located houston actively seeking experi...
3,Account Executive - Washington DC,1703,passion improving quality life geography heart...,company esri environmental systems research in...,education bachelors masters gis business admin...,culture anything corporatewe collaborative cre...,0,1,0,22,...,0,0,0,0,0,0,0,0,0,company esri environmental systems research in...
4,Bill Review Manager,1741,spotsource solutions llc global human capital ...,job title itemization review managerlocation f...,qualificationsrn license state texasdiploma ba...,full benefits offered,0,1,1,51,...,0,0,0,0,0,0,0,0,0,job title itemization review managerlocation f...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,166,vend looking awesome new talent come join us y...,case first time youve visited website vend awa...,ace role youwill eat comprehensive statements ...,expect uswe open culture openly share results ...,0,1,1,22,...,0,1,0,0,0,0,0,0,0,case first time youve visited website vend awa...
17876,Payroll Accountant,2734,weblinc ecommerce platform services provider f...,payroll accountant focus primarily payroll fun...,ba bs accounting desire fun love genuine passi...,health amp wellnessmedical planprescription dr...,0,1,1,61,...,0,0,0,0,0,0,0,0,0,payroll accountant focus primarily payroll fun...
17877,Project Cost Control Staff Engineer - Cost Con...,2875,provide full time permanent positions many med...,experienced project cost control staff enginee...,least years professional experienceability wor...,,0,0,0,123,...,0,1,0,0,0,0,0,0,0,experienced project cost control staff enginee...
17878,Graphic Designer,1045,,nemsia studios looking experienced visualgraph...,must fluent latest versions corel amp adobe cc...,competitive salary compensation based experien...,0,0,1,48,...,0,0,1,0,0,0,0,0,0,nemsia studios looking experienced visualgraph...


In [246]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# TF-IDF for text features
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_text = tfidf.fit_transform(df['text'])

# Prepare numeric fearures
X_other = df.drop(columns=['description', 'requirements', 'benefits', 'company_profile', 'text', 'fraudulent'])
y = df['fraudulent']

# Convert other features to numeric (example - you might need more sophisticated preprocessing)
X_other = X_other.apply(lambda x: pd.to_numeric(x, errors='coerce')).fillna(0)

# Convert to sparse matrix and combine
X_other_sparse = csr_matrix(X_other.values)
X_final = hstack([X_text, X_other_sparse])

In [247]:
import pandas as pd

# Get feature names from TF-IDF and other columns
feature_names = tfidf.get_feature_names_out().tolist() + X_other.columns.tolist()

# Convert to DataFrame
X_final_df = pd.DataFrame.sparse.from_spmatrix(X_final, columns=feature_names)
X_final_df.head()

Unnamed: 0,aan,ab,abc,abilities,ability,able,abreast,abroad,absolute,absolutely,...,required_education_High School or equivalent,required_education_Master's Degree,required_education_Not specified,required_education_Professional,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma
0,0,0,0,0,0.0,0.0,0,0,0,0,...,0,0,1.0,0,0,0,0,0,0,0
1,0,0,0,0,0.0,0.013399,0,0,0,0,...,0,0,1.0,0,0,0,0,0,0,0
2,0,0,0,0,0.0,0.0,0,0,0,0,...,0,0,1.0,0,0,0,0,0,0,0
3,0,0,0,0,0.018513,0.0,0,0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
4,0,0,0,0,0.0,0.028434,0,0,0,0,...,0,0,0.0,0,0,0,0,0,0,0


In [249]:
#Saving preprocessed data
from scipy.sparse import save_npz
import joblib
import pandas as pd

save_npz('X_final.npz', X_final)
pd.DataFrame(y).to_csv('y_target.csv', index=False)
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
pd.Series(X_other.columns).to_csv('numeric_feature_names.csv', index=False)

print("Files saved directly in:", os.getcwd())
print(os.listdir())  

Files saved directly in: C:\Users\adana
['.anaconda', '.android', '.conda', '.condarc', '.continuum', '.dacfx', '.emulator_console_auth_token', '.gitconfig', '.gradle', '.idlerc', '.ipynb_checkpoints', '.ipython', '.jupyter', '.lesshst', '.matplotlib', '.packettracer', '.skiko', '.streamlit', '.TurboVPN', '.VirtualBox', '.vscode', 'Adan-Akbar-Data-Engineering-BWF-', 'adb', 'Airline Dataset.csv', 'airline_dataset.csv', 'anaconda3', 'AndroidStudioProjects', 'AppData', 'Application Data', 'BackwardElimination.ipynb', 'breast-cancer.csv', 'Cars.xlsx', 'Cisco Packet Tracer 8.2.2', 'Clustering.ipynb', 'collaborative-project', 'Contacts', 'Cookies', 'Data Preprocessing.ipynb', 'Documents', 'Downloads', 'ElectricCarData_Clean.csv', 'Exploring Regression Model.ipynb', 'fake_job_postings.csv', 'Favorites', 'ForwardBackward.ipynb', 'ForwardSelection.ipynb', 'IDS_Course_Material', 'IMDB Dataset.csv', 'Introduction To Data Science', 'iris.csv', 'Links', 'Local Settings', 'LogisticRegression.ipynb',