In [None]:
import pandas as pd
import re, string
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Load dataset
csv_path = "fake_job_postings.csv"
df = pd.read_csv(csv_path)

print("columns:", df.columns.tolist())
print("shape:", df.shape)

# Drop missing descriptions
df = df.dropna(subset=['description']).reset_index(drop=True)
print("After dropping rows with empty description:", df.shape)

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    # Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()
    # Lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords + short words, lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in STOPWORDS and len(w) > 2]
    return " ".join(tokens)

# Apply cleaning
df['final_text'] = df['description'].apply(clean_text)
df[['description','final_text']].head(10)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


columns: ['job_id', 'title', 'location', 'department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'fraudulent']
shape: (17880, 18)
After dropping rows with empty description: (17879, 18)


Unnamed: 0,description,final_text
0,"Food52, a fast-growing, James Beard Award-winn...",food fastgrowing james beard awardwinning onli...
1,Organised - Focused - Vibrant - Awesome!Do you...,organised focused vibrant awesomedo passion cu...
2,"Our client, located in Houston, is actively se...",client located houston actively seeking experi...
3,THE COMPANY: ESRI – Environmental Systems Rese...,company esri environmental system research ins...
4,JOB TITLE: Itemization Review ManagerLOCATION:...,job title itemization review managerlocation f...
5,Job OverviewApex is an environmental consultin...,job overviewapex environmental consulting firm...
6,Your Responsibilities: Manage the English-spea...,responsibility manage englishspeaking editoria...
7,Who is Airenvy?Hey there! We are seasoned entr...,airenvyhey seasoned entrepreneur heart san fra...
8,Implementation/Configuration/Testing/Training ...,implementationconfigurationtestingtraining onh...
9,The Customer Service Associate will be based i...,customer service associate based phoenix right...


In [39]:
print(df['fraudulent'].value_counts())


fraudulent
0    17014
1      865
Name: count, dtype: int64


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Features and target
X = df['final_text']
y = df['fraudulent']   # target column in dataset

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # limit to top 5000 words
X_tfidf = vectorizer.fit_transform(X)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42)

# Logistic Regression Model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

print("Training data:\n",X_train,y_train)
print("Testing data:",X_test,y_test)
print("Predictions:",y_pred)
print("Actual:",y_test)

Training data:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1014738 stored elements and shape (14303, 5000)>
  Coords	Values
  (0, 1722)	0.10057665821297346
  (0, 2144)	0.12374418776305245
  (0, 1081)	0.06992296337700239
  (0, 4944)	0.037260168503728604
  (0, 4430)	0.06915491269948179
  (0, 2921)	0.044415308352503534
  (0, 4369)	0.053068544214653526
  (0, 1085)	0.1784884473336119
  (0, 4957)	0.06799058123088117
  (0, 2622)	0.12560274701411134
  (0, 3285)	0.12843270670236215
  (0, 1997)	0.06106117450319668
  (0, 824)	0.042764044044951766
  (0, 1582)	0.07481231981970486
  (0, 2212)	0.05578786778496482
  (0, 3190)	0.1306017910362878
  (0, 1640)	0.07100309607997304
  (0, 4309)	0.06307744734293966
  (0, 2839)	0.19947650083745425
  (0, 1602)	0.08625135547750037
  (0, 3156)	0.057089697601112155
  (0, 2241)	0.0627460483791099
  (0, 3033)	0.06873803634268227
  (0, 3189)	0.19890018281445201
  (0, 3685)	0.07280862094326682
  :	:
  (14302, 3297)	0.08157274355143979
  (14302, 2047