In [21]:
import pandas as pd
import joblib
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report



In [22]:
nltk.download('stopwords')
from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
def clean_text(text):
    text = str(text).lower()
    tokens = [w for w in text.split() if w not in STOP_WORDS]
    return ' '.join(tokens)

In [24]:
import os
print("Current working directory:", os.getcwd())


Current working directory: c:\Users\adity\OneDrive\Desktop\Fraud_job\model


In [25]:
if __name__ == '__main__':
    df = pd.read_csv('../datasets/fake_job_postings.csv')
    df = df[['description', 'fraudulent']].dropna()
    df['description_clean'] = df['description'].apply(clean_text)

    X = df['description_clean']
    y = df['fraudulent']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    vect = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vect.fit_transform(X_train)
    X_test_tfidf = vect.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_tfidf, y_train)

    y_pred = model.predict(X_test_tfidf)
    print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3394
           1       1.00      0.35      0.52       182

    accuracy                           0.97      3576
   macro avg       0.98      0.68      0.75      3576
weighted avg       0.97      0.97      0.96      3576



In [26]:
import os
import joblib

# Ensure artifacts directory exists
os.makedirs('artifacts', exist_ok=True)

# Save the vectorizer and model
joblib.dump(vect, 'artifacts/tfidf_vectorizer.joblib')
joblib.dump(model, 'artifacts/logreg_model.joblib')


['artifacts/logreg_model.joblib']