<a href="https://colab.research.google.com/github/akkisrihari/fake-job-predictions/blob/main/Fraudulentjobdetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [97]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# ─── Step 1: Load Data ─────────────────────────────────────────────────────────
train_url = "https://coding-platform.s3.amazonaws.com/dev/lms/tickets/4c8465f0-fce0-484f-8497-d25feaa8e995/NqndMEyZakuimmFI.csv"
test_url  = "https://coding-platform.s3.amazonaws.com/dev/lms/tickets/cab5b1bf-9132-4399-8ed5-2c049fcc89f8/0tkf3jUGLYjCEJGz.csv"

# ✅ Download the training and test datasets to the Colab environment
!wget -O NqndMEyZakuimmFI.csv "https://coding-platform.s3.amazonaws.com/dev/lms/tickets/4c8465f0-fce0-484f-8497-d25feaa8e995/NqndMEyZakuimmFI.csv"
!wget -O 0tkf3jUGLYjCEJGz.csv "https://coding-platform.s3.amazonaws.com/dev/lms/tickets/cab5b1bf-9132-4399-8ed5-2c049fcc89f8/0tkf3jUGLYjCEJGz.csv"

train_df = pd.read_csv(train_url)
test_df  = pd.read_csv(test_url)

print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# ─── Step 2: Preprocessing Function ───────────────────────────────────────────
def preprocess(df):
    """Return a copy of ``df`` with a combined, whitespace-normalised ``text`` column.

    The title, company_profile, description, requirements and benefits columns are
    joined with single spaces. Missing values become empty strings, and non-string
    values (for example a numeric title) are converted with ``str`` so the join
    cannot raise ``TypeError``.

    Args:
        df: DataFrame containing the five text columns listed above.

    Returns:
        A new DataFrame with the extra ``text`` column; ``df`` itself is not modified.

    Raises:
        KeyError: if any of the five text columns is absent.
    """
    df = df.copy()
    # Combine key text columns and fill NaNs
    cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
    # fillna must come before astype: astype(str) alone would turn NaN into 'nan'.
    parts = df[cols].fillna('').astype(str)
    df['text'] = parts.agg(' '.join, axis=1)
    # Clean whitespace: trim the ends and collapse every whitespace run to one space
    df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x.strip()))
    return df

# Build the combined-text frames for both splits.
train_p, test_p = preprocess(train_df), preprocess(test_df)

# ─── Step 3: Vectorization & Model Training ─────────────────────────────────
# TF-IDF vocabulary is learned from the training text only.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train_p['text'])
y_train = train_df['fraudulent']

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Optional: Check training F1-score (measured on the same rows the model was fit on)
train_preds = model.predict(X_train)
print("Training F1-score:", f1_score(y_train, train_preds))

# ─── Step 4: Predict on Test Set ─────────────────────────────────────────────
X_test = vectorizer.transform(test_p['text'])
# Second predict_proba column; presumably the probability of label 1 (fraudulent),
# assuming the labels are 0/1 -- TODO confirm against model.classes_.
positive_class_column = 1
test_df['fraud_probability'] = model.predict_proba(X_test)[:, positive_class_column]
test_df['predicted_fraudulent'] = model.predict(X_test)

# ─── Step 5: View Top-10 Suspicious Listings ────────────────────────────────
ranked = test_df.sort_values(by='fraud_probability', ascending=False)
top10 = ranked.head(10)
print(top10[['title', 'fraud_probability']])


--2025-06-15 14:40:07--  https://coding-platform.s3.amazonaws.com/dev/lms/tickets/4c8465f0-fce0-484f-8497-d25feaa8e995/NqndMEyZakuimmFI.csv
Resolving coding-platform.s3.amazonaws.com (coding-platform.s3.amazonaws.com)... 3.5.210.228, 52.219.158.99, 3.5.208.8, ...
Connecting to coding-platform.s3.amazonaws.com (coding-platform.s3.amazonaws.com)|3.5.210.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39855051 (38M) [text/csv]
Saving to: ‘NqndMEyZakuimmFI.csv’


2025-06-15 14:40:12 (8.61 MB/s) - ‘NqndMEyZakuimmFI.csv’ saved [39855051/39855051]

--2025-06-15 14:40:12--  https://coding-platform.s3.amazonaws.com/dev/lms/tickets/cab5b1bf-9132-4399-8ed5-2c049fcc89f8/0tkf3jUGLYjCEJGz.csv
Resolving coding-platform.s3.amazonaws.com (coding-platform.s3.amazonaws.com)... 52.219.66.20, 3.5.210.34, 16.12.40.15, ...
Connecting to coding-platform.s3.amazonaws.com (coding-platform.s3.amazonaws.com)|52.219.66.20|:443... connected.
HTTP request sent, awaiting response... 200 

In [98]:
# Source text of a placeholder script, assembled line by line.
# The leading and trailing empty entries reproduce the blank first line and the
# final newline of the script.
code = "\n".join([
    "",
    "# Sample Python code in app.py",
    "",
    "def greet():",
    '    print("Hello from app.py!")',
    "",
    'if __name__ == "__main__":',
    "    greet()",
    "",
])

# Save it to a file.
# Note: the later `streamlit run app.py` cell serves whatever app.py contains at that point.
with open('app.py', mode='w') as script_file:
    script_file.write(code)

print("✅ app.py has been created.")


✅ app.py has been created.


In [99]:
! pip install shap



In [100]:
! pip install streamlit



In [111]:
import streamlit as st
st.set_page_config(page_title="Fake Job Detector", layout="wide")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import shap
from wordcloud import WordCloud
import smtplib
from email.mime.text import MIMEText
import joblib

# Load original data
@st.cache_data
def load_data():
    """Read the training and test CSV files from the working directory.

    Missing values in every column are replaced with empty strings.

    Returns:
        tuple: ``(train_df, test_df)`` as pandas DataFrames.
    """
    csv_names = ('NqndMEyZakuimmFI.csv', '0tkf3jUGLYjCEJGz.csv')
    # Same read-then-fill treatment for both files, training file first.
    train_df, test_df = (pd.read_csv(name).fillna('') for name in csv_names)
    return train_df, test_df

train_df, test_df = load_data()

# Combine text
def combine_text(data):
    """Join the five free-text posting columns into one space-separated string per row.

    Missing values are treated as empty strings and non-string values are converted
    with ``str``, so the result never contains NaN and the concatenation cannot
    raise ``TypeError``, even when the caller has not called ``fillna('')`` first.

    Args:
        data: DataFrame with title, company_profile, description, requirements and
            benefits columns.

    Returns:
        pandas.Series of combined text, aligned to ``data.index``.

    Raises:
        KeyError: if any of the five columns is absent.
    """
    text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
    # fillna must come before astype: astype(str) alone would turn NaN into 'nan'.
    parts = data[text_columns].fillna('').astype(str)
    combined = parts[text_columns[0]]
    for column in text_columns[1:]:
        combined = combined + ' ' + parts[column]
    return combined

# Add the combined text column to both frames (training frame first).
for frame in (train_df, test_df):
    frame['text'] = combine_text(frame)

# Vectorize & Train function
def train_model(X_texts, y_labels, model_path='model.pkl', vectorizer_path='vectorizer.pkl'):
    """Fit a TF-IDF vectorizer and a logistic-regression classifier, then save both.

    Args:
        X_texts: iterable of raw text documents to learn the vocabulary from and train on.
        y_labels: labels aligned with ``X_texts``.
        model_path: file the fitted classifier is written to with ``joblib.dump``.
        vectorizer_path: file the fitted vectorizer is written to with ``joblib.dump``.

    Returns:
        tuple: ``(model, vectorizer)``, both already fitted.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_vec = vectorizer.fit_transform(X_texts)
    # Same classifier settings as the notebook's exploratory model. Balanced class
    # weights stop the less frequent label being drowned out (fraudulent postings
    # are presumably the minority class -- confirm on the data), and the higher
    # max_iter gives the solver more room to converge.
    model = LogisticRegression(class_weight='balanced', max_iter=1000)
    model.fit(X_vec, y_labels)
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)
    return model, vectorizer

# Load or train model
# joblib.load unpickles its input, so only load artefacts this app wrote itself.
try:
    model = joblib.load("model.pkl")
    vectorizer = joblib.load("vectorizer.pkl")
except FileNotFoundError:
    # Nothing saved yet: train from the bundled training data.
    model, vectorizer = train_model(train_df['text'], train_df['fraudulent'])
except Exception as load_error:
    # Unreadable or incompatible artefacts: still fall back to retraining, but say why.
    # Catching Exception rather than using a bare `except:` lets KeyboardInterrupt
    # and SystemExit propagate.
    st.warning(f"Could not load the saved model ({load_error}); retraining from the training data.")
    model, vectorizer = train_model(train_df['text'], train_df['fraudulent'])

# SHAP setup: the vectorized training text is passed as the explainer's background data
explainer = shap.Explainer(model, vectorizer.transform(train_df['text']), feature_names=vectorizer.get_feature_names_out())

# 🔔 Email alert function
def send_email_alert(subject, body, to_email):
    """Send a plain-text alert email through smtp.gmail.com over SSL (port 465).

    The sender address and password are read from the ``ALERT_EMAIL_ADDRESS`` and
    ``ALERT_EMAIL_PASSWORD`` environment variables so that no credential lives in
    the source. Failures are reported on stdout and never raised.

    Args:
        subject: value for the Subject header.
        body: plain-text message body.
        to_email: recipient address.

    Returns:
        bool: True when the message was handed to the SMTP server; False when the
        credentials are not configured or sending failed.
    """
    import os

    from_email = os.environ.get("ALERT_EMAIL_ADDRESS")
    from_password = os.environ.get("ALERT_EMAIL_PASSWORD")
    if not from_email or not from_password:
        # Without credentials a login attempt can only fail, so skip the network call.
        print("Email failed:", "ALERT_EMAIL_ADDRESS / ALERT_EMAIL_PASSWORD are not set")
        return False

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = from_email
    msg["To"] = to_email
    try:
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.login(from_email, from_password)
            server.sendmail(from_email, to_email, msg.as_string())
    except Exception as e:
        # Best-effort alerting: a mail problem must not take the app down.
        print("Email failed:", e)
        return False
    print("Email alert sent.")
    return True

# UI
st.title("💼 Fake Job Posting Detector")

uploaded_file = st.file_uploader("📂 Upload a CSV file for prediction or retraining", type=["csv"])

retrain = st.checkbox("🔁 Retrain model with uploaded data")

# Every upload must carry the five text columns that combine_text() concatenates.
REQUIRED_TEXT_COLUMNS = ['title', 'company_profile', 'description', 'requirements', 'benefits']

if uploaded_file is not None:
    input_df = pd.read_csv(uploaded_file)
    input_df.fillna('', inplace=True)
    missing_columns = [name for name in REQUIRED_TEXT_COLUMNS if name not in input_df.columns]

    if missing_columns:
        # Report the problem in the page instead of letting a KeyError abort the run.
        st.error(f"❌ Uploaded file is missing required column(s): {', '.join(missing_columns)}")

    elif input_df.empty:
        # With zero rows there is nothing to train on, predict, or explain.
        st.warning("⚠️ Uploaded file contains no rows.")

    elif retrain:
        input_df['text'] = combine_text(input_df)
        if 'fraudulent' in input_df.columns:
            st.success("✅ Retraining model...")
            model, vectorizer = train_model(input_df['text'], input_df['fraudulent'])
            explainer = shap.Explainer(model, vectorizer.transform(input_df['text']), feature_names=vectorizer.get_feature_names_out())
            st.success("🎉 Model retrained and saved successfully.")
        else:
            st.error("❌ 'fraudulent' column not found for training. Cannot retrain.")

    else:
        input_df['text'] = combine_text(input_df)
        X_input = vectorizer.transform(input_df['text'])
        predictions = model.predict(X_input)
        input_df['prediction'] = predictions

        st.write("### 🔎 Prediction Results (0 = Real, 1 = Fake):")
        st.dataframe(input_df[['title', 'company_profile', 'prediction']])

        st.subheader("🧠 SHAP Explanation for First Prediction")
        shap_values = explainer(X_input[:1])
        shap.plots.bar(shap_values[0], show=False)
        # Pass the figure object (not the pyplot module) to Streamlit, then close it so
        # figures do not pile up across script reruns.
        shap_fig = plt.gcf()
        st.pyplot(shap_fig)
        plt.close(shap_fig)

        csv = input_df[['title', 'company_profile', 'prediction']].to_csv(index=False)
        st.download_button("📥 Download Predictions", data=csv, file_name='predictions.csv', mime='text/csv')

        st.subheader("📊 Prediction Distribution")
        st.bar_chart(input_df['prediction'].value_counts())

        fake_jobs = input_df[input_df['prediction'] == 1]
        if len(fake_jobs) > 0:
            subject = "🚨 Alert: High-Risk Job Postings Detected"
            body = f"{len(fake_jobs)} suspicious job postings were detected.\n\nExamples:\n\n" + fake_jobs[['title', 'company_profile']].head(5).to_string()
            alert_sent = send_email_alert(subject, body, "your_email@gmail.com")
            # Only an explicit False counts as a failure; a helper that returns nothing
            # keeps the original message.
            if alert_sent is False:
                st.warning(f"🚨 {len(fake_jobs)} fake jobs detected! Email alert could not be sent.")
            else:
                st.warning(f"🚨 {len(fake_jobs)} fake jobs detected! Email alert sent.")

# 📊 Sidebar
# Checkboxes live in the sidebar; whatever they switch on is rendered in the main page.
sidebar = st.sidebar
sidebar.title("🔎 Dataset Explorer")

if sidebar.checkbox("Show Training Data"):
    st.write(train_df.head())

if sidebar.checkbox("Show Test Data"):
    st.write(test_df.head())

if sidebar.checkbox("Show Class Distribution"):
    class_counts = train_df['fraudulent'].value_counts()
    st.bar_chart(class_counts)

if sidebar.checkbox("Show Wordcloud (Fake Jobs)"):
    # Word cloud over the combined text of the rows labelled fraudulent (== 1).
    fake_posts = train_df.loc[train_df['fraudulent'] == 1, 'text']
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud.generate(' '.join(fake_posts)).to_array()
    st.image(cloud_image, caption="WordCloud of Fake Job Posts")


2025-06-15 15:01:55.482 No runtime found, using MemoryCacheStorageManager
2025-06-15 15:01:55.498 No runtime found, using MemoryCacheStorageManager


In [102]:
!pip install streamlit pyngrok --quiet


In [103]:
# The ngrok authtoken is a credential and must not be stored in the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable, falling back to a
# hidden interactive prompt.
# NOTE(review): the token that used to be hardcoded in this cell has been exposed
# and should be revoked in the ngrok dashboard.
import os
import subprocess
from getpass import getpass

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")

# Argument list with no shell, so the token is passed verbatim. check=False keeps the
# token out of any traceback; the command's own message is printed instead.
authtoken_result = subprocess.run(
    ["ngrok", "config", "add-authtoken", ngrok_authtoken],
    capture_output=True,
    text=True,
    check=False,
)
print(authtoken_result.stdout or authtoken_result.stderr)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [104]:
from pyngrok import ngrok
# NOTE(review): the next cell repeats this import and kill() call before opening its
# tunnel, so this cell looks redundant -- confirm before removing.
ngrok.kill()  # This kills any previously running tunnels



In [105]:
from pyngrok import ngrok
ngrok.kill()
public_url = ngrok.connect(8501)
print(f"Streamlit app running at: {public_url}")
!streamlit run app.py &> /dev/null &


Streamlit app running at: NgrokTunnel: "https://ea7f-34-73-179-53.ngrok-free.app" -> "http://localhost:8501"


In [106]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Load your training dataset and fill missing values with empty strings in one step
url = "https://coding-platform.s3.amazonaws.com/dev/lms/tickets/4c8465f0-fce0-484f-8497-d25feaa8e995/NqndMEyZakuimmFI.csv"
df = pd.read_csv(url).fillna('')

# Combine text fields
def combine_text(data):
    """Join the five free-text posting columns into one space-separated string per row.

    Missing values are treated as empty strings and non-string values are converted
    with ``str``, so the result never contains NaN and the concatenation cannot
    raise ``TypeError``, even when the caller has not called ``fillna('')`` first.

    Args:
        data: DataFrame with title, company_profile, description, requirements and
            benefits columns.

    Returns:
        pandas.Series of combined text, aligned to ``data.index``.

    Raises:
        KeyError: if any of the five columns is absent.
    """
    text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
    # fillna must come before astype: astype(str) alone would turn NaN into 'nan'.
    parts = data[text_columns].fillna('').astype(str)
    combined = parts[text_columns[0]]
    for column in text_columns[1:]:
        combined = combined + ' ' + parts[column]
    return combined

df['text'] = combine_text(df)

# Vectorize the text. English stop words are dropped, matching the other TF-IDF
# vectorizers in this notebook.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['text'])

# Labels
y = df['fraudulent']

# Train the model with the same settings as the notebook's exploratory classifier.
# Balanced class weights stop the less frequent label being drowned out, and the
# higher max_iter gives the solver more room to converge.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X, y)

# Save the model and vectorizer to use in FastAPI
joblib.dump(model, "model.joblib")
joblib.dump(vectorizer, "vectorizer.joblib")

print("✅ Model and vectorizer saved!")


✅ Model and vectorizer saved!


In [107]:
%%writefile main.py
from fastapi import FastAPI, UploadFile, File
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import io

app = FastAPI()

# Load model and vectorizer once at import time (model first, then vectorizer).
try:
    model, vectorizer = joblib.load("model.joblib"), joblib.load("vectorizer.joblib")
except FileNotFoundError:
    # Fail at startup with an actionable message instead of at the first request.
    missing_artifact_message = "Model or vectorizer not found. Please train and save them as 'model.joblib' and 'vectorizer.joblib'."
    raise RuntimeError(missing_artifact_message)

def combine_text(df):
    """Join the five free-text posting columns into one space-separated string per row.

    Missing values are treated as empty strings and non-string values are converted
    with ``str``, so the result never contains NaN and the concatenation cannot
    raise ``TypeError``, even when the caller has not called ``fillna('')`` first.

    Args:
        df: DataFrame with title, company_profile, description, requirements and
            benefits columns.

    Returns:
        pandas.Series of combined text, aligned to ``df.index``.

    Raises:
        KeyError: if any of the five columns is absent.
    """
    text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
    # fillna must come before astype: astype(str) alone would turn NaN into 'nan'.
    parts = df[text_columns].fillna('').astype(str)
    combined = parts[text_columns[0]]
    for column in text_columns[1:]:
        combined = combined + ' ' + parts[column]
    return combined

@app.post("/predict/")
async def predict(file: UploadFile = File(...)):
    """Classify every posting in an uploaded CSV file.

    Args:
        file: uploaded CSV with title, company_profile, description, requirements
            and benefits columns.

    Returns:
        list[dict]: one record per row with ``title``, ``company_profile`` and
        ``prediction``; an empty list when the file has a header but no rows.

    Raises:
        HTTPException: status 400 when the upload cannot be parsed as CSV or lacks
            a required column.
    """
    from fastapi import HTTPException

    content = await file.read()
    try:
        # Parse straight from bytes. utf-8-sig also accepts files that start with a
        # UTF-8 byte-order mark, which would otherwise corrupt the first column name.
        df = pd.read_csv(io.BytesIO(content), encoding="utf-8-sig")
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        raise HTTPException(status_code=400, detail=f"Could not parse uploaded CSV: {exc}")

    required_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        # A client error, not a server fault: answer 400 instead of an unhandled KeyError.
        raise HTTPException(
            status_code=400,
            detail=f"Missing required column(s): {', '.join(missing_columns)}",
        )

    if df.empty:
        # Nothing to classify; skip the vectorizer and model entirely.
        return []

    # Handle missing values
    df.fillna('', inplace=True)

    # Combine text fields
    df['text'] = combine_text(df)

    # Vectorize
    X_input = vectorizer.transform(df['text'])

    # Predict
    preds = model.predict(X_input)
    df['prediction'] = preds

    # Return results
    return df[['title', 'company_profile', 'prediction']].to_dict(orient='records')


Overwriting main.py
