In [None]:
#importing the libraries and Models

import pandas as pd #numeric calculations
import numpy as np #numeric calculations
import matplotlib.pyplot as plt #visualization
import seaborn as sns #visualization
import warnings
warnings.filterwarnings("ignore")
import torch #deep learning
import re #regular expressions

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoTokenizer , DistilBertModel
#DistilBertTokenizer converting raw text into a format the model can understand.  #DistilBertForSequenceClassification is a pre-trained DistilBERT model specifically designed for text classification tasks.
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score , roc_auc_score
from sklearn.metrics import classification_report



In [None]:
#loading dataset
# The workbook location can be overridden through the EMSCAD_XLSX_PATH
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original local path is used unchanged.
import os
from pathlib import Path

DATA_PATH = Path(os.environ.get(
    "EMSCAD_XLSX_PATH",
    r"C:\Users\aarat\Desktop\Bvoc IT\sem6\proj_s6\main\emscad_cleaned_excel.xlsx",
))
df = pd.read_excel(DATA_PATH)

DATA CLEANING & PREPROCESSING

In [None]:
#analysing the dataset

df.info() #summary of the columns: name, non-null count and datatype of each one

In [None]:
df.head() #preview the first rows of the dataset


In [None]:
df.columns #the column names of the dataset

In [None]:
# A bare last expression is rendered by Jupyter as a rich (HTML) table, which is
# easier to read than the plain-text dump produced by print().
df

In [None]:
df["fraudulent"].value_counts() #class balance of the target column: 0 = real, 1 = fake

In [None]:
df.isnull().sum() #number of null values in each column of the dataset

In [None]:
df.isnull().sum().sum() #total number of null cells across the whole dataset

In [None]:
# print() is required here because a cell only auto-displays its last
# expression, and both the shape and the column index should be shown.
for overview in (df.shape, df.columns):
    print(overview)

In [None]:
#filling the null values
# Missing entries in these free-text columns are replaced by the placeholder
# string "unknown", one column at a time.
text_cols = ["description", "benefits", "city", "requirements"]
for column in text_cols:
    df[column] = df[column].fillna("unknown")

In [None]:
#joining the text columns into one field that the ml models can work with

# "title" and "company_profile" are not covered by the null-filling step, and
# ' '.join raises TypeError as soon as it meets a NaN (a float). Filling with the
# same "unknown" placeholder and casting to str makes the join safe for every row.
text_join = ["title","description","company_profile","requirements","benefits"]
df['text'] = df[text_join].fillna("unknown").astype(str).agg(' '.join, axis=1)
df['text']

In [None]:
#count plot graph for showing Real & fake values
plt.figure(figsize=(6,4))
sns.countplot(x = df["fraudulent"],palette=["green","red"])
plt.title("Real vs fake")
plt.xlabel("fraudulent")
# Name the two bars; assumes both classes are present so they sit at positions 0 and 1.
plt.xticks([0, 1], ["Real (0)", "Fake (1)"])
# The bar height is the number of postings per class, not text.
plt.ylabel("Number of job postings")
plt.show()

EXPLORATORY DATA ANALYSIS

In [None]:
#Exploratory Analysis

# Pairwise correlation between the integer/float columns of the dataset.
# Note that `numeric_df` holds the correlation matrix, not the raw columns.
numeric_cols = df.select_dtypes(include=('Int64','Float64'))
numeric_df = numeric_cols.corr()
print(numeric_df)

# Heat map of that correlation matrix with the value written in every cell.
fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(numeric_df, cmap="viridis", annot=True, fmt=".2g", ax=ax)
plt.show()


In [None]:
#iqr method for outlier detection
def count_words(value):
    """Return the number of whitespace-separated words in str(value)."""
    return len(str(value).split())

# The word count of each description is the quantity checked for outliers.
df["text_length"] = df["description"].map(count_words)

q1, q3 = (df["text_length"].quantile(level) for level in (0.25, 0.75))
iqr = q3-q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Rows strictly below the lower fence or strictly above the upper fence.
within_fences = df["text_length"].between(lower_bound, upper_bound)
outliers = df[~within_fences]
print("Number of outliers detected:", len(outliers))

# Histogram of the word counts with both IQR fences drawn as dashed lines.
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(df["text_length"], bins=50, color='yellowgreen', edgecolor='black')
ax.axvline(lower_bound, linestyle='--', label='Lower IQR Bound', color='violet')
ax.axvline(upper_bound, linestyle='--', label='Upper IQR Bound', color='orange')
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
ax.set_title("IQR-based Outlier Detection using Histogram")
ax.legend()
plt.show()

In [None]:
# Keep only the US postings whose state is not the literal '""' placeholder.
is_us = df["country"] == "US"
has_state = df["state"] != '""'
us_df = df[is_us & has_state]
print(us_df["state"].unique())

In [None]:
#bar chart of the number of job postings per U.S. state

state_count = us_df["state"].value_counts()
# Discard the '""' placeholder entry if it is present in the counts.
state_count = state_count.drop(index='""', errors="ignore")

fig, ax = plt.subplots(figsize=(18,6))
sns.barplot(x=state_count.index, y=state_count.values, palette="plasma", ax=ax)
ax.set_title("Top U.S. States by Number of Job Postings")
ax.set_xlabel("U.S. State")
ax.set_ylabel("Number of Jobs")
plt.show()

In [None]:
#Which countries have the most fake job postings
fake_df = df[df["fraudulent"]==1]
fake_df=fake_df[fake_df["country"] != '""'] #drop the '""' placeholder country
country_fake = fake_df["country"].value_counts()
print(country_fake)
#converting to df
country_fake_df = country_fake.to_frame(name="Fake jobs")
top10_countries = country_fake_df.head(10) #value_counts is sorted, so these are the 10 largest

#using bar chart for plotting it

plt.figure(figsize=(6,4))
sns.barplot(x=top10_countries.index , y =top10_countries["Fake jobs"] ,palette="magma")

#this loop writes the value of each bar just above it.
for i, value in enumerate(top10_countries["Fake jobs"]): #i-position of bar , value - no.of fake jobs
    plt.text(i, value, str(value), ha="center", va="bottom")

plt.title("Most fake postings in country ")
# The x-axis holds the countries; the number of fake jobs is on the y-axis.
plt.xlabel("Country")
plt.ylabel("No.of Fake jobs")
plt.show()
#TODO: percentage case

In [None]:
#word cloud visualization
from wordcloud import WordCloud

# Every fraudulent description, concatenated into one space-separated string.
is_fake = df["fraudulent"]==1
fake_jobs = " ".join(df.loc[is_fake, 'description'].astype(str))

fig, ax = plt.subplots(figsize=(10,6))
wc = WordCloud(width = 800 , height = 400).generate(fake_jobs)
ax.axis("off")  # the cloud is an image, so ticks and frame are hidden
ax.imshow(wc)
ax.set_title("Word Cloud for fake job descriptions")
plt.show()

In [None]:
#plots to see the distribution of the categorical job attributes individually

# (column, bar colour, plot title, x-axis label) for each panel, left to right.
categorical_plots = [
    ("employment_type", "purple", "Employment Type Graph", "Employment Type"),
    ("required_experience", "blue", "Required Experience Graph", "Required Experience"),
    ("required_education", "green", "Required Education Graph", "Education"),
]

plt.figure(figsize=(25,18))

for position, (column, colour, title, xlabel) in enumerate(categorical_plots, start=1):
    # Filter into a local Series only. `df` itself is deliberately left untouched:
    # reassigning it here would silently drop rows from the data every later cell
    # uses, and would make each panel depend on the filters of the panels before it.
    known_values = df.loc[df[column] != '""', column]  # skip the '""' placeholder
    plt.subplot(3,3,position)
    sns.histplot(known_values, color=colour)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.xticks(rotation=90)

plt.show()

In [None]:
# Summarise the state column (each distinct value with its frequency, missing
# values included) instead of printing one output line per row of the dataset.
df["state"].value_counts(dropna=False)

In [None]:
# Summarise the country column (each distinct value with its frequency, missing
# values included) instead of printing one output line per row of the dataset.
df["country"].value_counts(dropna=False)

In [None]:
# The individual text columns were already merged into df["text"], so they are
# removed here. Reassigning instead of inplace=True, together with
# errors="ignore", lets this cell be re-run without raising KeyError once the
# columns are gone.
df = df.drop(columns = ["title","description","company_profile","requirements","benefits"], errors="ignore")

FEATURE ENGINEERING AND SELECTION

In [None]:
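#bert - DistilBERT embedding experiment, currently disabled (commented out); the models below are trained on TF-IDF features instead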
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# model = DistilBertModel.from_pretrained("distilbert-base-uncased")
# warnings.filterwarnings("ignore")
# model.eval()



In [None]:
#batch_size = 32
#all_embeddings = []

#model.eval()

#for i in range(0, len(df), batch_size):
#    batch_text = df["text"].iloc[i:i+batch_size].tolist()
#
#   tokenization = tokenizer(
#        batch_text,
#        padding=True,
#        truncation=True,
#        max_length=128,
#        return_tensors="pt"
#    )

 #   with torch.no_grad():
  #     output = model(**tokenization)

  #  batch_embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()
  #  all_embeddings.append(batch_embeddings)

#X = np.vstack(all_embeddings)


In [None]:

# embeddings = output.last_hidden_state[:,0,:].numpy()
#X = embeddings
#y = df["fraudulent"]
#X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42) 



In [None]:
#building the TF-IDF features and balancing the training set using smote method

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# X_train_tfidf / X_test_tfidf / y_train / y_test are used from this cell onwards
# but are not created anywhere above (the only split lives in the commented-out
# DistilBERT cell), so a fresh "Restart & Run All" would stop here with NameError.
# The split uses the same test_size / random_state as that commented-out split;
# stratify keeps the real/fake ratio equal in both parts of this imbalanced data.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["text"],
    df["fraudulent"],
    test_size=0.2,
    random_state=42,
    stratify=df["fraudulent"],
)

# The vectorizer is fitted on the training text only, so no vocabulary or IDF
# information leaks from the test set.
# NOTE(review): the original vectorizer settings are not in the notebook; these
# values are an assumption chosen to keep the feature matrix small - adjust as needed.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# SMOTE is applied to the training split only; the test split is left as it is.
# Despite their names, x_imbal / y_imbal hold the *resampled* (balanced) data.
smote_data = SMOTE(random_state=42)
x_imbal , y_imbal = smote_data.fit_resample(X_train_tfidf,y_train)
print("\n before smote : ",y_train.value_counts())
print("\n after smote : ",y_imbal.value_counts())

MODEL SELECTION AND TRAINING

In [None]:
#FITTING MODELS and training dataset

#LOGISTIC REGRESSION
#For binary classification: 0 = Real job, 1 = Fake job.
#Studies the association between a categorical dependent variable and a set of independent variables.
#Fitted on the SMOTE-resampled training data (x_imbal, y_imbal) so that the
#balancing step actually influences the model.
logreg = LogisticRegression(max_iter=1000 , random_state=42)
logreg.fit(x_imbal , y_imbal)


In [None]:
#NAIVE BAYES
#Used to classify job posts as Fake or Real based on text features (TF-IDF).
#also for binary classification (0/1)
#Fitted on the SMOTE-resampled training data (x_imbal, y_imbal) so that the
#balancing step actually influences the model.
mnb = MultinomialNB()
mnb.fit(x_imbal,y_imbal)


In [None]:

#SUPPORT VECTOR MACHINE
#Used to separate fake and real jobs with maximum margin
#Best with TF-IDF text features
#Fitted on the SMOTE-resampled training data (x_imbal, y_imbal) so that the
#balancing step actually influences the model.
sup_vec = LinearSVC(max_iter=5000, random_state=42, dual=False)
sup_vec.fit(x_imbal,y_imbal)

In [None]:
#RANDOM FOREST CLASSIFIER
#Random Forest is used to classify job postings as Fake or Real by learning patterns from multiple features.
#It takes a prediction from each decision tree and decides by majority vote.
#Fitted on the SMOTE-resampled training data (x_imbal, y_imbal) so that the
#balancing step actually influences the model.
rfc = RandomForestClassifier(n_estimators=200, random_state=42 , n_jobs=-1)
rfc.fit(x_imbal,y_imbal)

MODEL EVALUATION AND PREDICTION

In [None]:
#prediction and evaluation

models = { "logistic regression " : logreg , "Naive Bayes" : mnb, "Support Vector Machine" : sup_vec , "RandomForestClassifier": rfc }

# Report heading paired with the fitted estimator it belongs to, in print order.
report_plan = [
    ("Logistic Regression Evaluation", logreg),
    ("Naive Bayes Evaluation", mnb),
    ("Support Vector Machine", sup_vec),
    ("RandomForestClassifier", rfc),
]

# Every model predicts on X_test_tfidf before any report is printed.
test_predictions = [estimator.predict(X_test_tfidf) for heading, estimator in report_plan]
y_pred_logreg, y_pred_mnb, y_pred_supvec, y_pred_rfc = test_predictions

# One classification report per model, compared against y_test.
for (heading, estimator), y_hat in zip(report_plan, test_predictions):
    print(heading)
    print(classification_report(y_test, y_hat))