In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

#Import dataset
dataset = pd.read_csv("fake_job_postings.csv")

#Chose class to predict
Class = "fraudulent"

#Fill empty rows with unspecified
df = dataset['requirements'].fillna('Unspecified')

#Combine datasets
dataset = pd.merge(dataset,df)

#Remove any row that has an empty column
dataset.dropna(axis='index', inplace=True)

#Remove select columns
dataset = dataset.drop(columns=['salary_range','company_profile','description','benefits','department','job_id'])

#Encode all strings into numbers
dataset = pd.DataFrame(preprocessing.OrdinalEncoder().fit_transform(dataset), columns=dataset.columns)

#Specify x and y data
OldX = dataset.drop(Class, axis = 1)
Oldy = dataset[Class]

#Oversample to get similar amounts in x and y
print('Original dataset shape %s' % Counter(Oldy))
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(OldX, Oldy)
print('Resampled dataset shape %s' % Counter(y))

print(dataset.head(5))

#Old Oversampling technique
#print(Oldy.value_counts)
#ros = RandomOverSampler()
#X,y = ros.fit_resample(OldX,Oldy)
#print(y.value_counts)

# normalize data
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

set_of_classes = y.value_counts().index.tolist()
set_of_classes= pd.DataFrame({Class: set_of_classes})


print("Pre-processed data :")
print(X)

print("Pre-processed class :")
print(y)

Original dataset shape Counter({0.0: 1671, 1.0: 236})
Resampled dataset shape Counter({0.0: 1671, 1.0: 1671})
   title  location  requirements  telecommuting  has_company_logo  \
0  255.0      10.0         613.0            0.0               1.0   
1  596.0      91.0         316.0            0.0               1.0   
2  599.0     110.0         304.0            0.0               1.0   
3  269.0     100.0         414.0            0.0               1.0   
4  346.0      43.0         183.0            1.0               1.0   

   has_questions  employment_type  required_experience  required_education  \
0            1.0              1.0                  5.0                 4.0   
1            1.0              1.0                  3.0                 1.0   
2            0.0              1.0                  3.0                 7.0   
3            1.0              1.0                  5.0                 3.0   
4            0.0              1.0                  2.0                 1.0   

   ind

In [2]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import classification_report

cat = ['title', 'location', 'requirements', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry']

# splitting data into ratio 80:20
data_train, data_test, class_train, class_test = train_test_split(X, y, test_size=0.2)

print(data_train)

# Build model for catagorical data 
clf = CategoricalNB(force_alpha=True)
clf.fit(data_train, class_train)



         title  location  requirements  telecommuting  has_company_logo  \
1347  0.444625  0.949074      0.381250            0.0               1.0   
650   1.000000  0.745370      0.551563            1.0               1.0   
1700  0.744300  0.189815      0.781250            0.0               1.0   
2115  0.121868  0.874820      0.423830            0.0               1.0   
785   0.224756  0.472222      0.450000            1.0               1.0   
...        ...       ...           ...            ...               ...   
3147  0.955110  0.492765      0.582961            0.0               1.0   
1940  0.127036  0.853863      0.037500            0.0               1.0   
3308  0.952773  0.495357      0.629273            0.0               1.0   
367   0.208469  0.967593      0.004688            0.0               0.0   
354   0.208469  0.967593      0.004688            0.0               0.0   

      has_questions  employment_type  required_experience  required_education  \
1347            1.

In [3]:
# build model based on linear
from sklearn.svm import SVC
svc_li = SVC(kernel='linear')

svc_li.fit(data_train, np.asarray(class_train))

In [4]:
# build model based on rbf 
svc_rbf = SVC(kernel='rbf')

svc_rbf.fit(data_train, np.asarray(class_train))

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
# predict linear model
pred = svc_li.predict(data_test)

#Ways to report how good the model was
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

print(pred[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred,zero_division=1))

Accuracy :  0.8579970104633782
Mean Square Error :  0.14200298953662183
[0. 0. 1. 0. 0.]
Confusion Matrix for each label : 
[[[288  62]
  [ 33 286]]

 [[286  33]
  [ 62 288]]]
Classification Report : 
              precision    recall  f1-score   support

         0.0       0.82      0.90      0.86       319
         1.0       0.90      0.82      0.86       350

    accuracy                           0.86       669
   macro avg       0.86      0.86      0.86       669
weighted avg       0.86      0.86      0.86       669



In [9]:
# predict rbf model
pred = svc_rbf.predict(data_test)

#Ways to report how good the model was
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

print(pred[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred,zero_division=1))

Accuracy :  0.9671150971599403
Mean Square Error :  0.03288490284005979
[0. 1. 1. 0. 1.]
Confusion Matrix for each label : 
[[[342   8]
  [ 14 305]]

 [[305  14]
  [  8 342]]]
Classification Report : 
              precision    recall  f1-score   support

         0.0       0.97      0.96      0.97       319
         1.0       0.96      0.98      0.97       350

    accuracy                           0.97       669
   macro avg       0.97      0.97      0.97       669
weighted avg       0.97      0.97      0.97       669

