In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

#Import dataset
dataset = pd.read_csv("fake_job_postings.csv")

#Chose class to predict
Class = "fraudulent"

#Fill empty rows with unspecified
df = dataset['requirements'].fillna('Unspecified')

#Combine datasets
dataset = pd.merge(dataset,df)

#Remove any row that has an empty column
dataset.dropna(axis='index', inplace=True)

#Remove select columns
dataset = dataset.drop(columns=['salary_range','company_profile','description','benefits','department','job_id'])

#Encode all strings into numbers
dataset = pd.DataFrame(preprocessing.OrdinalEncoder().fit_transform(dataset), columns=dataset.columns)

#Specify x and y data
OldX = dataset.drop(Class, axis = 1)
Oldy = dataset[Class]

#Oversample to get similar amounts in x and y
print('Original dataset shape %s' % Counter(Oldy))
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(OldX, Oldy)
print('Resampled dataset shape %s' % Counter(y))

print(dataset.head(5))

#Old Oversampling technique
#print(Oldy.value_counts)
#ros = RandomOverSampler()
#X,y = ros.fit_resample(OldX,Oldy)
#print(y.value_counts)

# normalize data
# for balanced dataset
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

set_of_classes = y.value_counts().index.tolist()
set_of_classes= pd.DataFrame({Class: set_of_classes})

# for original dataset
scaler_og = MinMaxScaler(feature_range=(0, 1))
X_rescaled_og = scaler.fit_transform(OldX)
OldX = pd.DataFrame(data = X_rescaled_og, columns = OldX.columns)

set_of_classes = y.value_counts().index.tolist()
set_of_classes= pd.DataFrame({Class: set_of_classes})


print("Pre-processed data :")
print(X)

print("Pre-processed class :")
print(y)

Original dataset shape Counter({0.0: 1671, 1.0: 236})
Resampled dataset shape Counter({0.0: 1671, 1.0: 1671})
   title  location  requirements  telecommuting  has_company_logo  \
0  255.0      10.0         613.0            0.0               1.0   
1  596.0      91.0         316.0            0.0               1.0   
2  599.0     110.0         304.0            0.0               1.0   
3  269.0     100.0         414.0            0.0               1.0   
4  346.0      43.0         183.0            1.0               1.0   

   has_questions  employment_type  required_experience  required_education  \
0            1.0              1.0                  5.0                 4.0   
1            1.0              1.0                  3.0                 1.0   
2            0.0              1.0                  3.0                 7.0   
3            1.0              1.0                  5.0                 3.0   
4            0.0              1.0                  2.0                 1.0   

   ind

In [2]:
#splitting data into ratio 80:20

# Original imbalanced dataset
data_train_og, data_test_og, class_train_og, class_test_og = train_test_split(OldX, Oldy, test_size=0.2)


# Dataset with SMOTE (generated minority to create balanaced set)
data_train_s, data_test_s, class_train_s, class_test_s = train_test_split(X, y, test_size=0.2)

In [3]:
# build model based on linear
from sklearn.svm import SVC


# Build model with linear using original imbalanced dataset
svc_li_og = SVC(kernel='linear')
svc_li_og.fit(data_train_og, np.asarray(class_train_og))

# build model with linear using balanced dataset (from SMOTE)
svc_li_s = SVC(kernel='linear')
svc_li_s.fit(data_train_s, np.asarray(class_train_s))

In [4]:
# Build model with sigmoid using original imbalanced dataset
svc_sigmoid_og = SVC(kernel='sigmoid')
svc_sigmoid_og.fit(data_train_og, np.asarray(class_train_og))

# build model with sigmoid using balanced dataset (from SMOTE)
svc_sigmoid_s = SVC(kernel='sigmoid')
svc_sigmoid_s.fit(data_train_s, np.asarray(class_train_s))

In [5]:
# Build model with rbf using original imbalanced dataset
svc_rbf_og = SVC(kernel='rbf')
svc_rbf_og.fit(data_train_og, np.asarray(class_train_og))

# build model with rbf using balanced dataset (from SMOTE)
svc_rbf_s = SVC(kernel='rbf')
svc_rbf_s.fit(data_train_s, np.asarray(class_train_s))

In [6]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score


########### IMBALANCED DATASET (ORIGINAL) ###########
# predict linear model
pred_og = svc_li_og.predict(data_test_og)

#Ways to report how good the model was
print ("########## IMBALANCED DATASET (ORIGINAL) ##########")
print("Accuracy : ", accuracy_score(class_test_og, pred_og))
print("Mean Square Error : ", mean_squared_error(class_test_og, pred_og))

print(pred_og[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test_og, pred_og))

print("Classification Report (Linear kernel): ")
print(classification_report(class_test_og, pred_og,zero_division=1))


########### BALANCED DATASET (SMOTE) ###########
pred_s = svc_li_s.predict(data_test_s)
#Ways to report how good the model was
print ("########## BALANCED DATASET (SMOTE) ##########")
print("Accuracy : ", accuracy_score(class_test_s, pred_s))
print("Mean Square Error : ", mean_squared_error(class_test_s, pred_s))

print(pred_s[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test_s, pred_s))

print("Classification Report (Linear kernel): ")
print(classification_report(class_test_s, pred_s,zero_division=1))

########## IMBALANCED DATASET (ORIGINAL) ##########
Accuracy :  0.9267015706806283
Mean Square Error :  0.07329842931937172
[0. 0. 0. 0. 0.]
Confusion Matrix for each label : 
[[[ 26  21]
  [  7 328]]

 [[328   7]
  [ 21  26]]]
Classification Report (Linear kernel): 
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96       335
         1.0       0.79      0.55      0.65        47

    accuracy                           0.93       382
   macro avg       0.86      0.77      0.80       382
weighted avg       0.92      0.93      0.92       382

########## BALANCED DATASET (SMOTE) ##########
Accuracy :  0.8385650224215246
Mean Square Error :  0.16143497757847533
[0. 1. 1. 0. 0.]
Confusion Matrix for each label : 
[[[282  71]
  [ 37 279]]

 [[279  37]
  [ 71 282]]]
Classification Report (Linear kernel): 
              precision    recall  f1-score   support

         0.0       0.80      0.88      0.84       316
         1.0       0.88      0.80

In [7]:
# predict rbf model
########### IMBALANCED DATASET (ORIGINAL) ###########
# predict linear model
pred_og = svc_rbf_og.predict(data_test_og)

#Ways to report how good the model was
print ("########## IMBALANCED DATASET (ORIGINAL) ##########")
print("Accuracy : ", accuracy_score(class_test_og, pred_og))
print("Mean Square Error : ", mean_squared_error(class_test_og, pred_og))

print(pred_og[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test_og, pred_og))

print("Classification Report (RBF kernel): ")
print(classification_report(class_test_og, pred_og,zero_division=1))


########### BALANCED DATASET (SMOTE) ###########
pred_s = svc_rbf_s.predict(data_test_s)
#Ways to report how good the model was
print ("########## BALANCED DATASET (SMOTE) ##########")
print("Accuracy : ", accuracy_score(class_test_s, pred_s))
print("Mean Square Error : ", mean_squared_error(class_test_s, pred_s))

print(pred_s[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test_s, pred_s))

print("Classification Report (RBF kernel): ")
print(classification_report(class_test_s, pred_s,zero_division=1))

########## IMBALANCED DATASET (ORIGINAL) ##########
Accuracy :  0.9450261780104712
Mean Square Error :  0.0549738219895288
[0. 0. 0. 0. 0.]
Confusion Matrix for each label : 
[[[ 28  19]
  [  2 333]]

 [[333   2]
  [ 19  28]]]
Classification Report (RBF kernel): 
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97       335
         1.0       0.93      0.60      0.73        47

    accuracy                           0.95       382
   macro avg       0.94      0.79      0.85       382
weighted avg       0.94      0.95      0.94       382

########## BALANCED DATASET (SMOTE) ##########
Accuracy :  0.968609865470852
Mean Square Error :  0.03139013452914798
[0. 1. 1. 0. 0.]
Confusion Matrix for each label : 
[[[342  11]
  [ 10 306]]

 [[306  10]
  [ 11 342]]]
Classification Report (RBF kernel): 
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97       316
         1.0       0.97      0.97      0.

In [9]:
# predict rbf model
########### IMBALANCED DATASET (ORIGINAL) ###########
# predict linear model
pred_og = svc_sigmoid_og.predict(data_test_og)

#Ways to report how good the model was
print ("########## IMBALANCED DATASET (ORIGINAL) ##########")
print("Accuracy : ", accuracy_score(class_test_og, pred_og))
print("Mean Square Error : ", mean_squared_error(class_test_og, pred_og))

print(pred_og[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test_og, pred_og))

print("Classification Report (Sigmoid kernel): ")
print(classification_report(class_test_og, pred_og,zero_division=1))


########### BALANCED DATASET (SMOTE) ###########
pred_s = svc_sigmoid_s.predict(data_test_s)
#Ways to report how good the model was
print ("########## BALANCED DATASET (SMOTE) ##########")
print("Accuracy : ", accuracy_score(class_test_s, pred_s))
print("Mean Square Error : ", mean_squared_error(class_test_s, pred_s))

print(pred_s[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test_s, pred_s))

print("Classification Report (Sigmoid kernel): ")
print(classification_report(class_test_s, pred_s,zero_division=1))

########## IMBALANCED DATASET (ORIGINAL) ##########
Accuracy :  0.8769633507853403
Mean Square Error :  0.12303664921465969
[0. 0. 0. 0. 0.]
Confusion Matrix for each label : 
[[[ 15  32]
  [ 15 320]]

 [[320  15]
  [ 32  15]]]
Classification Report (Sigmoid kernel): 
              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93       335
         1.0       0.50      0.32      0.39        47

    accuracy                           0.88       382
   macro avg       0.70      0.64      0.66       382
weighted avg       0.86      0.88      0.86       382

########## BALANCED DATASET (SMOTE) ##########
Accuracy :  0.7159940209267563
Mean Square Error :  0.28400597907324365
[0. 1. 0. 0. 0.]
Confusion Matrix for each label : 
[[[233 120]
  [ 70 246]]

 [[246  70]
  [120 233]]]
Classification Report (Sigmoid kernel): 
              precision    recall  f1-score   support

         0.0       0.67      0.78      0.72       316
         1.0       0.77      0.