In [None]:
from collections import Counter
import re

import matplotlib.pyplot as plt
import numpy as np
from numpy import where
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Setup

In [None]:
# Load the postings and use the `job_id` column as the row index.
df = pd.read_csv(
    "fake_job_postings.csv",
    index_col='job_id',
)

We have identified false negatives (a fraudulent offer that goes undetected) as the most costly errors, and we want to evaluate the models with the ROC AUC score, together with the ROC curve, which helps us fine-tune the most efficient model. We have therefore decided to swap the values in the 'fraudulent' column. The model will now predict whether a job post is legitimate (1) or fraudulent (0), which lets us minimise the false positives (offers that the model flags as legitimate but that are actually fraudulent):

In [None]:
# Swap the label encoding: 0 becomes 1 and 1 becomes 0, so that after this cell
# a value of 1 in 'fraudulent' marks a legitimate posting.
# The result is assigned back to the column instead of calling an in-place
# method on `df['fraudulent']`: pandas does not guarantee that an in-place
# operation on a column selection writes through to the parent frame.
# NOTE: running this cell a second time swaps the labels back; run it once per
# kernel session.
df['fraudulent'] = df['fraudulent'].replace({0: 1, 1: 0})

## Addressing the missing information

We have decided to address the missing information in the different columns by replacing the lack of information by `NA`. Instead of using the 'Other' or 'Unspecified' values available in certain columns, this will allow us to quickly spot when the information was not provided:
01. title:               No missing data
02. location:            NA
03. department:          NA
04. salary_range:        NA
05. company_profile:     NA
06. description:         NA
07. requirements:        NA
08. benefits:            NA
09. telecommuting:       No missing data
10. has_company_logo:    No missing data
11. has_questions:       No missing data
12. employment_type:     NA
13. required_experience: NA
14. required_education:  NA
15. industry:            NA
16. function:            NA
17. fraudulent:          No missing data

In [None]:
# Replace every missing value with the string 'NA' in a single frame-level
# call. The previous per-column `fillna(..., inplace=True)` acted on a column
# selection, which pandas does not guarantee to propagate back to `df`.
df = df.fillna('NA')

Some job offers contain contact details or an external URL. Could this be linked to fraudulent activity? Could it improve our model's performance?

In [None]:
# Flag every posting whose free-text fields contain a '#URL_', '#PHONE_' or
# '#EMAIL_' placeholder token.
#
# Differences from the earlier row-by-row loop:
#   * every row is inspected (the loop used labels 1..len-1, so the row with the
#     highest job_id was never checked);
#   * the three flags are independent (the `elif` chain stopped at the first
#     placeholder found in a field, so a field holding both a URL and an e-mail
#     only ever set `hasURL`);
#   * values are written with a plain column assignment instead of chained
#     indexing (`df['hasURL'][i] = 1`), which may silently write to a copy.
PLACEHOLDER_FLAGS = {'hasURL': '#URL_', 'hasPHONE': '#PHONE_', 'hasEMAIL': '#EMAIL_'}

# Positions 4..7 are read before the flag columns are appended; `assign` adds
# new columns at the end, so the slice is the same one the loop used.
text_columns = df.columns[4:8]
df = df.assign(hasEMAIL=0, hasPHONE=0, hasURL=0)
for flag, placeholder in PLACEHOLDER_FLAGS.items():
    # Literal substring search; a missing or non-string value counts as "absent".
    found = df[text_columns].apply(
        lambda col: col.str.contains(placeholder, regex=False, na=False)
    )
    df[flag] = found.any(axis=1).astype(int)

Let's see if any clear pattern can already be identified:

In [None]:
# One frequency table per column, in column order.
[df[name].value_counts() for name in df]

In [None]:
# Correlation heatmap of the numeric predictors only. The frame also holds
# free-text columns; recent pandas versions raise when `corr()` meets
# non-numeric data, so the numeric columns are selected explicitly.
numeric_features = df.drop(columns=['fraudulent']).select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True);

No multicollinearity issues with those binary variables

## Text cleaning and pre-processing

In [None]:
# Insert a space in front of every "uppercase letter followed by a lowercase
# letter" pair in the four text columns at positions 4..7.
# The replacement is vectorised so that every row is processed (the earlier
# index loop ran over labels 1..len-1 and skipped the last job_id) and the
# result is assigned to the column directly rather than through chained
# indexing, which may write to a temporary copy.
for column in df.columns[4:8]:
    df[column] = df[column].str.replace(r'([A-Z][a-z])', r' \1', regex=True)

# Train Test Split

In [None]:
# Target vector: the 'fraudulent' column.
labels = df.loc[:, 'fraudulent']
# Predictors: every other column.
features = df.drop('fraudulent', axis=1)

In [None]:
# Hold out a test split (scikit-learn's default proportion) with a fixed seed.
# NOTE(review): the split is not stratified although the classes are
# imbalanced; consider whether `stratify=labels` is wanted.
split_parts = train_test_split(features, labels, random_state=666)
features_train, features_test, labels_train, labels_test = split_parts

# Initialising K-Folds
# NOTE(review): `kfold` is not referenced by any other cell visible here.
kfold = KFold(n_splits=5, shuffle=True, random_state=666)

# TF-IDF computation

In [None]:
# One independent TF-IDF vectoriser per text column, all built with the same
# settings.
tfidf_options = dict(
    input='content',
    strip_accents='unicode',
    token_pattern=r'\w+',
    analyzer='word',
    stop_words='english',
)
vect_comp, vect_desc, vect_req, vect_ben = (
    TfidfVectorizer(**tfidf_options) for _ in range(4)
)

In [None]:
# Fit each vectoriser on its training column only, then reuse the learned
# vocabulary to transform the matching test column.
# NOTE(review): despite the cumulative-looking suffixes (_c, _cd, _cdr, _cdrb),
# each matrix holds the TF-IDF of a single column; nothing is concatenated here.
features_train_vectorized_c = vect_comp.fit_transform(features_train['company_profile'].tolist())
features_test_vectorized_c = vect_comp.transform(features_test['company_profile'].tolist())

features_train_vectorized_cd = vect_desc.fit_transform(features_train['description'].tolist())
features_test_vectorized_cd = vect_desc.transform(features_test['description'].tolist())

features_train_vectorized_cdr = vect_req.fit_transform(features_train['requirements'].tolist())
features_test_vectorized_cdr = vect_req.transform(features_test['requirements'].tolist())

features_train_vectorized_cdrb = vect_ben.fit_transform(features_train['benefits'].tolist())
features_test_vectorized_cdrb = vect_ben.transform(features_test['benefits'].tolist())

## Shape check

In [None]:
# Print, for each text column, the number of documents and the shape of its
# TF-IDF matrix in both splits.
# The shape is read straight from the sparse matrix: a dense copy has exactly
# the same shape, and materialising eight dense matrices just to read it costs
# a large amount of memory for nothing. The printed text is unchanged.
shape_report = [
    ('company_profile', features_train_vectorized_c, features_test_vectorized_c),
    ('description', features_train_vectorized_cd, features_test_vectorized_cd),
    ('requirements', features_train_vectorized_cdr, features_test_vectorized_cdr),
    ('benefits', features_train_vectorized_cdrb, features_test_vectorized_cdrb),
]
for position, (column, train_tfidf, test_tfidf) in enumerate(shape_report):
    if position:
        # Long rule between two columns (not printed before the first one).
        print('-'*40)
    print(f'{column} in train set:', len(features_train[column]))
    print('shape of densified train [[tfidf]]:', train_tfidf.shape)
    print('-'*20)
    print(f'{column} in test set:', len(features_test[column]))
    print('shape of densified test [[tfidf]]:', test_tfidf.shape)

# Addressing the class imbalance

As class imbalance only matters during the training step, SMOTE will only be applied to the train dataset.

In [None]:
# SMOTE doesn't work on text, it needs to be changed to TF-IDF
# `n_jobs` is no longer passed: imbalanced-learn deprecated that argument
# (since 0.10), and it only affected parallelism, not the resampled output.
sm = SMOTE(random_state=666)

In [None]:
# Oversample the training split so both classes are equally represented.
# NOTE(review): only the TF-IDF matrix fitted on 'benefits' is resampled and
# later used for training; confirm the other three text columns are meant to
# be left out.
resampled = sm.fit_resample(features_train_vectorized_cdrb, labels_train)
X, y = resampled

## Shape check

In [None]:
# Compare sizes and class counts before and after resampling.
for tag, value in (
    ('X', X.shape),
    ('y', y.shape),
    ('labels_train', labels_train.value_counts()),
    ('y.value_counts', y.value_counts()),
):
    print(tag, value)

# Gaussian Naive Bayes

In [None]:
gnb = GaussianNB()

In [None]:
gnb.fit(X.todense(), y)

In [None]:
labels_train_pred = gnb.predict_proba(X.todense())

In [None]:
print('Accuracy (on train set):', (labels_train_pred==labels_train).sum()/labels_train.count())

# Before SMOTE
# Accuracy (on train set): 0.5287859202545491
# Accuracy (on validation set): 0.5144646585147629

In [None]:
# ROC curve on the training data, scored with the second probability column.
train_scores = labels_train_pred[:, 1]
fpr, tpr, thresholds = roc_curve(y, train_scores)
print('AUC train dataset: {}'.format(auc(fpr, tpr)))

Massive overfit!

# Multinomial Naive Bayes

In [None]:
mnb = MultinomialNB()

In [None]:
mnb.fit(X.todense(), y)

In [None]:
labels_train_pred = mnb.predict(X.todense())

In [None]:
print('Accuracy (on train set):', (labels_train_pred==labels_train).sum()/labels_train.count())

Massive overfit!

# Random Forest

In [None]:
rfc = RandomForestClassifier(n_estimators=100)

In [None]:
rfc.fit(X.todense(), y)

In [None]:
labels_train_pred = rfc.predict(X.todense())

In [None]:
print('Accuracy (on train set):', (labels_train_pred==labels_train).sum()/labels_train.count())

Massive overfit!

# Finalising the selected model

## Confusion matrix

In [None]:
# Predict the held-out split. `rfc` was fitted on the resampled 'benefits'
# TF-IDF matrix, so the test matrix built by the same vectoriser is used here
# (the names `X_test` / `y_test` are not defined anywhere in this notebook).
y_hat_test = rfc.predict(features_test_vectorized_cdrb)

# Fix the row/column order to the classifier's own class order so that the
# tick labels below are guaranteed to line up with the cells.
cnf_matrix = confusion_matrix(labels_test, y_hat_test, labels=rfc.classes_)

# Create the basic matrix
plt.imshow(cnf_matrix,  cmap=plt.cm.Blues)

plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Add appropriate axis scales
class_names = list(rfc.classes_)  # ordered, unlike a set
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

# Add labels to each cell
thresh = cnf_matrix.max() / 2. # Used for text coloring below
# Walk over every (row, column) cell and write its count on top of it.
for i, j in np.ndindex(cnf_matrix.shape):
    plt.text(j, i, cnf_matrix[i, j],
             horizontalalignment='center',
             color='white' if cnf_matrix[i, j] > thresh else 'black')

# Add a legend
plt.colorbar()
plt.show()

In [None]:
# Module-level tally; later cells read it after `conf_matrix` has been called.
cm = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
def conf_matrix(self,y_test, y_hat_test):
    """Count TP / TN / FP / FN, treating label 1 as the positive class.

    Parameters
    ----------
    self : object
        Any object; the counts are stored on it as ``self.cm_values``.
    y_test : iterable
        True labels, consumed in iteration order.
    y_hat_test : array-like
        Predicted labels, matched to ``y_test`` by position.

    Returns
    -------
    dict
        The module-level ``cm`` dict, holding the counts of this call only.

    Raises
    ------
    IndexError
        If ``y_hat_test`` has fewer entries than ``y_test``.
    """
    # Convert once so predictions are always addressed by position, even when
    # a pandas Series with a non-default index is passed in.
    preds = np.asarray(y_hat_test)

    counts = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
    for ind, label in enumerate(y_test):
        hit = label == preds[ind]
        if label == 1:
            counts['TP' if hit else 'FN'] += 1
        else:
            counts['TN' if hit else 'FP'] += 1

    # Overwrite the shared tally instead of adding to it, so calling the
    # function again (or re-running the cell) does not double count.
    cm.update(counts)
    # Store a snapshot; it is set even when `y_test` is empty.
    self.cm_values = dict(counts)
    return cm

In [None]:
# `labels_test` holds the true labels of the held-out split; the name `y_test`
# is not defined anywhere in this notebook.
conf_matrix(rfc, labels_test, y_hat_test)

## Receiver Operating Characteristic

In [None]:
def buildROC(target_train, train_preds, target_test,test_preds):
    """Plot train and validation ROC curves on one figure and save it.

    Parameters
    ----------
    target_train, target_test : array-like
        True binary labels of the two splits.
    train_preds, test_preds : array-like
        Scores (or predictions) for the matching split, as accepted by
        ``roc_curve``.

    Side effects
    ------------
    Draws on the current matplotlib figure and writes it to 'roc.png'.
    """
    # `roc_curve` and `auc` are imported by name at the top of the notebook;
    # there is no `metrics` module object in scope.
    fpr, tpr, _ = roc_curve(target_test, test_preds)
    roc_auc = auc(fpr, tpr)
    fpr1, tpr1, _ = roc_curve(target_train, train_preds)
    roc_auc1 = auc(fpr1, tpr1)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr1, tpr1, 'b', label = 'Train AUC = %0.2f' % roc_auc1)
    # Green only: passing both the 'b' format string and color='g' defines the
    # colour twice and makes matplotlib warn.
    plt.plot(fpr, tpr, 'g', label = 'Validation AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Chance diagonal.
    plt.plot([0, 1], [0, 1],'r--')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.gcf().savefig('roc.png')

In [None]:
# NOTE(review): the estimator name was missing in front of `.predict` (a syntax
# error). `rfc` is assumed because it is the model used for the confusion
# matrix in this section - confirm.
# Positive-class probabilities are used rather than hard labels so the ROC AUC
# reflects the ranking of the scores; column 1 corresponds to `rfc.classes_[1]`.
# `X` / `y` are the resampled training data the forest was fitted on, and
# `features_test_vectorized_cdrb` / `labels_test` the matching held-out split
# (`X_train`, `y_train`, `X_test`, `y_test` are not defined in this notebook).
training_preds = rfc.predict_proba(X)[:, 1]
test_preds = rfc.predict_proba(features_test_vectorized_cdrb)[:, 1]

training_roc_auc = roc_auc_score(y, training_preds)
test_roc_auc = roc_auc_score(labels_test, test_preds)

print('Training ROC_AUC: {:.4}%'.format(training_roc_auc * 100))
print('Validation ROC_AUC: {:.4}%'.format(test_roc_auc * 100))

### ROC plot

In [None]:
# Plotting the ROC curve
# `y` are the (resampled) training labels and `labels_test` the held-out
# labels; `y_train`, `y_validate` and `validate_preds` are not defined in this
# notebook. `training_preds` / `test_preds` come from the ROC AUC cell above.
buildROC(y, training_preds, labels_test, test_preds)

In [None]:
def annot(fpr,tpr,thr):
    """Annotate every 75th (fpr, tpr) point with its threshold.

    The threshold at the same position in `thr` is rounded to two decimals and
    drawn at the point on the current matplotlib axes.
    """
    for position, point in enumerate(zip(fpr, tpr)):
        if position % 75:
            continue
        plt.annotate(round(thr[position], 2), xy=point, textcoords='data')

In [None]:
# Overlay the ROC curves of the available splits.
# NOTE(review): the `y_*_score` arrays and a validation split are not defined
# anywhere in this notebook, so the curves are rebuilt from `rfc` for the two
# splits that exist (resampled train, held-out test) - confirm `rfc` is the
# intended model.
roc_inputs = [
    ('train', y, rfc.predict_proba(X)[:, 1]),
    ('test', labels_test, rfc.predict_proba(features_test_vectorized_cdrb)[:, 1]),
]
# Loop-local names are used so the `fpr` / `tpr` / `thresholds` arrays computed
# earlier in the notebook are not overwritten.
for split_name, split_labels, split_scores in roc_inputs:
    curve_fpr, curve_tpr, curve_thr = roc_curve(split_labels, split_scores)
    plt.plot(curve_fpr, curve_tpr, label=split_name)
# Only the last curve drawn (test) gets threshold annotations.
annot(curve_fpr, curve_tpr, curve_thr)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.ylabel('TPR (power)')
plt.xlabel('FPR (alpha)')
plt.legend()
plt.show()

### Threshold selection

In [None]:
# Pick the threshold with the largest (tpr - fpr) gap; stays 0 when no point
# has a positive gap. On ties the earliest threshold wins.
# NOTE(review): `thresholds` is only assigned in the Gaussian NB train-ROC
# cell. Make sure `fpr` / `tpr` still hold that same curve when this cell runs,
# and confirm that this is the curve the threshold should be tuned on.
threshold_chosen = 0
difference = 0
for idx, candidate in enumerate(thresholds):
    gain = tpr[idx] - fpr[idx]
    if gain > difference:
        difference, threshold_chosen = gain, candidate
threshold_chosen = round(threshold_chosen,2)
print('Best Threshold: ',threshold_chosen)

## Cost Function

In [None]:
#The cost function terms are wrong, double check those!
# NOTE(review): the four descriptions below talk about "a stat that improves
# your game", not about job postings, and cFN - cTP is negative, which makes
# the slope `_m` negative. The cost values are left untouched and still need
# to be chosen for this problem.

# Focusing on a stat that improves your game
cTP = 100
# Not focusing on a stat that doesn't improve your game
cTN = 1
# Focusing on a stat that doesn't improve your game
cFP = 100
# Not focusing on a stat that improves your game
cFN = 20

# `cm` is the tally filled in by `conf_matrix`; it must have been run first
# (an all-zero tally raises ZeroDivisionError here).
n_positive = cm['TP'] + cm['FN']
n_negative = cm['FP'] + cm['TN']
prevalence = n_positive / (n_positive + n_negative)

# Metz coefficient
_m = ((1-prevalence)/prevalence) * ((cFP-cTN) / (cFN-cTP))

# Using the Zweig & Campbell equation:
#   f_m = sensitivity - m * (1 - specificity)
# The equation is defined on rates, so the counts are converted to the true
# and false positive rates first (raw TP / FP counts were used before).
sensitivity = cm['TP'] / n_positive
false_positive_rate = cm['FP'] / n_negative if n_negative else 0.0
function_m = sensitivity - _m * false_positive_rate
function_m