In [1]:
import pandas as pd
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from imblearn.over_sampling import SMOTE
from numpy import where

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

Using TensorFlow backend.


# Setup

In [None]:
# Load the job postings, using each posting's `job_id` as the row index.
df = pd.read_csv(
    "fake_job_postings.csv",
    index_col="job_id",
)

We identified False Negatives (a fraudulent offer that goes undetected) as the costliest errors, and we want to evaluate models with the ROC AUC score (together with the ROC curve, which helps us fine-tune the most efficient model). We have therefore decided to swap the values in the 'fraudulent' column. The model will now predict whether a job post is legitimate, so the error to minimise becomes the False Positive (an offer flagged as legitimate by the model that is actually fraudulent):

In [None]:
# Swap the target encoding: 0 becomes 1 and 1 becomes 0.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['fraudulent']`: that chained call acts
# on an intermediate Series and is not guaranteed to write through to `df`
# (pandas Copy-on-Write).
# NOTE: running this cell a second time swaps the labels back.
df['fraudulent'] = df['fraudulent'].replace({0: 1, 1: 0})

## Addressing the missing information

We have decided to address the missing information in the different columns by replacing every missing value with `NA`. Rather than reusing the 'Other' or 'Unspecified' values already available in certain columns, this lets us quickly spot when the information was not provided:
01. title:               No missing data
02. location:            NA
03. department:          NA
04. salary_range:        NA
05. company_profile:     NA
06. description:         NA
07. requirements:        NA
08. benefits:            NA
09. telecommuting:       No missing data
10. has_company_logo:    No missing data
11. has_questions:       No missing data
12. employment_type:     NA
13. required_experience: NA
14. required_education:  NA
15. industry:            NA
16. function:            NA
17. fraudulent:          No missing data

In [None]:
# Replace every missing value, in every column, with the string 'NA'.
# A single frame-level `fillna` is used instead of looping over columns with
# `df[column].fillna(..., inplace=True)`: the in-place call on a selected
# column operates on an intermediate Series and is not guaranteed to update
# `df` itself (pandas Copy-on-Write).
df = df.fillna('NA')

Some job offers contain contact details or an external URL. Could this be linked to fraudulent activity? Could it improve our model's performance?

In [None]:
def flag_placeholders(frame, columns, markers):
    """Add one 0/1 indicator column per placeholder marker.

    Parameters
    ----------
    frame : pandas.DataFrame
        Job postings; must contain every column listed in ``columns``.
    columns : list of str
        Free-text columns to search.
    markers : dict of {str: str}
        Maps the name of the indicator column to create to the literal
        substring to look for (e.g. ``{'hasURL': '#URL_'}``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``frame`` with the indicator columns appended, in the
        order of ``markers``. An indicator is 1 when the marker occurs in
        at least one of ``columns`` for that row, otherwise 0.
    """
    indicators = {}
    for flag_name, marker in markers.items():
        found = pd.Series(False, index=frame.index)
        for column in columns:
            # Literal substring match; missing values count as "not found".
            found |= frame[column].str.contains(marker, regex=False, na=False)
        indicators[flag_name] = found.astype('int64')
    return frame.assign(**indicators)


# The four free-text columns (positions 4 to 7 of the frame).
# Compared with the previous row-by-row loop this version:
#   * visits every row (`range(1, len(...))` never reached the last label),
#   * avoids chained assignment (`df['hasURL'][i] = 1`),
#   * sets each flag independently, so a text holding both a URL and an
#     e-mail placeholder raises both flags (the `elif` chain kept only one).
df = flag_placeholders(
    df,
    ['company_profile', 'description', 'requirements', 'benefits'],
    {'hasEMAIL': '#EMAIL_', 'hasPHONE': '#PHONE_', 'hasURL': '#URL_'},
)

Let's see if any clear pattern can already be identified:

In [None]:
# One value-count table per column, in column order.
[series.value_counts() for _, series in df.items()]

In [None]:
# Correlation heatmap of the predictors (the target column is excluded).
# Only numeric columns are kept before calling `.corr()`: the frame also
# holds free-text columns, and recent pandas versions raise on them instead
# of silently dropping them.
numeric_features = df.drop(columns=['fraudulent']).select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True);

No multicollinearity issues with those binary variables

## Text cleaning and pre-processing

In [None]:
# Insert a space before every uppercase letter that is followed by a
# lowercase one, in each of the four free-text columns (positions 4 to 7).
# Presumably this separates words that were fused together in the raw text
# (e.g. "teamWork") — confirm against the data.
# The substitution is vectorised so that every row is processed (the previous
# `range(1, len(...))` loop never reached the last label) and the result is
# written with a plain column assignment instead of chained indexing.
# NOTE: the substitution is not idempotent; re-running the cell adds spaces again.
for column in ['company_profile', 'description', 'requirements', 'benefits']:
    df[column] = df[column].str.replace(r'([A-Z][a-z])', r' \1', regex=True)

# Train Test Split

In [None]:
# Predictors: every column except the target.
features = df.drop('fraudulent', axis=1)
# Target: the 'fraudulent' column on its own.
labels = df.loc[:, 'fraudulent']

In [None]:
# Hold out a test set (scikit-learn's default split size).
# `stratify=labels` keeps the class ratio identical in both splits; with a
# strongly imbalanced target, a purely random split can leave the minority
# class under- or over-represented in the held-out data.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    random_state=666,
    stratify=labels,
)

# Initialising K-Folds: 5 shuffled folds with a fixed seed for reproducibility.
kfold = KFold(
    n_splits=5,
    random_state=666,
    shuffle=True,
)

# TF-IDF computation

In [None]:
# All four text columns share the same TF-IDF configuration; each gets its
# own vectorizer instance so that each learns its own vocabulary.
tfidf_options = dict(
    input='content',
    strip_accents='unicode',
    token_pattern=r'\w+',
    analyzer='word',
    stop_words='english',
)
vect_comp, vect_desc, vect_req, vect_ben = (
    TfidfVectorizer(**tfidf_options) for _ in range(4)
)

In [None]:
# Each vectorizer is fitted on the training split only and then applied,
# unchanged, to the test split.
# Every matrix below holds the TF-IDF of a single column:
#   _c -> company_profile, _cd -> description,
#   _cdr -> requirements,  _cdrb -> benefits.

# company_profile
features_train_vectorized_c = vect_comp.fit_transform(features_train['company_profile'].tolist())
features_test_vectorized_c = vect_comp.transform(features_test['company_profile'].tolist())

# description
features_train_vectorized_cd = vect_desc.fit_transform(features_train['description'].tolist())
features_test_vectorized_cd = vect_desc.transform(features_test['description'].tolist())

# requirements
features_train_vectorized_cdr = vect_req.fit_transform(features_train['requirements'].tolist())
features_test_vectorized_cdr = vect_req.transform(features_test['requirements'].tolist())

# benefits
features_train_vectorized_cdrb = vect_ben.fit_transform(features_train['benefits'].tolist())
features_test_vectorized_cdrb = vect_ben.transform(features_test['benefits'].tolist())

## Shape check

In [None]:
# Sanity check: each TF-IDF matrix must have one row per document of its split.
# The shape is read straight from the sparse matrix; calling `.todense()`
# first would allocate a full dense copy of every matrix only to report the
# very same (rows, columns) tuple. The printed text is unchanged.
tfidf_matrices = [
    ('company_profile', features_train_vectorized_c, features_test_vectorized_c),
    ('description', features_train_vectorized_cd, features_test_vectorized_cd),
    ('requirements', features_train_vectorized_cdr, features_test_vectorized_cdr),
    ('benefits', features_train_vectorized_cdrb, features_test_vectorized_cdrb),
]

for position, (column, train_matrix, test_matrix) in enumerate(tfidf_matrices):
    if position > 0:
        # Long separator between two columns.
        print('-'*40)
    print(f'{column} in train set:', len(features_train[column]))
    print('shape of densified train [[tfidf]]:', train_matrix.shape)
    # Short separator between the train and test report of one column.
    print('-'*20)
    print(f'{column} in test set:', len(features_test[column]))
    print('shape of densified test [[tfidf]]:', test_matrix.shape)

# Addressing the class imbalance

As class imbalance only matters during the training step, SMOTE will only be applied to the train dataset.

In [None]:
# SMOTE cannot be applied to raw text, so it is run on the TF-IDF
# representation instead.
# NOTE(review): recent imbalanced-learn releases appear to deprecate `n_jobs`
# on SMOTE — confirm against the installed version.
sm = SMOTE(
    n_jobs=4,
    random_state=666,
)

In [None]:
# Oversample the training data only. The resampling is applied to the TF-IDF
# matrix of the 'benefits' column together with the training labels.
X, y = sm.fit_resample(
    features_train_vectorized_cdrb,
    labels_train,
)

## Shape check

In [None]:
# A notebook cell only displays its last expression, so each check is printed
# explicitly. The trailing comments are the values noted by the original
# author for an earlier run; they may differ for the current split.
print('X.shape:', X.shape)  # (19184, 9915)
print('y.shape:', y.shape)  # (19184,)
print('Class counts before SMOTE:')
print(labels_train.value_counts())  # 9592/465
print('Class counts after SMOTE:')
print(y.value_counts())  # 9592/9592

# Gaussian Naive Bayes

In [2]:
# Gaussian Naive Bayes classifier, created with its default settings.
gnb = GaussianNB()

In [None]:
# Train on the SMOTE-resampled data. The labels must be the resampled `y`:
# `labels_train` has one entry per original training row and therefore does
# not match the number of rows in `X`.
# GaussianNB needs a dense input; `.toarray()` returns a plain ndarray,
# whereas `.todense()` returns an `np.matrix`, which recent scikit-learn
# versions reject.
gnb.fit(X.toarray(), y)

In [None]:
# Predictions on the (resampled) training matrix, one per row of `X`.
# `.toarray()` yields a plain ndarray; the `np.matrix` returned by
# `.todense()` is rejected by recent scikit-learn versions.
labels_train_pred = gnb.predict(X.toarray())

In [None]:
# `labels_train_pred` holds one prediction per row of the resampled matrix
# `X`, so it is scored against the resampled labels `y`; `labels_train` has a
# different length and cannot be compared element-wise.
print('Accuracy (on train set):', (labels_train_pred == y).sum() / len(y))

# Before SMOTE
# Accuracy (on train set): 0.5287859202545491
# Accuracy (on validation set): 0.5144646585147629

Massive overfit!

# Multinomial Naive Bayes

In [3]:
# Multinomial Naive Bayes classifier, created with its default settings.
mnb = MultinomialNB()

In [None]:
# Train on the SMOTE-resampled data. The labels must be the resampled `y`:
# `labels_train` does not have the same number of rows as `X`.
# MultinomialNB accepts sparse input, so the matrix is passed as-is instead
# of being densified.
mnb.fit(X, y)

In [None]:
# Predictions on the (resampled) training matrix, one per row of `X`.
labels_train_pred = mnb.predict(X)
# No `features_validate_vectorized_cdrb` is ever created in this notebook;
# the only held-out data is the test split, so its 'benefits' TF-IDF matrix
# is used for validation. Sparse input is passed directly.
labels_validate_pred = mnb.predict(features_test_vectorized_cdrb)

In [None]:
# Training predictions cover the resampled matrix `X`, so they are scored
# against the resampled labels `y` rather than `labels_train`.
print('Accuracy (on train set):', (labels_train_pred == y).sum() / len(y))
# `labels_validate` is never defined in this notebook; the held-out labels
# available are `labels_test`, which match the test-split predictions.
print('Accuracy (on validation set):', (labels_validate_pred == labels_test).sum() / labels_test.count())

Massive overfit!

# Random Forest

In [4]:
# Random forest of 100 trees. The seed is fixed, as for the split, the
# K-Folds and SMOTE (all seeded with 666), so that results are reproducible
# from one run to the next.
rfc = RandomForestClassifier(n_estimators=100, random_state=666)

In [None]:
# Train on the SMOTE-resampled data. The labels must be the resampled `y`:
# `labels_train` does not have the same number of rows as `X`.
# RandomForestClassifier accepts sparse input, so no densification is needed.
rfc.fit(X, y)

In [None]:
# Predictions on the (resampled) training matrix, one per row of `X`.
labels_train_pred = rfc.predict(X)
# No `features_validate_vectorized_cdrb` is ever created in this notebook;
# the only held-out data is the test split, so its 'benefits' TF-IDF matrix
# is used for validation. Sparse input is passed directly.
labels_validate_pred = rfc.predict(features_test_vectorized_cdrb)

In [None]:
# Training predictions cover the resampled matrix `X`, so they are scored
# against the resampled labels `y` rather than `labels_train`.
print('Accuracy (on train set):', (labels_train_pred == y).sum() / len(y))
# `labels_validate` is never defined in this notebook; the held-out labels
# available are `labels_test`, which match the test-split predictions.
print('Accuracy (on validation set):', (labels_validate_pred == labels_test).sum() / labels_test.count())

Massive overfit!

## EDA following TF-IDF

In [None]:
# `dffraud` was never defined in this notebook, so it is built here.
# After the label swap performed in the setup section, fraudulent postings
# are the rows where 'fraudulent' == 0 — this assumes the swap cell has been
# run exactly once; verify before relying on the output.
dffraud = df[df['fraudulent'] == 0]

# 'requirements' texts of fraudulent postings that mention 'URL'.
dffraud.loc[
    dffraud['requirements'].str.contains('URL', regex=False, na=False),
    'requirements',
].tolist()