In [1]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Setup

In [2]:
# split dataset in legit vs fraudulent job posts
df = pd.read_csv("fake_job_postings.csv",index_col='job_id')
dflegit = df[df['fraudulent']==0]
dffraud = df[df['fraudulent']==1]

# fillna with NoText to allow for modeling
dffraud['requirements'].fillna(value='NoText', inplace=True)
dflegit['requirements'].fillna(value='NoText', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [3]:
# Creates a balanced dataset - UNDERSAMPLING
lil_dflegit = dflegit.sample(n=len(dffraud), random_state=666)
lil_df = pd.concat([lil_dflegit, dffraud])
lil_df['fraudulent'].value_counts()

1    866
0    866
Name: fraudulent, dtype: int64

In [4]:
lil_dflegit.index

Int64Index([10352, 11791, 11459,   451, 13692,   325,  7543, 12771, 16087,
             9751,
            ...
              868,  4796,  2859,  6732,  2705,  9531, 13583, 15707, 11989,
            16892],
           dtype='int64', name='job_id', length=866)

In [5]:
labels = lil_df['fraudulent']
features = lil_df.drop(columns=['fraudulent'])

# Train Test Split

In [6]:
features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                    labels,
                                                    random_state=666)

features_train, features_validate, labels_train, labels_validate = train_test_split(features_train,
                                                           labels_train,
                                                           random_state=666
                                                           )

# Initialising K-Folds
# kfold = KFold(n_splits=5,
#               random_state=666,
#               shuffle=True
#              )

# TF-IDF computation

In [7]:
vect = TfidfVectorizer(input='content', stop_words='english')

In [8]:
#fit_transform
features_train_vectorized = vect.fit_transform([document for document in features_train['requirements']])
#transform
features_validate_vectorized = vect.transform([document for document in features_validate['requirements']])
features_test_vectorized = vect.transform([document for document in features_test['requirements']])

## Shape check

In [9]:
print('requirements in train set:', len([document for document in features_train['requirements']]))
print('shape of densified train [[tfidf]]:', features_train_vectorized.todense().shape)

requirements in train set: 974
shape of densified train [[tfidf]]: (974, 7741)


In [10]:
print('requirements in validation set:', len([document for document in features_validate['requirements']]))
print('shape of densified validation [[tfidf]]:', features_validate_vectorized.todense().shape)

requirements in validation set: 325
shape of densified validation [[tfidf]]: (325, 7741)


In [11]:
print('requirements in test set:', len([document for document in features_test['requirements']]))
print('shape of densified test [[tfidf]]:', features_test_vectorized.todense().shape)

requirements in test set: 433
shape of densified test [[tfidf]]: (433, 7741)


# Gaussian Naive Bayes

In [12]:
gnb = GaussianNB()

In [13]:
gnb.fit(features_train_vectorized.todense(), labels_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [14]:
labels_train_pred = gnb.predict(features_train_vectorized.todense())
labels_validate_pred = gnb.predict(features_validate_vectorized.todense())

In [15]:
print('Accuracy (on train set):', (labels_train_pred==labels_train).sum()/labels_train.count())
print('Accuracy (on validation set):', (labels_validate_pred==labels_validate).sum()/labels_validate.count())

Accuracy (on train set): 0.9209445585215605
Accuracy (on validation set): 0.7876923076923077


Massive overfit!

# Multinomial Naive Bayes

In [16]:
mnb = MultinomialNB()

In [17]:
mnb.fit(features_train_vectorized.todense(), labels_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
labels_train_pred = mnb.predict(features_train_vectorized.todense())
labels_validate_pred = mnb.predict(features_validate_vectorized.todense())

In [19]:
print('Accuracy (on train set):', (labels_train_pred==labels_train).sum()/labels_train.count())
print('Accuracy (on validation set):', (labels_validate_pred==labels_validate).sum()/labels_validate.count())

Accuracy (on train set): 0.893223819301848
Accuracy (on validation set): 0.7846153846153846


Massive overfit!

# Random Forest

In [20]:
rfc = RandomForestClassifier(n_estimators=100)

In [21]:
rfc.fit(features_train_vectorized.todense(), labels_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
labels_train_pred = rfc.predict(features_train_vectorized.todense())
labels_validate_pred = rfc.predict(features_validate_vectorized.todense())

In [23]:
print('Accuracy (on train set):', (labels_train_pred==labels_train).sum()/labels_train.count())
print('Accuracy (on validation set):', (labels_validate_pred==labels_validate).sum()/labels_validate.count())

Accuracy (on train set): 0.9281314168377823
Accuracy (on validation set): 0.7876923076923077


Massive overfit!

## EDA following TF-IDF

In [24]:
[text for text in dffraud[dffraud['requirements'].str.find('URL')!=-1]['requirements']]

['Position Requirements: Experience supporting and developing on a version of CA’s service management software including SDM, CMDB, ITAM, ITPAM, and CAPA – version 12 a plusOO design conceptsJava, C# , VB.net, &amp;.Net Framework skills requiredMicrosoft Reporting ServicesBusiness Objects XIRDBMS – SQL Server and Oracle preferredXML – Web services and SOAP#URL_86fd830a95a64e2b30ceed829e63fd384c289e4f01e3c93608b42a84f6e662dd# a plusKnowledge of ITIL and/or Six Sigma and SDLCStrong interpersonal skillsHighly motivated and self-directedKnowledge of data securityStrong written and oral communication skillsProven analytical and problem-solving abilitiesExperience migrating a problem/change management system from one version to another and/or to a new product. Not required, but a plus.Knowledge of other problem and change management systems is a plus (ie, HP, IBM,\xa0 Symantec, etc..)\xa0\xa0',
 'Job RequirementsTechnical Requirements:Working knowledge of: MapInfo GIS, ArcGIS, SMT Kingdom Su