### Yunhe Jia

In [20]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import spacy
import re
import math
import time

In [9]:
data = pd.read_csv('./fake_job_postings.csv')
data = data.fillna('')

In [147]:
data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,comb
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,"Food52, a fast-growing, James Beard Award-winn..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,Organised - Focused - Vibrant - Awesome!Do you...
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,"Our client, located in Houston, is actively se..."
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,THE COMPANY: ESRI – Environmental Systems Rese...
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,JOB TITLE: Itemization Review ManagerLOCATION:...


combine three columns for model input

In [10]:
data['comb'] = data['description']+data['requirements']+data['benefits']
df = data[['comb','fraudulent']]

In [16]:
df.head()

Unnamed: 0,comb,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0


### Data Processing

In [21]:
# disabling some fancy features of spacy for speed
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

# nlp = spacy.load('en_core_web_sm')
rows = []
for idx in tqdm(range(len(df))):
    row = df.iloc[idx].copy()
    
    # first we remove numeric characters and lowercase everything
    cleaned_head = re.sub("[^A-Za-z']+", ' ', row['comb']).replace("'", ' ').lower()
    
    # we let spaCy tokenize and lemmatize the text for us
    tokenized_head = nlp(cleaned_head)
    cleaned_tokenized = [token.lemma_ for token in tokenized_head if ((not token.is_stop) or (' ' in token.text))]
    
    if len(cleaned_tokenized) > 1:
        row['cleaned'] = ' '.join(cleaned_tokenized)
    rows.append(row)
df_clean = pd.DataFrame(rows)
df_clean = df_clean[~df_clean.cleaned.isna()]
df_clean.head()

  0%|          | 0/17880 [00:00<?, ?it/s]

Unnamed: 0,comb,fraudulent,cleaned
0,"Food52, a fast-growing, James Beard Award-winn...",0,food fast grow james beard award win online fo...
1,Organised - Focused - Vibrant - Awesome!Do you...,0,organise focus vibrant awesome passion custome...
2,"Our client, located in Houston, is actively se...",0,client locate houston actively seek experience...
3,THE COMPANY: ESRI – Environmental Systems Rese...,0,company esri environmental system research ins...
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0,job title itemization review managerlocation f...


In [22]:
df_clean.to_csv('./job_cleaned.csv')

In [23]:
from sklearn.model_selection import train_test_split

df_train,df_valid = train_test_split(df_clean,train_size=0.7,random_state=30)
df_valid,df_test = train_test_split(df_valid,train_size=0.5,random_state=30)

## Prepare for Training

In [24]:
from collections import Counter

headlines = [headline.split(' ') for headline in list(df_train['cleaned'])]
word_freq = Counter([token for headline in headlines for token in headline]).most_common()

# # remove words that appear infrequently
word_freq = dict(word_freq)
# print(len(word_freq))
min_freq = 5
word_dict = {}

# # sending all the unknowns to 0
i = 1
for word in word_freq:
    if word_freq[word] > min_freq:
        word_dict[word] = i
        i += 1
    else:
        word_dict[word] = 0

# dictionary length        
dict_length = max(word_dict.values()) + 1
dict_length

12635

In [25]:
max_length = 0
for idx in tqdm(range(len(df_train))):
#     print ()
    row = df_train.iloc[idx]
#     print (row)
    length = len(row['cleaned'].split(' '))
    if length > max_length:
        max_length = length
print(max_length)

  0%|          | 0/12513 [00:00<?, ?it/s]

994


In [26]:
class JobDataset(Dataset):
    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.max_len = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        posting = row['cleaned'].split(' ')
        x = torch.zeros(self.max_len)
        
        for idx in range(len(posting)):
            
            # we want to front pad for RNN
            x[self.max_len - len(posting) + idx] = self.word_dict.get(posting[idx],0)
            
        y = torch.tensor(row['fraudulent']).float()
        
        # embedding likes long tensors
        return x.long(), y


## RNN model with GRU layer

In [27]:
class BIGRU(nn.Module):
    def __init__(self,word_dict, embedding_size, hidden_size):
        super(BIGRU, self).__init__()
        
        self.word_dict = word_dict
        self.hidden_size = hidden_size
        
        # integer to word dictionary
        self.idx2word = dict([(x, y) for x, y in zip(self.word_dict.values(), self.word_dict.keys())])
        self.idx2word[0] = 'UNK'
        
        # length of dictionary
        dict_length = max(word_dict.values()) + 1
        
        # embed the words
        self.emb = nn.Embedding(dict_length, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(hidden_size, 1) # input dim is 64*2 because its bidirectional

    def forward(self, x, h):
        x = self.emb(x)
        x, h = self.gru(x, h)
        x = self.dropout(x[:,-1,:].squeeze()) # just get the last hidden state
        x = self.linear(x) # sigmoid output for binary classification
        return x.squeeze()

    def init_hidden(self):
        return autograd.Variable(torch.randn(1, 1000, 100))

In [151]:
def one_pass(model,h, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    total_incorrect = 0
    tp = 0
    fp = 0
    fn = 0
    len_ = 0
    for x, y in tqdm(dataloader):
        if len(x)==1000:
            y_pred = model(x,h)
            loss = lossFun(y_pred, y.float())
            total_loss += loss.item()

            if backwards == True:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            y_pred = torch.sigmoid(y_pred).round().long()
            y = y.long()
            total_incorrect += torch.count_nonzero(y - y_pred).item()
            tp += (y & y_pred).sum()
            fp += (y_pred & (~y)).sum()
            fn = ((~y_pred) & y).sum()
            len_ += x.shape[0]
    
        percent_wrong = total_incorrect / len_

        avg_loss = total_loss / len(dataloader)

        if print_loss == True:
            print(avg_loss)
    
    return np.round(1-percent_wrong,2),np.round((tp/(tp+fp)).numpy(),2),np.round((tp/(tp+fn)).numpy(),2)

def one_pass_acc(model,h,dataloader):
    model.eval()
    total_incorrect = 0
    total_loss = 0.0
    total_incorrect = 0
    tp = 0
    fp = 0
    fn = 0
    len_ = 0    
    for x, y in dataloader:
        print (len(x))
        if len(x)==1000:
            y_pred = torch.sigmoid(model(x,h)).round().long()
            y = y.long()
            total_incorrect += torch.count_nonzero(y - y_pred).item()
            tp += (y & y_pred).sum()
            fp += (y_pred & (~y)).sum()
            fn = ((~y_pred) & y).sum()
            len_ += x.shape[0]
    
        percent_wrong = total_incorrect / len_

        avg_loss = total_loss / len(dataloader)
    
    return np.round(1-percent_wrong,2),np.round((tp/(tp+fp)).numpy(),2),np.round((tp/(tp+fn)).numpy(),2)

In [152]:
ds_train = JobDataset(df_train, word_dict, max_length)
ds_valid = JobDataset(df_valid, word_dict, max_length)
dl_train = DataLoader(ds_train, batch_size=1000, shuffle=True)
dl_valid = DataLoader(ds_valid, batch_size=1000, shuffle=False)

In [153]:
lossFun = nn.BCEWithLogitsLoss()
model = BIGRU(word_dict, embedding_size=100, hidden_size=100)
optimizer = optim.Adam(model.parameters(), lr = 0.01)

In [154]:
h = model.init_hidden()

num_epochs = 10

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    acc_train,prec_train,recall_train = one_pass(model,h,dl_train, optimizer, lossFun)
    acc_val,prec_val,recall_val = one_pass_acc(model,h, dl_valid)
    print('train accuracy: ', acc_train,'train precision: ',prec_train,'train recall: ',recall_train)
    print('valid accuracy: ', acc_val,'valid precision: ',prec_val,'valid recall: ',recall_val)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  0.9 train precision:  0.06 train recall:  0.56
valid accuracy:  0.95 valid precision:  0.71 valid recall:  0.08
Epoch:  1


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  0.96 train precision:  0.86 train recall:  0.67
valid accuracy:  0.96 valid precision:  0.87 valid recall:  0.3
Epoch:  2


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  0.96 train precision:  0.84 train recall:  0.85
valid accuracy:  0.97 valid precision:  0.93 valid recall:  0.53
Epoch:  3


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  0.97 train precision:  0.89 train recall:  0.89
valid accuracy:  0.97 valid precision:  0.95 valid recall:  0.49
Epoch:  4


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  0.98 train precision:  0.92 train recall:  0.94
valid accuracy:  0.97 valid precision:  0.89 valid recall:  0.62
Epoch:  5


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  0.99 train precision:  0.95 train recall:  0.99
valid accuracy:  0.98 valid precision:  0.91 valid recall:  0.7
Epoch:  6


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  0.99 train precision:  0.98 train recall:  0.99
valid accuracy:  0.98 valid precision:  0.9 valid recall:  0.72
Epoch:  7


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  1.0 train precision:  0.98 train recall:  1.0
valid accuracy:  0.98 valid precision:  0.87 valid recall:  0.74
Epoch:  8


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  1.0 train precision:  0.99 train recall:  1.0
valid accuracy:  0.98 valid precision:  0.88 valid recall:  0.72
Epoch:  9


  0%|          | 0/13 [00:00<?, ?it/s]

1000
1000
682
train accuracy:  1.0 train precision:  0.99 train recall:  1.0
valid accuracy:  0.97 valid precision:  0.88 valid recall:  0.69


In [62]:
ds_test = JobDataset(df_test, word_dict, max_length)
dl_test = DataLoader(ds_test, batch_size=1000, shuffle=False)

In [157]:
acc_val,prec_val,recall_val = one_pass_acc(model,h, dl_test)

In [156]:
recall_val

0.78

### CNN Model

In [122]:
class Parameters:
    # Preprocessing parameeters
    seq_len: int = 994
    num_words: int = 73210

    # Model parameters
    embedding_size: int = 128
    out_size: int = 32
    stride: int = 2

    # Training parameters
    epochs: int = 10
    batch_size: int = 12
    learning_rate: float = 0.001

In [123]:
params = Parameters()

In [136]:
class CNN(nn.Module):
    def __init__(self,word_dict):
        super(CNN, self).__init__()
        
        self.word_dict = word_dict
        
      # Parameters regarding text preprocessing
        self.seq_len = params.seq_len
        self.num_words = params.num_words
        self.embedding_size = params.embedding_size

        # Dropout definition
        self.dropout = nn.Dropout(0.25)

        # CNN parameters definition
        # Kernel sizes
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 3
        self.kernel_4 = 2

        # Output size for each convolution
        self.out_size = params.out_size
        # Number of strides for each convolution
        self.stride = params.stride

        # Embedding layer definition
        self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)

        # Convolution layers definition
        self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)

        # Max pooling layers definition
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)

        # Fully connected layer definition
        self.fc = nn.Linear(self.in_features_fc(), 1)

    def forward(self, x):
        # Sequence of tokes is filterd through an embedding layer
        x = self.embedding(x)

        # Convolution layer 1 is applied
        x1 = self.conv_1(x)
        x1 = torch.relu(x1)
        x1 = self.pool_1(x1)

        # Convolution layer 2 is applied
        x2 = self.conv_2(x)
        x2 = torch.relu((x2))
        x2 = self.pool_2(x2)

        # Convolution layer 3 is applied
        x3 = self.conv_3(x)
        x3 = torch.relu(x3)
        x3 = self.pool_3(x3)

        # Convolution layer 4 is applied
        x4 = self.conv_4(x)
        x4 = torch.relu(x4)
        x4 = self.pool_4(x4)

        # The output of each convolutional layer is concatenated into a unique vector
        union = torch.cat((x1, x2, x3, x4), 2)
        union = union.reshape(union.size(0), -1)

        # The "flattened" vector is passed through a fully connected layer
        out = self.fc(union)
        # Dropout is applied
        out = self.dropout(out)
        # Activation function is applied
#         out = torch.sigmoid(out)

        return out.squeeze()

    def in_features_fc(self):
        '''Calculates the number of output features after Convolution + Max pooling

        Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1

        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        '''
        # Calcualte size of convolved/pooled features for convolution_1/max_pooling_1 features
        out_conv_1 = ((self.embedding_size - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_conv_1 = math.floor(out_conv_1)
        out_pool_1 = ((out_conv_1 - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_pool_1 = math.floor(out_pool_1)

        # Calcualte size of convolved/pooled features for convolution_2/max_pooling_2 features
        out_conv_2 = ((self.embedding_size - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_conv_2 = math.floor(out_conv_2)
        out_pool_2 = ((out_conv_2 - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_pool_2 = math.floor(out_pool_2)

        # Calcualte size of convolved/pooled features for convolution_3/max_pooling_3 features
        out_conv_3 = ((self.embedding_size - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_conv_3 = math.floor(out_conv_3)
        out_pool_3 = ((out_conv_3 - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_pool_3 = math.floor(out_pool_3)

        # Calcualte size of convolved/pooled features for convolution_4/max_pooling_4 features
        out_conv_4 = ((self.embedding_size - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_conv_4 = math.floor(out_conv_4)
        out_pool_4 = ((out_conv_4 - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_pool_4 = math.floor(out_pool_4)

    # Returns "flattened" vector (input for fully connected layer)
        return (out_pool_1 + out_pool_2 + out_pool_3 + out_pool_4) * self.out_size

In [137]:
def one_pass(model,dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    total_incorrect = 0
    tp = 0
    fp = 0
    fn = 0
    len_ = 0
    for x, y in tqdm(dataloader):
        y_pred = model(x)
        loss = lossFun(y_pred, y.float())
        total_loss += loss.item()

        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        y_pred = torch.sigmoid(y_pred).round().long()
        y = y.long()
        total_incorrect += torch.count_nonzero(y - y_pred).item()
        tp += (y & y_pred).sum()
        fp += (y_pred & (~y)).sum()
        fn = ((~y_pred) & y).sum()
        len_ += x.shape[0]
    
    percent_wrong = total_incorrect / len_

    avg_loss = total_loss / len(dataloader)

    if print_loss == True:
        print(avg_loss)
    
    return np.round(1-percent_wrong,2),np.round((tp/(tp+fp)).numpy(),2),np.round((tp/(tp+fn)).numpy(),2)

def one_pass_acc(model,dataloader):
    model.eval()
    total_incorrect = 0
    total_loss = 0.0
    total_incorrect = 0
    tp = 0
    fp = 0
    fn = 0
    len_ = 0    
    for x, y in dataloader:
        y_pred = torch.sigmoid(model(x)).round().long()
        y = y.long()
        total_incorrect += torch.count_nonzero(y - y_pred).item()
        tp += (y & y_pred).sum()
        fp += (y_pred & (~y)).sum()
        fn = ((~y_pred) & y).sum()
        len_ += x.shape[0]
    
    percent_wrong = total_incorrect / len_

    avg_loss = total_loss / len(dataloader)
    print (avg_loss)
    return np.round(1-percent_wrong,2),np.round((tp/(tp+fp)).numpy(),2),np.round((tp/(tp+fn)).numpy(),2)

In [140]:
lossFun = nn.BCEWithLogitsLoss()
model = CNN(word_dict)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
num_epochs = 10
start = time.time()
for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    acc_train,prec_train,recall_train = one_pass(model,dl_train, optimizer, lossFun,print_loss=True)
    acc_val,prec_val,recall_val = one_pass_acc(model,dl_valid)
    print('train accuracy: ', acc_train,'train precision: ',prec_train,'train recall: ',recall_train)
    print('valid accuracy: ', acc_val,'valid precision: ',prec_val,'valid recall: ',recall_val)
end = time.time()
print (end-start)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/13 [00:00<?, ?it/s]

0.8121737195895269
0.0
train accuracy:  0.9 train precision:  0.06 train recall:  0.63
valid accuracy:  0.95 valid precision:  nan valid recall:  0.0
Epoch:  1


  0%|          | 0/13 [00:00<?, ?it/s]

0.36265923426701474
0.0
train accuracy:  0.95 train precision:  1.0 train recall:  0.11
valid accuracy:  0.95 valid precision:  1.0 valid recall:  0.13
Epoch:  2


  0%|          | 0/13 [00:00<?, ?it/s]

0.24677899135993078
0.0
train accuracy:  0.96 train precision:  0.98 train recall:  0.89
valid accuracy:  0.97 valid precision:  0.96 valid recall:  0.76
Epoch:  3


  0%|          | 0/13 [00:00<?, ?it/s]

0.20322123972269204
0.0
train accuracy:  0.98 train precision:  0.99 train recall:  0.97
valid accuracy:  0.97 valid precision:  1.0 valid recall:  0.79
Epoch:  4


  0%|          | 0/13 [00:00<?, ?it/s]

0.18441756413533136
0.0
train accuracy:  0.98 train precision:  0.99 train recall:  0.97
valid accuracy:  0.97 valid precision:  0.95 valid recall:  0.81
Epoch:  5


  0%|          | 0/13 [00:00<?, ?it/s]

0.180301036972266
0.0
train accuracy:  0.99 train precision:  1.0 train recall:  0.98
valid accuracy:  0.97 valid precision:  0.98 valid recall:  0.81
Epoch:  6


  0%|          | 0/13 [00:00<?, ?it/s]

0.17522832063528207
0.0
train accuracy:  0.99 train precision:  1.0 train recall:  0.98
valid accuracy:  0.97 valid precision:  1.0 valid recall:  0.79
Epoch:  7


  0%|          | 0/13 [00:00<?, ?it/s]

0.17563050297590402
0.0
train accuracy:  0.99 train precision:  1.0 train recall:  0.98
valid accuracy:  0.97 valid precision:  0.98 valid recall:  0.81
Epoch:  8


  0%|          | 0/13 [00:00<?, ?it/s]

0.1734794401205503
0.0
train accuracy:  0.99 train precision:  1.0 train recall:  0.99
valid accuracy:  0.97 valid precision:  1.0 valid recall:  0.81
Epoch:  9


  0%|          | 0/13 [00:00<?, ?it/s]

0.17394971732909864
0.0
train accuracy:  0.99 train precision:  1.0 train recall:  0.98
valid accuracy:  0.97 valid precision:  1.0 valid recall:  0.81
531.6453187465668


In [141]:
one_pass_acc(model,dl_test)

0.0


(0.97, 0.99, 0.76)