In [16]:
import torch
import re
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
import numpy as np
import pandas as pd
from copy import deepcopy

SMALL_TEXT_VECTOR_SIZE = 10
MEDIUM_TEXT_VECTOR_SIZE = 25
LARGE_TEXT_VECTOR_SIZE = 50


class JobPosting:
	def __init__(self, title="", location="", department="", salary_range="", company_profile="",
				 description="", requirement="", benefit="", telecommuting="", has_company_logo="",
				 has_question="", employment_type="", required_experience="",
				 required_education="", industry="", function="", fraudulent="job_pridiction"):
		self.title = title if isinstance(title, str) else ""
		self.location = location if isinstance(location, str) else ""
		self.department = department if isinstance(department, str) else ""
		self.salary_range = [salary_range if isinstance(salary_range, str) else ""]
		self.company_profile = company_profile if isinstance(company_profile, str) else ""
		self.description = description if isinstance(description, str) else ""
		self.requirement = requirement if isinstance(requirement, str) else ""
		self.benefit = benefit if isinstance(benefit, str) else ""
		self.telecommuting = telecommuting if isinstance(telecommuting, str) else ""
		self.has_company_logo = has_company_logo if str(isinstance(has_company_logo, int)) else ""
		self.has_question = has_question if isinstance(has_question, str) else ""
		self.employment_type = employment_type if isinstance(employment_type, str) else ""
		self.required_experience = required_experience if isinstance(required_experience, str) else ""
		self.required_education = required_education if isinstance(required_education, str) else ""
		self.industry = industry if isinstance(industry, str) else ""
		self.function = function if isinstance(function, str) else ""
		self.fraudulent = fraudulent if isinstance(fraudulent, str) else ""

	def get_data_list(self):
		return [*self.title, *self.location, *self.department, *self.salary_range, *self.company_profile,
				*self.description, *self.requirement, *self.benefit, self.telecommuting, self.has_company_logo,
				self.has_company_logo, self.has_question, *self.employment_type, *self.required_experience,
				*self.required_education, *self.industry, *self.function]

	def get_target_list(self):
		if int(self.fraudulent) == 0:
			return [1.0, 0.0]
		else:
			return [0.0, 1.0]

	def get_target(self):
		return int(self.fraudulent)

	def get_data_tensor(self):
		return torch.tensor(self.get_data_list())


class JobPostingsDataset(torch.utils.data.Dataset):
	def __init__(self, job_postings_list=[]):
		self.job_postings_list = job_postings_list
		self.vectorized_job_postings_dict = {}

	def __len__(self):
		return len(self.job_postings_list)

	def prepare_all_text_vectorizers(self):
		self.title_vectorizer = TfidfVectorizer(max_features=MEDIUM_TEXT_VECTOR_SIZE)
		all_titles_list = [self.preprocess_text(job_posting.title) for job_posting in self.job_postings_list]
		self.title_vectorizer.fit_transform(all_titles_list)
		
		self.location_vectorizer = TfidfVectorizer(max_features=MEDIUM_TEXT_VECTOR_SIZE)
		all_locations_list = [self.preprocess_text(job_posting.location) for job_posting in self.job_postings_list]
		self.location_vectorizer.fit_transform(all_locations_list)
		
		self.department_vectorizer = TfidfVectorizer(max_features=SMALL_TEXT_VECTOR_SIZE)
		all_departments_list = [self.preprocess_text(job_posting.department) for job_posting in self.job_postings_list]
		self.department_vectorizer.fit_transform(all_departments_list)

		self.company_profile_vectorizer = TfidfVectorizer(max_features=LARGE_TEXT_VECTOR_SIZE)
		all_company_profiles_list = [self.preprocess_text(job_posting.company_profile) for job_posting in self.job_postings_list]
		self.company_profile_vectorizer.fit_transform(all_company_profiles_list)

		self.description_vectorizer = TfidfVectorizer(max_features=LARGE_TEXT_VECTOR_SIZE)
		all_descriptions_list = [self.preprocess_text(job_posting.description) for job_posting in self.job_postings_list]
		self.description_vectorizer.fit_transform(all_descriptions_list)

		self.requirement_vectorizer = TfidfVectorizer(max_features=LARGE_TEXT_VECTOR_SIZE)
		all_requirements_list = [self.preprocess_text(job_posting.requirement) for job_posting in self.job_postings_list]
		self.requirement_vectorizer.fit_transform(all_requirements_list)

		self.benefit_vectorizer = TfidfVectorizer(max_features=LARGE_TEXT_VECTOR_SIZE)
		all_benefits_list = [self.preprocess_text(job_posting.benefit) for job_posting in self.job_postings_list]
		self.benefit_vectorizer.fit_transform(all_benefits_list)

		self.employment_type_vectorizer = TfidfVectorizer(max_features=SMALL_TEXT_VECTOR_SIZE)
		all_employment_types_list = [self.preprocess_text(job_posting.employment_type) for job_posting in self.job_postings_list]
		self.employment_type_vectorizer.fit_transform(all_employment_types_list)

		self.required_experience_vectorizer = TfidfVectorizer(max_features=SMALL_TEXT_VECTOR_SIZE)
		all_required_experiences_list = [self.preprocess_text(job_posting.required_experience) for job_posting in self.job_postings_list]
		self.required_experience_vectorizer.fit_transform(all_required_experiences_list)

		self.required_education_vectorizer = TfidfVectorizer(max_features=SMALL_TEXT_VECTOR_SIZE)
		all_required_educations_list = [self.preprocess_text(job_posting.required_education) for job_posting in self.job_postings_list]
		self.required_education_vectorizer.fit_transform(all_required_educations_list)

		self.industry_vectorizer = TfidfVectorizer(max_features=MEDIUM_TEXT_VECTOR_SIZE)
		all_industries_list = [self.preprocess_text(job_posting.industry) for job_posting in self.job_postings_list]
		self.industry_vectorizer.fit_transform(all_industries_list)

		self.function_vectorizer = TfidfVectorizer(max_features=MEDIUM_TEXT_VECTOR_SIZE)
		all_functions_list = [self.preprocess_text(job_posting.function) for job_posting in self.job_postings_list]
		self.function_vectorizer.fit_transform(all_functions_list)

	@staticmethod
	def preprocess_text(text):
		text = re.sub("<[^>]*>", "", text)
		symbols = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
		text = (re.sub("[\W]+", " ", text.lower()) + " ".join(symbols).replace("-", ""))
		return text

	def __getitem__(self, index):
		if index in self.vectorized_job_postings_dict:
			return self.vectorized_job_postings_dict[index]
		else:
			vectorized_job_posting = deepcopy(self.job_postings_list[index])
			vectorized_job_posting.title = self.title_vectorizer.transform([vectorized_job_posting.title]).toarray()[0]
			vectorized_job_posting.location = self.location_vectorizer.transform([vectorized_job_posting.location]).toarray()[0]
			vectorized_job_posting.department = self.department_vectorizer.transform([vectorized_job_posting.department]).toarray()[0]
			vectorized_job_posting.company_profile = self.company_profile_vectorizer.transform([vectorized_job_posting.company_profile]).toarray()[0]
			vectorized_job_posting.description = self.description_vectorizer.transform([vectorized_job_posting.description]).toarray()[0]
			vectorized_job_posting.requirement = self.requirement_vectorizer.transform([vectorized_job_posting.requirement]).toarray()[0]
			vectorized_job_posting.benefit = self.benefit_vectorizer.transform([vectorized_job_posting.benefit]).toarray()[0]
			vectorized_job_posting.employment_type = self.employment_type_vectorizer.transform([vectorized_job_posting.employment_type]).toarray()[0]
			vectorized_job_posting.required_experience = self.required_experience_vectorizer.transform([vectorized_job_posting.required_experience]).toarray()[0]
			vectorized_job_posting.required_education = self.required_education_vectorizer.transform([vectorized_job_posting.required_education]).toarray()[0]
			vectorized_job_posting.industry = self.industry_vectorizer.transform([vectorized_job_posting.industry]).toarray()[0]
			vectorized_job_posting.function = self.function_vectorizer.transform([vectorized_job_posting.function]).toarray()[0]
			if len(list(vectorized_job_posting.salary_range[0].split("-"))) == 2:
				try:
					vectorized_job_posting.salary_range = tuple(map(int, vectorized_job_posting.salary_range[0].split("-")))
				except:
					vectorized_job_posting.salary_range = (0, 0)
			else:
				vectorized_job_posting.salary_range = (0, 0)
			vectorized_job_posting.telecommuting = int(vectorized_job_posting.telecommuting)
			vectorized_job_posting.has_company_logo = int(vectorized_job_posting.has_company_logo)
			vectorized_job_posting.has_question = int(vectorized_job_posting.has_question)
			vectorized_job_posting.fraudulent = int(vectorized_job_posting.fraudulent)
			self.vectorized_job_postings_dict[index] = vectorized_job_posting
			return self.vectorized_job_postings_dict[index]

In [20]:
import torch
from torch import nn, optim
from torch.nn import functional as F

NETWORK_INPUT_SIZE = 342
NETWORK_OUTPUT_SIZE = 2


class Network(nn.Module):
	def __init__(self, input_size=NETWORK_INPUT_SIZE, output_size=NETWORK_OUTPUT_SIZE):
		super(Network, self).__init__()
		self.fc1 = nn.Linear(input_size, 256)
		self.fc2 = nn.Linear(256, 128)
		self.fc3 = nn.Linear(128, 64)
		self.fc4 = nn.Linear(64, 32)
		self.fc5 = nn.Linear(32, 16)
		self.fc6 = nn.Linear(16, 8)
		self.fc7 = nn.Linear(8, 4)
		self.fc8 = nn.Linear(4, output_size)

	def forward(self, x):
		x = self.fc1(x)
		x = F.relu(x)
		x = self.fc2(x)
		x = F.relu(x)
		x = self.fc3(x)
		x = F.relu(x)
		x = self.fc4(x)
		x = F.relu(x)
		x = self.fc5(x)
		x = F.relu(x)
		x = self.fc6(x)
		x = F.relu(x)
		x = self.fc7(x)
		x = F.relu(x)
		x = self.fc8(x)
		return x




In [21]:
def normal_init(m, mean, std):
	if isinstance(m, nn.Linear):
		m.weight.data.normal_(mean, std)
		m.bias.data.zero_()

In [24]:
import torch
from torch import nn, optim
import pandas as pd
import random
from pycm import ConfusionMatrix
from skorch import NeuralNetClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score


CSV_FILENAME = "dataset.csv"
CSV_FILE_DELIMITER = ","
EPOCHS_COUNT = 200
LEARNING_RATE = 0.0001
BATCH_SIZE = 32
KFOLD_PARTITIONS_COUNT = 5
FAKE_TO_REAL_RATIO = 1.0/20.0
TRAINING_PARALLEL_JOBS_COUNT = -1 # -1 means maximum


def prepare_dataset():
	global job_postings
	global all_job_posting_data
	global all_job_posting_targets
	random.shuffle(job_postings)
	job_postings = JobPostingsDataset(job_postings)
	job_postings.prepare_all_text_vectorizers()
	all_job_posting_data = torch.tensor([job_posting.get_data_list() for job_posting in job_postings]).float().to(device)
	all_job_posting_targets = torch.tensor([job_posting.get_target_list() for job_posting in job_postings]).float().to(device)


def read_job_postings_data_from_csv(csv_filename=CSV_FILENAME, csv_file_delimiter=CSV_FILE_DELIMITER):
	global job_postings
	job_postings = []
	csv_contents = pd.read_csv(csv_filename, delimiter=csv_file_delimiter, dtype=str)
	rows = len(csv_contents)
	cols = len(csv_contents.iloc[0])
	for i in range(rows):
		data = csv_contents.iloc[i]
		job_posting = JobPosting(data["title"], data["location"], data["department"], data["salary_range"], data["company_profile"], data["description"], data["requirements"],
								 data["benefits"], data["telecommuting"], data["has_company_logo"], data["has_questions"], data["employment_type"], data["required_experience"],
								 data["required_education"], data["industry"], data["function"], data["fraudulent"])
		job_postings.append(job_posting)


def initialize_network():
	global device
	global model
	global optimizer
	global criterion
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	model = Network().to(device)
	criterion = torch.nn.BCEWithLogitsLoss
	optimizer = torch.optim.Adam
	for m in model._modules:
		normal_init(model._modules[m], 0, 1)


def print_confusion_matrix(actual_labels, predicted_labels):
	cm = ConfusionMatrix(actual_labels, predicted_labels)
	print(cm)


if __name__ == "__main__":
	print("Initializing network")
	initialize_network()
	print("Reading CSV file")
	read_job_postings_data_from_csv()
	print("Preparing dataset")
	prepare_dataset()
	print("Creating classifier")
	classifier = NeuralNetClassifier(Network, max_epochs=EPOCHS_COUNT, lr=LEARNING_RATE,
									 train_split=None, criterion=criterion, optimizer=optimizer,
									 batch_size=BATCH_SIZE, criterion__pos_weight=torch.tensor([FAKE_TO_REAL_RATIO, 1.0]))
	print("Training")
	predictions = cross_val_predict(classifier, all_job_posting_data, all_job_posting_targets,
									cv=KFOLD_PARTITIONS_COUNT, verbose=1, n_jobs=TRAINING_PARALLEL_JOBS_COUNT)
	predicted_labels = torch.from_numpy(predictions).tolist()
	actual_labels = torch.argmax(all_job_posting_targets, dim=1).tolist()
	print_confusion_matrix(actual_labels, predicted_labels)

Initializing network
Reading CSV file
Preparing dataset
Creating classifier
Training


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  8.9min remaining: 13.4min


Predict      0            1            [0, 0]       [0, 1]       [1, 0]       [1, 1]       
Actual
0            0            0            1418         79           15516        1            

1            0            0            131          360          375          0            

[0, 0]       0            0            0            0            0            0            

[0, 1]       0            0            0            0            0            0            

[1, 0]       0            0            0            0            0            0            

[1, 1]       0            0            0            0            0            0            





Overall Statistics : 

95% CI                                                            (0.0,0.0)
ACC Macro                                                         0.66667
ARI                                                               0.27879
AUNP                                                              None
AUNU                 

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 17.5min finished
