In [1]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [3]:
df = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Job Postings NLP Project/fake_job_postings.csv")

Removing rows with erroneous entries


In [4]:
df['description'][14127]

'v\\:* {behavior:url(#default#VML);}o\\:* {behavior:url(#default#VML);}w\\:* {behavior:url(#default#VML);}.shape {behavior:url(#default#VML);}   800x600    Normal  0          false  false  false    EN-US  X-NONE  X-NONE                                 MicrosoftInternetExplorer4                                                                                                                                                                                                                                                                                                                            /* Style Definitions */ #URL_22932ad710cc8bab5012d10e1dc768a71064c391fef21e0fceddb0e7a66f97b6#{mso-style-name:"Table Normal";mso-tstyle-rowband-size:0;mso-tstyle-colband-size:0;mso-style-noshow:yes;mso-style-priority:99;mso-style-parent:"";mso-padding-alt:0in 5.4pt 0in 5.4pt;mso-para-margin:0in;mso-para-margin-bottom:.0001pt;mso-pagination:widow-orphan;font-size:10.0pt;font-family:"Times New Roman","ser

In [5]:
df['description'][1574]

'DEPARTMENT:\xa0 \xa0\xa0\xa0\xa0MaintenanceREPORTS TO:\xa0\xa0\xa0\xa0\xa0\xa0 Maintenance ManagerLOCATION:\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Niagara Falls, NYPOSITIONS:\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 Maintenance MechanicAbout us: Niacet is a leading producer of organic salts, including propionates and acetates, serving the Food, Pharmaceutical and Technical industries. With two longstanding and fully automated manufacturing sites, located in Niagara Falls, NY USA, and Tiel, The Netherlands, Niacet offers world-class quality products to a global market.Our products fill vital needs in a broad range of applications that are essential to everyday life including food preservation, antibiotic formulation, dialysis treatment, energy production, and more.Maintenance Mechanic Position:Niagara Falls chemical manufacturer is looking for experienced Maintenance mechanic.\xa0General Job duties include, but are not limited to:--Knowledge of process piping, autovalves and dry packaging 

In [6]:
df['description'][1230]

'We currently have a vacancy for \xa0JAVA/J2EE developer fluent in English, to offer his/her services as an expert who will be based in Brussels. The work will be carried out either in the company’s premises or on site at the customer premises. In the context of the first assignment, the successful candidate will be integrated into the development team of the company that will closely cooperate with a major client’s IT team on site.\xa0Your tasks:Analysis of business processes, documentation and provision of recommendations for the following development phases;Diagnosis of software problems, provision of assistance in deploying and configuring systems, application and software modules;Prototyping and development following the design guidelines;Contribute to the continuous improvement of the GUI of Web based applications and resolve operational issues;Advise the Line Manager and the hierarchy on technological evolutions in Oracle WebLogic domain;Ensure proper project management and moni

In [7]:
df.drop([14127, 1574, 1230], inplace=True)

Examining the data


In [8]:
fake=df['fraudulent'].sum()
real=len(df)-fake
print("real: {0}, fake: {1}".format(fake, real))

real: 866, fake: 17011


The dataset is very unbalanced. We will attempt to balance it by having equal #s of each real and fake postings.


In [9]:
df = df.sample(frac=1).reset_index(drop=True)

In [10]:
df=df.set_index('job_id')

In [11]:
dropped = 0

for index, row in df.iterrows():
  if (df['fraudulent'][index] == 0) and (dropped < real-fake):
    df.drop(index, inplace=True)
    dropped+=1

In [12]:
fake=df['fraudulent'].sum()
real=len(df)-fake
print("real: {0}, fake: {1}".format(fake, real))

real: 866, fake: 866


Clean the text fields


In [13]:
import re

def text_tokenizer(text):
  if not (text == "" or pd.isnull(text)): 
    text = re.sub(r'URL_[A-Za-z0-9]+', ' ', text)
    return re.sub(r'[^A-Za-z0-9]+', ' ', text).lower().strip()

In [14]:
df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [15]:
cols = ['title', 'location', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function']

for col in cols:
  for index, row in df.iterrows():
    cleaned = text_tokenizer(df[col][index])
    df[col] = df[col].replace(df[col][index],cleaned)

In [16]:
df = df.sample(frac=1).reset_index(drop=True)

In [17]:
df['text'] = df['title'] + ' ' + df['location'] + ' ' + df['department'] + ' ' + df['company_profile'] + ' ' + df['description'] + ' ' + df['requirements'] + ' ' + df['benefits'] + ' ' + df['employment_type'] + ' ' + df['required_experience']+ ' ' + df['required_education'] + ' ' + df['industry'] + ' ' + df['function']

In [18]:
df=df.drop(['location','department','company_profile','description','requirements','benefits','employment_type', 'required_experience','required_education', 'industry', 'function'], 1)

Splitting into test, train, val


In [19]:
print("total num of rows: {0}, train size: {1}, test size: {2}".format(len(df), len(df)*0.8, len(df)*0.2))

total num of rows: 1732, train size: 1385.6000000000001, test size: 346.40000000000003


In [20]:
train = df[:1385]
train = train.sample(frac=1).reset_index(drop=True)
test = df[1386:]
test = test.sample(frac=1).reset_index(drop=True)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(train['text'].values.astype('U'))
test_vectors = vectorizer.transform(test['text'].values.astype('U'))

In [22]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train['fraudulent'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(test['fraudulent'], prediction_linear, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

Training time: 2.287817s; Prediction time: 0.516347s
positive:  {'precision': 0.9807692307692307, 'recall': 0.95625, 'f1-score': 0.9683544303797469, 'support': 160}
negative:  {'precision': 0.9631578947368421, 'recall': 0.9838709677419355, 'f1-score': 0.9734042553191489, 'support': 186}


In [23]:
import numpy as np
np.matrix(classifier_linear.predict(test_vectors) - test['fraudulent']).sum()

-4

In [24]:
fake_entry = "Part-Time.Online Data entry operator needed. Work remotely from your location entering data for various companies and hospitals. *The job will only take 1hr of your time daily and it's between Monday and Friday or on weekend if that works best for you. *The job can be done in your free time. Send your resume and all inquiries to: bronsonsmaxwells2@gmail DOT com Skills needed Good in Excel Accuracy Basic computer knowledge Part-time Hours: 7 weekly average"
fake_entry = text_tokenizer(fake_entry)
review_vector = vectorizer.transform([fake_entry])

#review_vector = vectorizer.transform([review]) # vectorizing
print(classifier_linear.predict(review_vector))

[0]


In [25]:
print("fraudulent: {0}, real: {1}".format(train['fraudulent'].sum(), len(train)-train['fraudulent'].sum()))

fraudulent: 705, real: 680


In [30]:
from sklearn.externals import joblib
joblib.dump(classifier_linear, '/content/gdrive/My Drive/Colab Notebooks/Job Postings NLP Project/model.pkl')

['/content/gdrive/My Drive/Colab Notebooks/Job Postings NLP Project/model.pkl']