In [161]:
!pip install pandas

!pip install -U scikit-learn

!pip install pyenchant



In [162]:
import os
import sklearn
import pandas as pd
import csv
import enchant
import language_tool_python
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score


In [106]:
'''Loading Data into a DataFrame'''
jobsDF = pd.read_csv("archive/fake_job_postings.csv")
jobsDF.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [318]:
#List of urgent phrases we are testing on
urgency_phrases = ['urgent', 'immediate start','once in a lifetime','hurry up',
                   'last chance','don\'t miss out','limited time','quick response',
                   'immediate action','rapid response','must apply','quick hire'
                  ]

#List of jargon we are testing on
jargon = ['handsome', 'dude', 'awesome', 'chill','lame','stoked','cool','hyped',
          'massive','insane','sick','epic','stupid','dumb','buddy','totally','weird',
          'meh','gnarly','bummer','funky','vibes','chitchat','sketchy'          
         ]

#List of "get rich" phrases we are testing on
riches = ['get rich quick','millionare','zero effort','instant wealth','guaranteed profit',
         'easy money','passive income','huge returns','risk free','quick cash','get paid daily',
         'no risks']

In [348]:
'''
Adding feature columns to the Dateframe
'''

#Token Length Column
def token_length(df):
    df['description_length'] = df['description'].apply(lambda x: len(str(x).split()))

#Description token_ratio
def token_ratio(df):
    df['token_ratio'] = df['description'].apply(lambda x: len(set(str(x).split())) / len(str(x).split()) if len(str(x).split()) > 0 else 0)

#Spelling Error Column
def spelling_error_count(df):
    d = enchant.Dict('en_US')
    spell_errors_count = []
    for description in df['description']:
        desc_words = str(description).split()
        errors_in_desc = sum(not d.check(word) for word in desc_words)
        errors_count.append(errors_in_desc)
    return errors_count

#Check for specific urgent words
def check_urgent(desc):
    return sum(phrase in str(desc).lower() for phrase in urgency_phrases)

#Check for specific urgent jargon
def check_jargon(desc):
    return sum(phrase in str(desc).lower() for phrase in jargon)

#Check for specific promise words
def check_promises(desc):
    return sum(phrase in str(desc).lower() for phrase in riches)

#
def capital_percentage_feature_new(dataset): 
    final = [] 
    for ex in dataset ['description']: 
        description = str(ex). split() 
        total_characters = len(description) 
        capital_count = sum(1 for char in description if char.isupper()) 
        if total_characters == 0: 
            final.append(0.0) 
        else: 
            final.append((capital_count / total_characters) * 100) 
    return final


def repeat_feature(dataset): 
    features = [] 
    for example in dataset['description']: 
        text = str(example) 
        repetitions = {} 
        for i in range(len(text) - 1): 
            pair = text[i:i+2] 
            if pair in repetitions: 
                repetitions[pair] += 1 
            else: repetitions[pair] = 1 
        max_repetitions = max(repetitions. values()) if repetitions else 0 
        features.append(max_repetitions) 
    return features


In [122]:
#All the implementations of the features outlined above
token_length(jobsDF)
jobsDF['spelling_errors'] = spelling_error_count(jobsDF)
jobsDF['urgent'] = jobsDF['description'].apply(check_urgent).astype(int)
jobsDF['jargon'] = jobsDF['description'].apply(check_jargon).astype(int)
jobsDF['promises'] = jobsDF['description'].apply(check_promises).astype(int)
token_ratio(jobsDF)
jobsDF['capitals_percentage'] = capital_percentage_feature_new(jobsDF)
jobsDF['repeated_word_count'] = repeat_feature(jobsDF)

In [370]:
jobsDF.head(2)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,function,fraudulent,description_length,spelling_errors,urgent,jargon,promises,token_ratio,capitals_percentage,repeated_word_count
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,...,Marketing,0,124,25,0,0,0,0.741935,0.806452,31
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,...,Customer Service,0,315,38,0,2,0,0.634921,0.31746,43


In [361]:
'''Splitting the data into train and test sets
train_test_split randomly samples jobsDF and splits it into two new data sets 
test_size = 0.10 because we want 10% of our data to be for testing
'''
train, test = train_test_split(jobsDF, test_size=0.10)


In [362]:
'''
splitting again to get our validation set
'''
train, validation = train_test_split(train, test_size=0.111)
print(train.shape[0]) #checking that we have a 80-10-10 split for train valid and test sets
print(test.shape[0])
print(validation.shape[0])

14305
1788
1787


In [368]:
#Check to see if there are enough fradulent records included in the validation set
validation['fraudulent'].value_counts()

fraudulent
0    1701
1      86
Name: count, dtype: int64

In [363]:
'''
We need to have our labels on their own for the clf.fit function below
we do this by creating two new dataframes labels and feature
feature is just our other columns which are our defacto features rn
'''
feature_matrix = ['has_company_logo','telecommuting','has_questions',
                  'description_length','spelling_errors','urgent','jargon',
                  'promises','token_ratio','capitals_percentage','repeated_word_count']

labels = train["fraudulent"] 
feature = train[feature_matrix]
#feature = train.drop(columns=['fraudulent'])

In [364]:
'''
we make a clf object which is a randomforestclassifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

'''
clf = RandomForestClassifier(max_depth=60, random_state=0)
clf.fit(feature,labels)
clf.score(feature,labels)

0.999930094372597

In [365]:
'''
We do the same thing for our validation data
'''
labels1 = validation["fraudulent"]
feature1 = validation[feature_matrix]
clf.score(feature1,labels1)

0.9759373251259094

In [366]:
'''
Our score is already super high because there its only measuring accuracy and there are way more non-fraudulent 
jobs than there are fraudulent ones so we need to check the f1 score instead.
'''
# Create our predictions 
prediction = clf.predict(feature1) 
# Create confusion matrix from sklearn.metrics 

confusion_matrix(labels1, prediction)
# Display accuracy score 
accuracy_score(labels1, prediction)
# Display F1 score 
score = f1_score(labels1,prediction)
print(score)

0.6861313868613139
