In [1]:
import pandas as pd
import contractions
import re
import stopwords
import ast

In [2]:
data = pd.read_excel("../Data/fake_job_postings.xlsx")
data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


There is noise present in the textual columns, which do not provide useful information and will hinder data pre-precessing. We will carry out the following data cleaning steps:

- Expand Contractions
- Remove URL
- Remove Non-ASCI Characters
- Remove Special Characters
- Remove Extra Spaces
- Convert to Lower Case
- Remove Duplicate Punctuations

In [3]:
textual_columns = ["company_profile", "description", "requirements", "benefits", "department"]

data[textual_columns] = data[textual_columns].astype(str)
cleaned_dataset = data.copy()

In [4]:
def dataset_cleaning(column_name):
  
    # generate noise-free cleaned_dataset
    print('removing noise')

    # remove non-ASCI characters - should be done first
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub(r'[^\x00-\x7f]', ' ', x))

    # expand contractions
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: contractions.fix(x))

    # remove URL
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', ' ', x))

    # remove special characters
    regex_special = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        u"\ufe0f"  # dingbats
        "]+", flags = re.UNICODE)
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: regex_special.sub(' ', x))

    # remove extra spaces
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub('\r\n', ' ', x))
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub('\n', ' ', x))
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub(' +', ' ', x))

    # convert to lower case
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: x.lower())

    # remove duplicated punctuations
    regex_punctuation = re.compile(r'''([!#$%&'()*+,./:;<=>?@[\]^_`{|}~-])[!"#$%&'()*+,./:;<=>?@[\]^_`{|}~-]+''')
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: regex_punctuation.sub(r'\1', x))

In [5]:
def clean_scraped_text(column_name):

    # Remove URLs in the format #url_...
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub(r'#url_[a-f0-9]+#', '', x))

    # Replace punctuation with spaces
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    # Remove newline characters
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: x.replace('\n', ' '))

    # Remove extra spaces
    cleaned_dataset[column_name] = cleaned_dataset[column_name].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [6]:
for col in textual_columns:
    dataset_cleaning(col)
    clean_scraped_text(col)

removing noise
removing noise
removing noise
removing noise
removing noise


In [7]:
cleaned_dataset

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",marketing,,we are food52 and we have created a groundbrea...,food52 a fast growing james beard award winnin...,experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",success,,90 seconds the worlds cloud video production s...,organised focused vibrant awesome do you have ...,what we expect from you your key responsibilit...,what you will get from usthrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,valor services provides workforce solutions th...,our client located in houston is actively seek...,implement pre commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",sales,,our passion for improving quality of life thro...,the company esri environmental systems researc...,education bachelor s or master s in gis busine...,our culture is anything but corporate we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,spotsource solutions llc is a global human cap...,job title itemization review managerlocation f...,qualifications right now license in the state ...,full benefits offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",sales,,vend is looking for some awesome new talent to...,just in case this is the first time you ve vis...,to ace this role you will eat comprehensive st...,what can you expect from us we have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",accounting,,weblinc is the e commerce platform and service...,the payroll accountant will focus primarily on...,b a or b s in accounting desire to have fun wh...,health amp wellnessmedical planprescription dr...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,we provide full time permanent positions for m...,experienced project cost control staff enginee...,at least 12 years professional experience abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,nemsia studios is looking for an experienced v...,1 must be fluent in the latest versions of cor...,competitive salary compensation will be based ...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [8]:
cleaned_dataset.to_excel("../Data/gerald_data_cleaned.xlsx")