# Data Cleaning and Preparing

## Step 1: initialization 

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np

import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
#initialize the lematizer and stemmer, which will be used later.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

#to be used in the cleaning function
nltk.download('stopwords')
nltk.download('punkt')

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /Users/bill/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/bill/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Step 2: Prepare the Data
The dataset, EMSCAD, is downloaded from Kaggle, the link is here:
https://www.kaggle.com/datasets/amruthjithrajvr/recruitment-scam

In [3]:
#load the EMSCAD dataset
df = pd.read_csv('/Users/bill/Desktop/project_code/DataSet.csv')
print("The shape of the dataframe is",df.shape) 

The shape of the dataframe is (17880, 18)


### Get the dataframe for only the Job description column.

In [4]:
df_jd = pd.DataFrame(df['description'])
print("The shape of the dataframe is", df_jd.shape) 
print("Display one Job description sample:\n") 
print(df_jd['description'][0]) 

The shape of the dataframe is (17880, 1)
Display one Job description sample:

<p>Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.</p>
<ul>
<li>Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems</li>
<li>Researching blogs and websites for the Provisions by Food52 Affiliate Program</li>
<li>Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries</li>
<li>Supporting with PR &amp; Events when needed</li>
<li>Helping with office administrative work, such as filing, mailing, and preparing for meetings</li>
<li>Working with developers to document bugs and suggest improvement

### To display the Job description sample in a more readable way.

In [5]:
from IPython.core.display import display, HTML
display(HTML(df_jd['description'][0]))

### Prepare the cleaning function

In [6]:
# remove the HTML tags
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)


def clean(text):
    
    # remove the HTML tags
    text = striphtml(text)
    
    # Lowercase text
    text = text.lower()
    
    # Remove punctuation
    text = text.replace(':', ' ')
    text = text.replace('\'', ' ')
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    # Remove extra spaces from text
    text = " ".join(text.split())
    
    # Remove stopwords function
    # Tokenize : get a list of tokens
    stop_words = set(stopwords.words("english")) # nltk.download('stopwords') - this is done at the begining
    word_tokens = word_tokenize(text)
    text = [word for word in word_tokens if word not in stop_words]
    
    return text

In [7]:
clean(df_jd['description'][0])

['food52',
 'fastgrowing',
 'james',
 'beard',
 'awardwinning',
 'online',
 'food',
 'community',
 'crowdsourced',
 'curated',
 'recipe',
 'hub',
 'currently',
 'interviewing',
 'full',
 'parttime',
 'unpaid',
 'interns',
 'work',
 'small',
 'team',
 'editors',
 'executives',
 'developers',
 'new',
 'york',
 'city',
 'headquarters',
 'reproducing',
 'andor',
 'repackaging',
 'existing',
 'food52',
 'content',
 'number',
 'partner',
 'sites',
 'huffington',
 'post',
 'yahoo',
 'buzzfeed',
 'various',
 'content',
 'management',
 'systems',
 'researching',
 'blogs',
 'websites',
 'provisions',
 'food52',
 'affiliate',
 'program',
 'assisting',
 'daytoday',
 'affiliate',
 'program',
 'support',
 'screening',
 'affiliates',
 'assisting',
 'affiliate',
 'inquiries',
 'supporting',
 'pr',
 'amp',
 'events',
 'needed',
 'helping',
 'office',
 'administrative',
 'work',
 'filing',
 'mailing',
 'preparing',
 'meetings',
 'working',
 'developers',
 'document',
 'bugs',
 'suggest',
 'improvements'

##  For the purpose of demonstration, I did a cleaning process step by step

In [8]:
# define a new variable "text_jd" that will only be used here in the demonstration part
text_jd = df_jd['description'][0]
print(text_jd)

<p>Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.</p>
<ul>
<li>Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems</li>
<li>Researching blogs and websites for the Provisions by Food52 Affiliate Program</li>
<li>Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries</li>
<li>Supporting with PR &amp; Events when needed</li>
<li>Helping with office administrative work, such as filing, mailing, and preparing for meetings</li>
<li>Working with developers to document bugs and suggest improvements to the site</li>
<li>Supporting the marketing and executive staff</li>
</u

In [9]:
# remove the HTML tags
text_jd = striphtml(text_jd)
print(text_jd)

Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.

Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems
Researching blogs and websites for the Provisions by Food52 Affiliate Program
Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries
Supporting with PR &amp; Events when needed
Helping with office administrative work, such as filing, mailing, and preparing for meetings
Working with developers to document bugs and suggest improvements to the site
Supporting the marketing and executive staff



In [10]:
# Lowercase text
text_jd = text_jd.lower()
print(text_jd)

food52, a fast-growing, james beard award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its new york city headquarters.

reproducing and/or repackaging existing food52 content for a number of partner sites, such as huffington post, yahoo, buzzfeed, and more in their various content management systems
researching blogs and websites for the provisions by food52 affiliate program
assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries
supporting with pr &amp; events when needed
helping with office administrative work, such as filing, mailing, and preparing for meetings
working with developers to document bugs and suggest improvements to the site
supporting the marketing and executive staff



In [11]:
# Remove ':', '\', and punctuation
text_jd = text_jd.replace(':', ' ')
text_jd = text_jd.replace('\'', ' ')
translator = str.maketrans('', '', string.punctuation)
text_jd = text_jd.translate(translator)
print(text_jd)

food52 a fastgrowing james beard awardwinning online food community and crowdsourced and curated recipe hub is currently interviewing full and parttime unpaid interns to work in a small team of editors executives and developers in its new york city headquarters

reproducing andor repackaging existing food52 content for a number of partner sites such as huffington post yahoo buzzfeed and more in their various content management systems
researching blogs and websites for the provisions by food52 affiliate program
assisting in daytoday affiliate program support such as screening affiliates and assisting in any affiliate inquiries
supporting with pr amp events when needed
helping with office administrative work such as filing mailing and preparing for meetings
working with developers to document bugs and suggest improvements to the site
supporting the marketing and executive staff



In [12]:
# Remove extra spaces from text
text_jd = " ".join(text_jd.split())
print(text_jd)

food52 a fastgrowing james beard awardwinning online food community and crowdsourced and curated recipe hub is currently interviewing full and parttime unpaid interns to work in a small team of editors executives and developers in its new york city headquarters reproducing andor repackaging existing food52 content for a number of partner sites such as huffington post yahoo buzzfeed and more in their various content management systems researching blogs and websites for the provisions by food52 affiliate program assisting in daytoday affiliate program support such as screening affiliates and assisting in any affiliate inquiries supporting with pr amp events when needed helping with office administrative work such as filing mailing and preparing for meetings working with developers to document bugs and suggest improvements to the site supporting the marketing and executive staff


In [13]:
# set stopwords 
stop_words = set(stopwords.words("english")) # nltk.download('stopwords') - this is done at the begining
print(stop_words)

{"won't", 'shouldn', 'then', 'its', 'each', 'with', 'ain', 'them', "isn't", 'weren', 'yourselves', 'between', 'it', 'these', 'who', "wouldn't", 'as', 'having', 'my', 'that', 'those', 'have', "didn't", 'be', "shouldn't", 'so', "haven't", 'in', 'now', 'when', 'by', 'ourselves', 'aren', "don't", 'off', 'of', 'been', 'll', 'wasn', 'herself', 'hadn', 'down', "doesn't", 'through', 'most', 'over', 'd', 'can', 'same', 'needn', 'his', 'the', "you'll", 'yours', 'few', 's', "that'll", 'doesn', 'him', "shan't", 'again', 'doing', 'own', 'ma', 'our', 'did', 'to', 'than', 'before', 'about', 'what', "wasn't", 'above', 'out', "you're", 'me', 'further', 'too', 'she', 'until', 'theirs', 'only', "hadn't", 'but', 'such', 'some', 'was', 'hers', 'and', 'your', 'is', 'will', 'up', 'y', 'i', 'if', "hasn't", 'am', 'ours', 'into', 'you', 'mightn', 'at', 'an', 'nor', "should've", 'were', 'all', 'after', 'below', 'both', "mustn't", "it's", 'under', 'just', 'because', 'this', "needn't", 'or', "you've", 've', 'here'

In [17]:
# Tokenize : get a list of tokens
# Remove stop words
word_tokens = word_tokenize(text_jd)
text_jd_without_stopwords = [word for word in word_tokens if word not in stop_words]
print(text_jd_without_stopwords)

['food52', 'fastgrowing', 'james', 'beard', 'awardwinning', 'online', 'food', 'community', 'crowdsourced', 'curated', 'recipe', 'hub', 'currently', 'interviewing', 'full', 'parttime', 'unpaid', 'interns', 'work', 'small', 'team', 'editors', 'executives', 'developers', 'new', 'york', 'city', 'headquarters', 'reproducing', 'andor', 'repackaging', 'existing', 'food52', 'content', 'number', 'partner', 'sites', 'huffington', 'post', 'yahoo', 'buzzfeed', 'various', 'content', 'management', 'systems', 'researching', 'blogs', 'websites', 'provisions', 'food52', 'affiliate', 'program', 'assisting', 'daytoday', 'affiliate', 'program', 'support', 'screening', 'affiliates', 'assisting', 'affiliate', 'inquiries', 'supporting', 'pr', 'amp', 'events', 'needed', 'helping', 'office', 'administrative', 'work', 'filing', 'mailing', 'preparing', 'meetings', 'working', 'developers', 'document', 'bugs', 'suggest', 'improvements', 'site', 'supporting', 'marketing', 'executive', 'staff']


In [19]:
# Lemmatize words
text_jd_without_stopwords = [lemmatizer.lemmatize(word, pos ='v') for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

['food52', 'fastgrowing', 'jam', 'beard', 'awardwinning', 'online', 'food', 'community', 'crowdsourced', 'curated', 'recipe', 'hub', 'currently', 'interview', 'full', 'parttime', 'unpaid', 'intern', 'work', 'small', 'team', 'editors', 'executives', 'developers', 'new', 'york', 'city', 'headquarter', 'reproduce', 'andor', 'repackaging', 'exist', 'food52', 'content', 'number', 'partner', 'sit', 'huffington', 'post', 'yahoo', 'buzzfeed', 'various', 'content', 'management', 'systems', 'research', 'blog', 'websites', 'provision', 'food52', 'affiliate', 'program', 'assist', 'daytoday', 'affiliate', 'program', 'support', 'screen', 'affiliate', 'assist', 'affiliate', 'inquiries', 'support', 'pr', 'amp', 'events', 'need', 'help', 'office', 'administrative', 'work', 'file', 'mail', 'prepare', 'meet', 'work', 'developers', 'document', 'bug', 'suggest', 'improvements', 'site', 'support', 'market', 'executive', 'staff']


In [21]:
# Stem words 
text_jd_without_stopwords = [stemmer.stem(word) for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

['food52', 'fastgrow', 'jam', 'beard', 'awardwin', 'onlin', 'food', 'commun', 'crowdsourc', 'curat', 'recip', 'hub', 'current', 'interview', 'full', 'parttim', 'unpaid', 'intern', 'work', 'small', 'team', 'editor', 'execut', 'develop', 'new', 'york', 'citi', 'headquart', 'reproduc', 'andor', 'repackag', 'exist', 'food52', 'content', 'number', 'partner', 'sit', 'huffington', 'post', 'yahoo', 'buzzf', 'variou', 'content', 'manag', 'system', 'research', 'blog', 'websit', 'provi', 'food52', 'affili', 'program', 'assist', 'daytoday', 'affili', 'program', 'support', 'screen', 'affili', 'assist', 'affili', 'inquiri', 'support', 'pr', 'amp', 'event', 'need', 'help', 'offic', 'administr', 'work', 'file', 'mail', 'prepar', 'meet', 'work', 'develop', 'document', 'bug', 'suggest', 'improv', 'site', 'support', 'market', 'execut', 'staff']
