# Data Cleaning and Preparing

## Step 1: initialization 

In [1]:
import pandas as pd
import numpy as np

import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
#initialize the lematizer and stemmer, which will be used later.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

#to be used in the cleaning function
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

print(stopwords.words('english'))
#print(stopwords.words('chinese'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Step 2: Clean the Data
The dataset, EMSCAD, is downloaded from Kaggle, the link is here:
https://www.kaggle.com/datasets/amruthjithrajvr/recruitment-scam

In [3]:
#load the EMSCAD dataset
df = pd.read_csv('./DataSet.csv')
print("The shape of the dataframe is",df.shape) 

The shape of the dataframe is (17880, 18)


### Get the dataframe for only the Job description column.

In [4]:
df_jd = pd.DataFrame(df['description'])
print("The shape of the dataframe is", df_jd.shape) 
print("Display one Job description sample:\n") 
print(df_jd['description'][0]) 

The shape of the dataframe is (17880, 1)
Display one Job description sample:

<p>Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.</p>
<ul>
<li>Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems</li>
<li>Researching blogs and websites for the Provisions by Food52 Affiliate Program</li>
<li>Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries</li>
<li>Supporting with PR &amp; Events when needed</li>
<li>Helping with office administrative work, such as filing, mailing, and preparing for meetings</li>
<li>Working with developers to document bugs and suggest improvement

### To display the Job description sample in a more readable way.

In [5]:
from IPython.core.display import display, HTML
display(HTML(df_jd['description'][0]))

### Prepare the cleaning function

In [6]:
# remove the HTML tags
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)


def clean(text):
    
    # remove the HTML tags
    text = striphtml(text)
    
    # Lowercase text
    text = text.lower()
    
    # Remove punctuation
    text = text.replace(':', ' ')
    text = text.replace('\'', ' ')
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    # Remove extra spaces from text
    text = " ".join(text.split())
    
    # Remove stopwords function
    # Tokenize : get a list of tokens
    stop_words = set(stopwords.words("english")) # nltk.download('stopwords') - this is done at the begining
    word_tokens = word_tokenize(text)
    text = [word for word in word_tokens if word not in stop_words]
    
    # Lemmatize words
    text = [lemmatizer.lemmatize(word, pos ='v') for word in text]
    
    # Stem words
    text = [stemmer.stem(word) for word in text]
    
    return text

In [62]:
def simple_clean(text):
    # Lowercase text
    text = text.lower()
    
    # Remove numbers
    #text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    return text

In [7]:
# for now, I only used some of the examples (df_jd['description'][1:100]), due to the computing capability.

df_jd_sentence = pd.DataFrame()

for index, sentence in df_jd['description'][1:5].iteritems(): 
    #df_jd_sentence.add(clean(sentence))
    print(clean(sentence))


['organis', 'focu', 'vibrant', 'awesomedo', 'passion', 'custom', 'servic', 'slick', 'type', 'skill', 'mayb', 'account', 'manag', 'think', 'administr', 'cooler', 'polar', 'bear', 'jetski', 'need', 'hear', 'cloud', 'video', 'product', 'servic', 'opper', 'glodal', 'level', 'yeah', 'pretti', 'cool', 'seriou', 'deliv', 'world', 'class', 'product', 'excel', 'custom', 'serviceour', 'rapidli', 'expand', 'busi', 'look', 'talent', 'project', 'manag', 'manag', 'success', 'deliveri', 'video', 'project', 'manag', 'client', 'commun', 'drive', 'product', 'process', 'work', 'coolest', 'brand', 'planet', 'learn', 'global', 'team', 'repres', 'nz', 'huge', 'way', 'enter', 'next', 'growth', 'stage', 'busi', 'grow', 'quickli', 'intern', 'therefor', 'posit', 'burst', 'opportun', 'right', 'person', 'enter', 'busi', 'right', 'time', '90', 'second', 'world', 'cloud', 'video', 'product', 'servic', 'http', '90urlfbe6559afac620a3cd2c22281f7b8d0eef56a73e3d9a311e2f1ca13d081dd630', '90', 'second', 'world', 'cloud', 

##  For the purpose of demonstration, I did a cleaning process step by step

In [8]:
# define new variables, "text_jd" and "text_jd_without_stopwords", which will only be used here for demonstration
# and only use one job description as example.
text_jd = df_jd['description'][0]
print(text_jd)

<p>Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.</p>
<ul>
<li>Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems</li>
<li>Researching blogs and websites for the Provisions by Food52 Affiliate Program</li>
<li>Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries</li>
<li>Supporting with PR &amp; Events when needed</li>
<li>Helping with office administrative work, such as filing, mailing, and preparing for meetings</li>
<li>Working with developers to document bugs and suggest improvements to the site</li>
<li>Supporting the marketing and executive staff</li>
</u

In [9]:
# remove the HTML tags
text_jd = striphtml(text_jd)
print(text_jd)

Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.

Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems
Researching blogs and websites for the Provisions by Food52 Affiliate Program
Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries
Supporting with PR &amp; Events when needed
Helping with office administrative work, such as filing, mailing, and preparing for meetings
Working with developers to document bugs and suggest improvements to the site
Supporting the marketing and executive staff



In [10]:
# Lowercase text
text_jd = text_jd.lower()
print(text_jd)

food52, a fast-growing, james beard award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its new york city headquarters.

reproducing and/or repackaging existing food52 content for a number of partner sites, such as huffington post, yahoo, buzzfeed, and more in their various content management systems
researching blogs and websites for the provisions by food52 affiliate program
assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries
supporting with pr &amp; events when needed
helping with office administrative work, such as filing, mailing, and preparing for meetings
working with developers to document bugs and suggest improvements to the site
supporting the marketing and executive staff



In [11]:
# Remove ':', '\', and punctuation
text_jd = text_jd.replace(':', ' ')
text_jd = text_jd.replace('\'', ' ')
translator = str.maketrans('', '', string.punctuation)
text_jd = text_jd.translate(translator)
print(text_jd)

food52 a fastgrowing james beard awardwinning online food community and crowdsourced and curated recipe hub is currently interviewing full and parttime unpaid interns to work in a small team of editors executives and developers in its new york city headquarters

reproducing andor repackaging existing food52 content for a number of partner sites such as huffington post yahoo buzzfeed and more in their various content management systems
researching blogs and websites for the provisions by food52 affiliate program
assisting in daytoday affiliate program support such as screening affiliates and assisting in any affiliate inquiries
supporting with pr amp events when needed
helping with office administrative work such as filing mailing and preparing for meetings
working with developers to document bugs and suggest improvements to the site
supporting the marketing and executive staff



In [12]:
# Remove extra spaces from text
text_jd = " ".join(text_jd.split())
print(text_jd)

food52 a fastgrowing james beard awardwinning online food community and crowdsourced and curated recipe hub is currently interviewing full and parttime unpaid interns to work in a small team of editors executives and developers in its new york city headquarters reproducing andor repackaging existing food52 content for a number of partner sites such as huffington post yahoo buzzfeed and more in their various content management systems researching blogs and websites for the provisions by food52 affiliate program assisting in daytoday affiliate program support such as screening affiliates and assisting in any affiliate inquiries supporting with pr amp events when needed helping with office administrative work such as filing mailing and preparing for meetings working with developers to document bugs and suggest improvements to the site supporting the marketing and executive staff


In [13]:
# set stopwords 
stop_words = set(stopwords.words("english")) # nltk.download('stopwords') - this is done at the begining
print(stop_words)

{"hasn't", "couldn't", 'him', 've', 'being', 'didn', 'here', 'will', 'herself', 'shouldn', 'shan', "aren't", 'each', 'don', 'some', 'same', 'all', "you'd", 'you', 'hasn', 'we', "you'll", 'can', 'needn', "shouldn't", 'd', 'a', 'of', 'doesn', "doesn't", 'out', 's', 'his', "she's", 'themselves', 'isn', 'only', 'why', 'were', 'ma', 'over', 'where', 'other', "should've", 'my', 'after', 'such', 'll', 'the', 'm', 'at', 'or', 'few', 'i', 'she', 'down', 'won', 'once', 'weren', 'which', 'be', 'how', "that'll", 'very', "you're", 'he', 'by', 'because', "don't", 'these', 'against', 'own', 'then', 'but', 'if', 'to', 'those', 'both', 'our', 'mightn', 'when', 'until', 'aren', 'me', 'into', 'have', 'yours', 'before', 'through', "isn't", 'wouldn', 'hers', 'what', 'ain', 'their', 'any', 'o', 'wasn', 'had', "wouldn't", 'that', 'her', 'is', "didn't", 'couldn', 'above', "it's", 'theirs', 'on', 'about', 'off', 'do', 'it', 'ours', 'just', 'further', "mustn't", 'too', 'this', 'them', 'below', 'itself', 'there'

In [14]:
# Tokenize : get a list of tokens
# Remove stop words
word_tokens = word_tokenize(text_jd)
text_jd_without_stopwords = [word for word in word_tokens if word not in stop_words]
print(text_jd_without_stopwords)

['food52', 'fastgrowing', 'james', 'beard', 'awardwinning', 'online', 'food', 'community', 'crowdsourced', 'curated', 'recipe', 'hub', 'currently', 'interviewing', 'full', 'parttime', 'unpaid', 'interns', 'work', 'small', 'team', 'editors', 'executives', 'developers', 'new', 'york', 'city', 'headquarters', 'reproducing', 'andor', 'repackaging', 'existing', 'food52', 'content', 'number', 'partner', 'sites', 'huffington', 'post', 'yahoo', 'buzzfeed', 'various', 'content', 'management', 'systems', 'researching', 'blogs', 'websites', 'provisions', 'food52', 'affiliate', 'program', 'assisting', 'daytoday', 'affiliate', 'program', 'support', 'screening', 'affiliates', 'assisting', 'affiliate', 'inquiries', 'supporting', 'pr', 'amp', 'events', 'needed', 'helping', 'office', 'administrative', 'work', 'filing', 'mailing', 'preparing', 'meetings', 'working', 'developers', 'document', 'bugs', 'suggest', 'improvements', 'site', 'supporting', 'marketing', 'executive', 'staff']


In [15]:
# Lemmatize words
text_jd_without_stopwords = [lemmatizer.lemmatize(word, pos ='v') for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

['food52', 'fastgrowing', 'jam', 'beard', 'awardwinning', 'online', 'food', 'community', 'crowdsourced', 'curated', 'recipe', 'hub', 'currently', 'interview', 'full', 'parttime', 'unpaid', 'intern', 'work', 'small', 'team', 'editors', 'executives', 'developers', 'new', 'york', 'city', 'headquarter', 'reproduce', 'andor', 'repackaging', 'exist', 'food52', 'content', 'number', 'partner', 'sit', 'huffington', 'post', 'yahoo', 'buzzfeed', 'various', 'content', 'management', 'systems', 'research', 'blog', 'websites', 'provision', 'food52', 'affiliate', 'program', 'assist', 'daytoday', 'affiliate', 'program', 'support', 'screen', 'affiliate', 'assist', 'affiliate', 'inquiries', 'support', 'pr', 'amp', 'events', 'need', 'help', 'office', 'administrative', 'work', 'file', 'mail', 'prepare', 'meet', 'work', 'developers', 'document', 'bug', 'suggest', 'improvements', 'site', 'support', 'market', 'executive', 'staff']


In [16]:
# Stem words 
text_jd_without_stopwords = [stemmer.stem(word) for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

['food52', 'fastgrow', 'jam', 'beard', 'awardwin', 'onlin', 'food', 'commun', 'crowdsourc', 'curat', 'recip', 'hub', 'current', 'interview', 'full', 'parttim', 'unpaid', 'intern', 'work', 'small', 'team', 'editor', 'execut', 'develop', 'new', 'york', 'citi', 'headquart', 'reproduc', 'andor', 'repackag', 'exist', 'food52', 'content', 'number', 'partner', 'sit', 'huffington', 'post', 'yahoo', 'buzzfe', 'variou', 'content', 'manag', 'system', 'research', 'blog', 'websit', 'provis', 'food52', 'affili', 'program', 'assist', 'daytoday', 'affili', 'program', 'support', 'screen', 'affili', 'assist', 'affili', 'inquiri', 'support', 'pr', 'amp', 'event', 'need', 'help', 'offic', 'administr', 'work', 'file', 'mail', 'prepar', 'meet', 'work', 'develop', 'document', 'bug', 'suggest', 'improv', 'site', 'support', 'market', 'execut', 'staff']


# Step 2: preparing the data

In [17]:
# get the list of biased words or phrases
df_biased_words = pd.read_excel("./bias_words.xlsx")
print(df_biased_words.shape)
print(df_biased_words.head())
print(df_biased_words['Masculine/Feminine Bias'].value_counts())

(159, 2)
  Biased Words or Phrases Masculine/Feminine Bias
0                  active          Masculine Bias
1             adventurous          Masculine Bias
2                 aggress          Masculine Bias
3                 ambitio          Masculine Bias
4                   analy          Masculine Bias
Masculine Bias    95
Feminine Bias     64
Name: Masculine/Feminine Bias, dtype: int64


In [18]:
# exclude generic he/she and dupulicated words in df_biased_words
words_to_exclude = ['she', 'her', 'hers', 'herself', 'he', 'himself', 'him', 'his']
df_biased_words = df_biased_words[~df_biased_words['Biased Words or Phrases'].isin(words_to_exclude)]
df_biased_words = df_biased_words.drop_duplicates()

print(df_biased_words.shape)
print(df_biased_words.isna().any())# check if there's any empty cells
print(df_biased_words.head())
print(df_biased_words['Masculine/Feminine Bias'].value_counts())

(137, 2)
Biased Words or Phrases    False
Masculine/Feminine Bias    False
dtype: bool
  Biased Words or Phrases Masculine/Feminine Bias
0                  active          Masculine Bias
1             adventurous          Masculine Bias
2                 aggress          Masculine Bias
3                 ambitio          Masculine Bias
4                   analy          Masculine Bias
Masculine Bias    81
Feminine Bias     56
Name: Masculine/Feminine Bias, dtype: int64


In [19]:
# get the list of male_words and female words
male_words = (df_biased_words.loc[df_biased_words['Masculine/Feminine Bias'] == 'Masculine Bias'])['Biased Words or Phrases'].values
female_words = (df_biased_words.loc[df_biased_words['Masculine/Feminine Bias'] == 'Feminine Bias'])['Biased Words or Phrases'].values

In [20]:
print(male_words)

['active' 'adventurous' 'aggress' 'ambitio' 'analy' 'assert' 'athlet'
 'autonom' 'boast' 'challeng' 'compet' 'confident' 'courag' 'decide'
 'decisive' 'decision' 'determin' 'dominant' 'domina' 'force' 'greedy'
 'headstrong' 'hierarch' 'hostil' 'implusive' 'independen' 'individual'
 'intellect' 'lead' 'logic' 'masculine' 'objective' 'opinion' 'outspoken'
 'persist' 'principle' 'reckless' 'stubborn' 'superior' 'self-confiden'
 'self-sufficien' 'self-relian' 'manmade' 'chairman' 'son' 'fireman'
 'freshman' 'man' 'mankind' 'manpower' 'boyfriend' 'husband' 'policeman'
 'walter' 'brother' 'spokesman' 'upperclassman' 'gentleman' 'alumnus'
 'alumni' 'man up' 'Mr.' 'man-made' 'the common man' 'mailman' 'steward'
 'actor' 'congressman' 'acts as a leader' 'aggressive' 'ambitious'
 'analytical' 'assertive' 'athletic' 'competitive' 'defends own beliefs'
 'forceful' 'has leadership abilities' 'independent' 'individualistic'
 'makes decisions easily']


In [21]:
print(female_words)

['affectionate' 'child' 'cheer' 'commit' 'communal' 'compassion' 'connect'
 'considerate' 'cooperat' 'depend' 'emotiona' 'empath' 'feminine'
 'flatterable' 'gentle' 'honest' 'interpersonal' 'interdependen'
 'interpersona' 'kind' 'kinship' 'loyal' 'modesty' 'nag' 'nurtur'
 'pleasant' 'polite' 'quiet' 'respon' 'sensitiv' 'submissive' 'support'
 'sympath' 'tender' 'together' 'trust' 'understand' 'warm' 'whin' 'yield'
 'daughter' 'wife' 'girlfriend' 'waitress' 'sister' 'ladies' 'alumna'
 'alumnae' 'hysterical' 'shrill' 'nagging' 'Mrs.' 'Miss.' 'Ms.'
 'stewardess' 'actress']


In [22]:
#start to test for checking male and female worsd

In [56]:
data = {'sentences': ['Must be an extrovert with an innate quality of easily connecting with people.', 
                      'You are self-motivated and decisive, but willing to make changes with minimal grumbling when the client demands it.', 
                      'We are looking for a young and driven candidate who can bring innovation into the organization.']}

df_test_sentence = pd.DataFrame(data)

In [61]:
df_test_sentence

Unnamed: 0,sentences
0,Must be an extrovert with an innate quality of...
1,"You are self-motivated and decisive, but willi..."
2,We are looking for a young and driven candidat...


## the following block is to fine out  in each setnece if there is any male or female words

In [59]:
df_check_result = pd.DataFrame()

for index, sentence in df_test_sentence.iterrows():
    temp_sentence = sentence['sentences']
    tokenized_sentence = clean(temp_sentence)
    
    # there will be columns named 'category', 'word_in_sentences', and 'biased_term' in 'df_check_result' 
    category = 'neutral' 
    word_in_sentence = 'None' # 
    word = 'None' # 
    
    # check for male words, and them put the outcome to 
    for male_word in male_words:
        if re.search(r"\b{}\b".format(male_word), temp_sentence.lower().strip()): # search for 'male_word' in 'temp_sentence' using RE
            # set output if 'male_word' is found
            category = 'masculine'
            word_in_sentence = male_word
            word = male_word
            #when there is no male word in the temp_sentence 
        else:
            for token in tokenized_sentence:
                if len(male_word) > 3:
                    if simple_clean(male_word) == token[:len(male_word)]: # check if the male_word is found at the beginning of the token
                        category = 'masculine'
                        word_in_sentence = token
                        word = male_word
                    elif simple_clean(male_word) == token[-len(male_word):]: # check if the male_word is found at the end of the token
                        category = 'masculine'
                        word_in_sentence = token
                        word = male_word
            
    if category == 'masculine': # put the outcome in a dict, then append them to 'df_check_result'
        dict = {'sentence': temp_sentence,
                'word_in_Sentence': word_in_sentence,
                'biased_term': word,
                'category': category
               }
        #df_check_result = df_check_result.append(dict, ignore_index = True)
        df_check_result = pd.concat([df_check_result, pd.DataFrame([dict])], ignore_index=True)

        
        
    # the completely same process for checking for female words
    for female_word in female_words:
        if re.search(r"\b{}\b".format(female_word), temp_sentence.lower().strip()):
            category = 'feminine'
            word_in_sentence = female_word
            word = female_word
        else:
            for token in tokenized_sentence:
                if len(female_word) > 3:
                    if simple_clean(female_word) == token[:len(female_word)]:
                        category = 'feminine'
                        word_in_sentence = token
                        word = female_word
                    elif simple_clean(female_word) == token[-len(female_word):]:
                        category = 'feminine'
                        word_in_sentence = token
                        word = female_word
                    
    if category == 'feminine':
        dict = {'sentence': temp_sentence,
                'word_in_Sentence': word_in_sentence,
                'biased_term': word,
                'category': category
               }
        #df_check_result = df_check_result.append(dict, ignore_index = True)
        df_check_result = pd.concat([df_check_result, pd.DataFrame([dict])], ignore_index=True)

    
    
    # This is to roughly monitor how many lines/sentences have been proceeded when running this block of code
    if index%10000 == 0:
        print(index)

0


In [63]:
# all the reult frome checking male/female words are now put into 'df_check_result'
df_check_result

Unnamed: 0,sentence,word_in_Sentence,biased_term,category
0,Must be an extrovert with an innate quality of...,connect,connect,feminine
1,"You are self-motivated and decisive, but willi...",decisive,decisive,masculine
