# Data Cleaning and Preparing

## Step 1: Initialization

In [None]:
import pandas as pd
import numpy as np

import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Initialize the lemmatizer and stemmer, which are used later by the cleaning function.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Download the NLTK resources used by the cleaning function.
# 'punkt_tab' is the tokenizer data that word_tokenize() looks up on
# NLTK >= 3.8.2, while older releases use 'punkt'; requesting both keeps the
# notebook working on either version. With its default arguments
# nltk.download() reports a failed download and returns False rather than raising.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

print(stopwords.words('english'))
#print(stopwords.words('chinese'))

## Step 2: Clean the Data
The dataset, EMSCAD, was downloaded from Kaggle:
https://www.kaggle.com/datasets/amruthjithrajvr/recruitment-scam

In [None]:
# Read the EMSCAD CSV file into a DataFrame and report its dimensions.
df = pd.read_csv('./DataSet.csv')
print(f"The shape of the dataframe is {df.shape}")

### Get the dataframe for only the Job description column.

In [None]:
# Keep only the 'description' column, as a single-column DataFrame.
df_jd = df['description'].to_frame()
print(f"The shape of the dataframe is {df_jd.shape}")
print("Display one Job description sample:\n")
print(df_jd['description'][0])

### Display the job description sample in a more readable way

In [None]:
# Render the first job description as HTML so that its markup is interpreted
# instead of being shown as raw tags.
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML(df_jd['description'][0]))

### Prepare the cleaning function

In [None]:
# Strip HTML tags from a string
def striphtml(data):
    """Return `data` with every `<...>` span (shortest match, single line) removed."""
    return re.sub(r'<.*?>', '', data)


# Lazily-built cache of the English stop words, shared by every clean() call.
_CLEAN_STOP_WORDS = None

# Translation table that deletes every character in string.punctuation; built once.
_CLEAN_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    global _CLEAN_STOP_WORDS
    if _CLEAN_STOP_WORDS is None:
        # nltk.download('stopwords') is done at the beginning of the notebook.
        _CLEAN_STOP_WORDS = frozenset(stopwords.words("english"))
    return _CLEAN_STOP_WORDS


def clean(text):
    """Turn a raw job-description string into a list of normalised tokens.

    Steps, in order: strip HTML tags, lowercase, replace ':' and apostrophes
    with spaces, delete the remaining punctuation, collapse whitespace,
    tokenize, drop English stop words, lemmatize each token as a verb, and
    finally stem each token.

    Args:
        text: The raw text to clean.

    Returns:
        A list of stemmed tokens. An empty list is returned when `text` is
        not a string (e.g. a missing value read by pandas as NaN).
    """
    # Non-string input cannot be cleaned; treat it as "no tokens" instead of
    # failing inside the regex / str calls below.
    if not isinstance(text, str):
        return []

    # Remove the HTML tags, then lowercase
    text = striphtml(text).lower()

    # ':' and apostrophes become spaces so the words around them stay
    # separate; every other punctuation character is simply deleted.
    text = text.replace(':', ' ').replace('\'', ' ')
    text = text.translate(_CLEAN_PUNCT_TABLE)

    # Remove extra spaces from text
    text = " ".join(text.split())

    # Tokenize and drop stop words
    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Lemmatize words (each token is treated as a verb)
    tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]

    # Stem words
    return [stemmer.stem(word) for word in tokens]

In [None]:
def simple_clean(text):
    """Return `text` lowercased with all `string.punctuation` characters deleted.

    Digits are kept (the number-removal step is intentionally disabled).
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover)

In [None]:
# For now only a small slice of the descriptions is cleaned (here [1:2]; widen
# the slice, e.g. to [1:100], as computing capacity allows).
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for _, sentence in df_jd['description'][1:2].items():
    #df_jd_sentence.add(clean(sentence))
    print(clean(sentence))


## For demonstration purposes, the cleaning process is repeated step by step below

In [None]:
# Define "text_jd" (and, in a later cell, "text_jd_without_stopwords"). Both are
# used only in this step-by-step demonstration, which works on a single job
# description (the one at index 0).
text_jd = df_jd['description'][0]
print(text_jd)

In [None]:
# Remove the HTML tags with the striphtml() helper
text_jd = striphtml(text_jd)
print(text_jd)

In [None]:
# Lowercase the text
text_jd = text_jd.lower()
print(text_jd)

In [None]:
# Replace ':' and apostrophes (') with spaces, then delete every remaining
# punctuation character listed in string.punctuation
text_jd = text_jd.replace(':', ' ')
text_jd = text_jd.replace('\'', ' ')
translator = str.maketrans('', '', string.punctuation)
text_jd = text_jd.translate(translator)
print(text_jd)

In [None]:
# Collapse every run of whitespace into a single space (also trims both ends)
text_jd = " ".join(text_jd.split())
print(text_jd)

In [None]:
# Build the set of English stop words
stop_words = set(stopwords.words("english")) # nltk.download('stopwords') - this is done at the beginning
print(stop_words)

In [None]:
# Tokenize: split the text into a list of tokens,
# then keep only the tokens that are not stop words
word_tokens = word_tokenize(text_jd)
text_jd_without_stopwords = [word for word in word_tokens if word not in stop_words]
print(text_jd_without_stopwords)

In [None]:
# Lemmatize each token, treating it as a verb (pos='v')
text_jd_without_stopwords = [lemmatizer.lemmatize(word, pos ='v') for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

In [None]:
# Stem each token with the Porter stemmer
text_jd_without_stopwords = [stemmer.stem(word) for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

## Step 3: Preparing the data

In [None]:
# Load the list of biased words or phrases from the Excel file, then print its
# shape, its first rows, and the number of rows per 'Masculine/Feminine Bias' value
df_biased_words = pd.read_excel("./bias_words.xlsx")
print(df_biased_words.shape)
print(df_biased_words.head())
print(df_biased_words['Masculine/Feminine Bias'].value_counts())

In [None]:
# Drop the generic he/she pronouns and any duplicated rows from df_biased_words
words_to_exclude = ['she', 'her', 'hers', 'herself', 'he', 'himself', 'him', 'his']
is_generic_pronoun = df_biased_words['Biased Words or Phrases'].isin(words_to_exclude)
df_biased_words = df_biased_words[~is_generic_pronoun].drop_duplicates()

print(df_biased_words.shape)
print(df_biased_words.isna().any())  # check whether any column contains empty cells
print(df_biased_words.head())
print(df_biased_words['Masculine/Feminine Bias'].value_counts())

In [None]:
# Split the biased terms into two arrays according to their bias label
bias_labels = df_biased_words['Masculine/Feminine Bias']
biased_terms = df_biased_words['Biased Words or Phrases']
male_words = biased_terms[bias_labels == 'Masculine Bias'].values
female_words = biased_terms[bias_labels == 'Feminine Bias'].values

In [None]:
# Show the terms labelled 'Masculine Bias'
print(male_words)

In [None]:
# Show the terms labelled 'Feminine Bias'
print(female_words)

In [None]:
# Start testing the detection of masculine and feminine words

In [None]:
# Three hand-written sentences used to try out the biased-word detection.
test_sentences = [
    'Must be an extrovert with an innate quality of easily connecting with people.',
    'You are self-motivated and decisive, but willing to make changes with minimal grumbling when the client demands it.',
    'We are looking for a young and driven candidate who can bring innovation into the organization.',
]
data = {'sentences': test_sentences}

df_test_sentence = pd.DataFrame(data)

In [None]:
# Display the test sentences
df_test_sentence

## The following block checks each sentence for masculine- or feminine-biased words

In [None]:
def _last_biased_match(biased_words, lowered_sentence, tokens):
    """Return (word_in_sentence, biased_term) for the last match found, or None.

    A biased term matches when it occurs in `lowered_sentence` as a whole
    word/phrase, or, for terms longer than three characters, when its
    simple_clean() form equals the beginning or the end of one of `tokens`.
    Later matches overwrite earlier ones, so only the last one is reported.
    """
    match = None
    for biased_word in biased_words:
        # re.escape() makes the term match literally even if it contains
        # characters that are special in a regular expression.
        if re.search(r"\b{}\b".format(re.escape(biased_word)), lowered_sentence):
            match = (biased_word, biased_word)
        elif len(biased_word) > 3:
            cleaned_word = simple_clean(biased_word)
            width = len(biased_word)
            for token in tokens:
                # the term is found at the beginning or at the end of the token
                if cleaned_word == token[:width] or cleaned_word == token[-width:]:
                    match = (token, biased_word)
    return match


# Each result row has the keys 'sentence', 'word_in_Sentence', 'biased_term' and
# 'category'. Rows are collected in a list and turned into a DataFrame once,
# instead of growing the DataFrame inside the loop.
check_rows = []

for index, sentence in df_test_sentence.iterrows():
    temp_sentence = sentence['sentences']
    tokenized_sentence = clean(temp_sentence)
    lowered_sentence = temp_sentence.lower().strip()

    # The same check runs for masculine and then feminine words; a sentence can
    # therefore contribute up to two rows (one per category).
    for category, biased_words in (('masculine', male_words), ('feminine', female_words)):
        match = _last_biased_match(biased_words, lowered_sentence, tokenized_sentence)
        if match is None:
            continue
        word_in_sentence, word = match
        check_rows.append({'sentence': temp_sentence,
                           'word_in_Sentence': word_in_sentence,
                           'biased_term': word,
                           'category': category})

    # This is to roughly monitor how many sentences have been processed when running this block of code
    #if index%10000 == 0:
        #print(f'{index} sentences have been processed.' )

    print(f'{index + 1} sentences have been processed.' )

df_check_result = pd.DataFrame(check_rows)

In [None]:
# All the results from checking for male/female words are now stored in 'df_check_result'
df_check_result

## This block also finds masculine- or feminine-biased words in sentences, but stores the results in a different form

In [None]:
# I'm using df_jd_sentences to store sentences extracted from EMSCAD dataset.

In [None]:
def _collect_biased_matches(biased_words, label, lowered_sentence, tokens):
    """Collect every match of `biased_words` in one sentence.

    A biased term matches when it occurs in `lowered_sentence` as a whole
    word/phrase, or, for terms longer than three characters, when its
    simple_clean() form equals the beginning or the end of one of `tokens`.

    Returns:
        Two parallel lists of [text, label] pairs: the text as found in the
        sentence (the term itself or the matching token), and the biased term.
    """
    found_in_sentence = []
    found_terms = []
    for biased_word in biased_words:
        # re.escape() makes the term match literally even if it contains
        # characters that are special in a regular expression.
        if re.search(r"\b{}\b".format(re.escape(biased_word)), lowered_sentence):
            found_in_sentence.append([biased_word, label])
            found_terms.append([biased_word, label])
        elif len(biased_word) > 3:
            cleaned_word = simple_clean(biased_word)
            width = len(biased_word)
            for token in tokens:
                # A token is recorded once, whether the term is its prefix or
                # its suffix (the suffix test is only reached when the prefix
                # test failed, so the two can never both fire).
                if cleaned_word == token[:width] or cleaned_word == token[-width:]:
                    found_in_sentence.append([token, label])
                    found_terms.append([biased_word, label])
    return found_in_sentence, found_terms


# Rows are collected in a list and turned into a DataFrame once;
# DataFrame.append() was removed in pandas 2.0.
new_rows = []

for index, sentence in df_test_sentence.iterrows():
    temp_sentence = sentence['sentences']

    # Only sentences shorter than 180 characters that start with an upper-case
    # letter are analysed. Slicing ([:1]) rather than indexing ([0]) keeps an
    # empty sentence from raising IndexError.
    if not (len(temp_sentence) < 180 and temp_sentence[:1].isupper()):
        continue

    tokenized_sentence = clean(temp_sentence)
    lowered_sentence = temp_sentence.lower().strip()
    words_in_sentence = []
    words = []

    # check for male words ('M'), then for female words ('F')
    for label, biased_words in (('M', male_words), ('F', female_words)):
        found_in_sentence, found_terms = _collect_biased_matches(
            biased_words, label, lowered_sentence, tokenized_sentence)
        words_in_sentence.extend(found_in_sentence)
        words.extend(found_terms)

    if words:
        new_rows.append({'sentence': temp_sentence,
                         'word_in_Sentence': words_in_sentence,
                         'biased_term': words})

    if index % 10000 == 0:
        print(index)

df_new = pd.DataFrame(new_rows)

In [None]:
# Display the sentences together with the biased words found in each of them
df_new

In [None]:
def clean_sentence(sentence):
    """Normalise whitespace and strip non-alphabetic wrappers from a sentence.

    Runs of whitespace are collapsed to single spaces. Non-alphabetic
    characters at the very start of the first word (bullets, quotes,
    numbering, ...) are removed, and at most the same number of
    non-alphanumeric characters are removed from the end (e.g. a closing
    quote). Characters inside words, such as the hyphen in "Self-motivated",
    are left alone.

    Args:
        sentence: The sentence to clean.

    Returns:
        The cleaned sentence; an empty string for empty or blank input.
    """
    temp = " ".join(sentence.split())
    if not temp:
        # Nothing to clean; also avoids indexing the first word of an empty split.
        return temp

    # Count the non-alphabetic characters that lead the first word.
    first_word = temp.split()[0]
    leading = 0
    for char in first_word:
        if char.isalpha():
            break
        leading += 1

    # Trim up to `leading` characters from the end, but only while they are
    # not letters or digits, so that real words are never cut.
    end = len(temp)
    remaining = leading
    while remaining and end > leading and not temp[end - 1].isalnum():
        end -= 1
        remaining -= 1

    # Return the cleaned text (not the untouched input).
    return temp[leading:end].strip()

# In pandas, Series.map() applies a function to every element of a Series (or
# of a DataFrame column) and returns a new Series holding the results.

# Clean every value in the 'sentence' column of 'df_new' with 'clean_sentence'.
df_new['sentence'] = df_new['sentence'].map(clean_sentence)

In [76]:
# Preview the first rows of df_new
df_new.head()

Unnamed: 0,sentence,word_in_Sentence,biased_term
0,Must be an extrovert with an innate quality of...,"[[connect, F]]","[[connect, F]]"
1,"You are self-motivated and decisive, but willi...","[[decisive, M]]","[[decisive, M]]"


### explore filtered sentences

### save data