# Data Cleaning and Preparing

## Step 1: initialization 

In [9]:
import pandas as pd
import numpy as np

import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [18]:
#initialize the lematizer and stemmer, which will be used later.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

#to be used in the cleaning function
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

print(stopwords.words('english'))
#print(stopwords.words('chinese'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Step 2: Clean the Data
The dataset, EMSCAD, is downloaded from Kaggle, the link is here:
https://www.kaggle.com/datasets/amruthjithrajvr/recruitment-scam

In [11]:
#load the EMSCAD dataset
df = pd.read_csv('./DataSet.csv')
print("The shape of the dataframe is",df.shape) 

The shape of the dataframe is (17880, 18)


### Get the dataframe for only the Job description column.

In [12]:
df_jd = pd.DataFrame(df['description'])
print("The shape of the dataframe is", df_jd.shape) 
print("Display one Job description sample:\n") 
print(df_jd['description'][0]) 

The shape of the dataframe is (17880, 1)
Display one Job description sample:

<p>Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.</p>
<ul>
<li>Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems</li>
<li>Researching blogs and websites for the Provisions by Food52 Affiliate Program</li>
<li>Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries</li>
<li>Supporting with PR &amp; Events when needed</li>
<li>Helping with office administrative work, such as filing, mailing, and preparing for meetings</li>
<li>Working with developers to document bugs and suggest improvement

### To display the Job description sample in a more readable way.

In [13]:
from IPython.core.display import display, HTML
display(HTML(df_jd['description'][0]))

### Prepare the cleaning function

In [23]:
# remove the HTML tags
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)


def clean(text):
    
    # remove the HTML tags
    text = striphtml(text)
    
    # Lowercase text
    text = text.lower()
    
    # Remove punctuation
    text = text.replace(':', ' ')
    text = text.replace('\'', ' ')
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    # Remove extra spaces from text
    text = " ".join(text.split())
    
    # Remove stopwords function
    # Tokenize : get a list of tokens
    stop_words = set(stopwords.words("english")) # nltk.download('stopwords') - this is done at the begining
    word_tokens = word_tokenize(text)
    text = [word for word in word_tokens if word not in stop_words]
    
    # Lemmatize words
    text = [lemmatizer.lemmatize(word, pos ='v') for word in text]
    
    # Stem words
    text = [stemmer.stem(word) for word in text]
    
    return text

In [24]:
# for now, I only used some of the examples (df_jd['description'][1:100]), due to the computing capability.

df_jd_sentence = pd.DataFrame()

for index, sentence in df_jd['description'][1:100].iteritems(): 
    #df_jd_sentence.add(clean(sentence))
    print(clean(sentence))


['organis', 'focu', 'vibrant', 'awesomedo', 'passion', 'custom', 'servic', 'slick', 'type', 'skill', 'mayb', 'account', 'manag', 'think', 'administr', 'cooler', 'polar', 'bear', 'jetski', 'need', 'hear', 'cloud', 'video', 'product', 'servic', 'opper', 'glodal', 'level', 'yeah', 'pretti', 'cool', 'seriou', 'deliv', 'world', 'class', 'product', 'excel', 'custom', 'serviceour', 'rapidli', 'expand', 'busi', 'look', 'talent', 'project', 'manag', 'manag', 'success', 'deliveri', 'video', 'project', 'manag', 'client', 'commun', 'drive', 'product', 'process', 'work', 'coolest', 'brand', 'planet', 'learn', 'global', 'team', 'repres', 'nz', 'huge', 'way', 'enter', 'next', 'growth', 'stage', 'busi', 'grow', 'quickli', 'intern', 'therefor', 'posit', 'burst', 'opportun', 'right', 'person', 'enter', 'busi', 'right', 'time', '90', 'second', 'world', 'cloud', 'video', 'product', 'servic', 'http', '90urlfbe6559afac620a3cd2c22281f7b8d0eef56a73e3d9a311e2f1ca13d081dd630', '90', 'second', 'world', 'cloud', 

['love', 'client', 'get', 'conduit', 'commun', 'love', 'feel', 'like', 'magic', 'someth', 'beauti', 'creat', 'solv', 'busi', 'goal', 'meet', 'user', '’', 'need', 'love', 'work', 'design', 'across', 'varieti', 'project', '’', 'shi', 'away', 'difficult', 'convers', 'natur', 'mentor', 'creat', 'visual', 'design', 'fastpac', 'environ', 'mean', 'bring', 'strong', 'voic', 'effici', 'speed', 'eye', 'critiqu', 'environ', 'call', 'extrem', 'commun', 'enthusiasm', 'empathi', 'art', 'director', 'understand', 'time', 'diva', 'wannab', 'superhero', 'understand', 'success', 'determin', 'strength', 'team', 'whole', '’', 'essenti', 'member', 'carri', 'hisher', 'weight', 'art', 'director', '’', 'work', 'close', 'client', 'across', 'varieti', 'vertic', 'consum', 'enterpris', 'space', 'project', 'push', 'outsid', 'comfort', 'zone', 'industri', 'experi', 'util', 'help', 'shape', 'futur', 'mutual', 'mobil', '’', 'design', 'team', '’', 'keep', 'busi', 'lead', 'work', 'across', '1', '2', 'key', 'account', 'd

['name', 'agil', 'startup', 'dedic', 'build', 'endtoend', 'human', 'capit', 'manag', 'platform', 'busi', 'size', '–', 'client', 'like', 'birchbox', 'hailo', 'amp', 'warbi', 'parker', 'use', 'name', 'hr', 'manag', 'user', 'vari', 'hr', 'administr', 'manag', 'averag', 'employe', 'client', 'cultur', 'polici']
['gbi', 'grow', 'compani', 'develop', 'sever', 'cut', 'edg', 'system', 'financi', 'industri', 'set', 'embark', 'excit', 'new', 'integr', 'new', 'partner', 'seek', 'individu', 'lead', 'execut', 'new', 'project', 'design', 'implement', 'support', 'selfmotiv', 'detailori', 'energet', 'passion', 'lead', 'facet', 'softwar', 'develop', 'project', 'build', 'team', 'bring', 'bear', 'compani', 'descript', 'highli', 'motiv', 'individu', 'quickli', 'come', 'speed', 'understand', 'support', 'exist', 'applic', 'experi', 'busi', 'analysi', 'requir', 'gather', 'function', 'specif', 'test', 'support', 'new', 'exist', 'applic', 'demonstr', 'abil', 'work', 'user', 'goto', 'person', 'applic', 'relat', 

['administr', 'assist', 'base', 'san', 'francisco', 'ca', 'right', 'candid', 'integr', 'part', 'talent', 'team', 'support', 'continu', 'growth', 'respons', 'attend', 'meet', 'order', 'record', 'minut', 'compil', 'transcrib', 'distribut', 'minut', 'meet', 'conduct', 'research', 'compil', 'data', 'prepar', 'paper', 'consider', 'present', 'execut', 'committe', 'board', 'director', 'coordin', 'direct', 'offic', 'servic', 'meet', 'individu', 'special', 'interest', 'group', 'other', 'review', 'oper', 'practic', 'procedur', 'order', 'determin', 'whether', 'improv', 'make', 'area', 'workflow', 'report', 'procedur', 'expenditur', 'provid', 'secretari', 'support', 'profession', 'supervisor', 'manag', 'case', 'subordin', 'staff', 'individu', 'provid', 'highlyskil', 'keyboard', 'support', 'transcrib', 'record', 'inform']
['fantast', 'opportun', 'someon', 'want', 'start', 'career', 'custom', 'servic', 'first', '12', 'month', 'work', 'toward', 'level', '2', 'custom', 'servic', 'nvq', 'keep', 'perman

##  For the purpose of demonstration, I did a cleaning process step by step

In [25]:
# define new variables, "text_jd" and "text_jd_without_stopwords", which will only be used here for demonstration
# and only use one job description as example.
text_jd = df_jd['description'][0]
print(text_jd)

<p>Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.</p>
<ul>
<li>Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems</li>
<li>Researching blogs and websites for the Provisions by Food52 Affiliate Program</li>
<li>Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries</li>
<li>Supporting with PR &amp; Events when needed</li>
<li>Helping with office administrative work, such as filing, mailing, and preparing for meetings</li>
<li>Working with developers to document bugs and suggest improvements to the site</li>
<li>Supporting the marketing and executive staff</li>
</u

In [26]:
# remove the HTML tags
text_jd = striphtml(text_jd)
print(text_jd)

Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.

Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systems
Researching blogs and websites for the Provisions by Food52 Affiliate Program
Assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries
Supporting with PR &amp; Events when needed
Helping with office administrative work, such as filing, mailing, and preparing for meetings
Working with developers to document bugs and suggest improvements to the site
Supporting the marketing and executive staff



In [27]:
# Lowercase text
text_jd = text_jd.lower()
print(text_jd)

food52, a fast-growing, james beard award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its new york city headquarters.

reproducing and/or repackaging existing food52 content for a number of partner sites, such as huffington post, yahoo, buzzfeed, and more in their various content management systems
researching blogs and websites for the provisions by food52 affiliate program
assisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiries
supporting with pr &amp; events when needed
helping with office administrative work, such as filing, mailing, and preparing for meetings
working with developers to document bugs and suggest improvements to the site
supporting the marketing and executive staff



In [28]:
# Remove ':', '\', and punctuation
text_jd = text_jd.replace(':', ' ')
text_jd = text_jd.replace('\'', ' ')
translator = str.maketrans('', '', string.punctuation)
text_jd = text_jd.translate(translator)
print(text_jd)

food52 a fastgrowing james beard awardwinning online food community and crowdsourced and curated recipe hub is currently interviewing full and parttime unpaid interns to work in a small team of editors executives and developers in its new york city headquarters

reproducing andor repackaging existing food52 content for a number of partner sites such as huffington post yahoo buzzfeed and more in their various content management systems
researching blogs and websites for the provisions by food52 affiliate program
assisting in daytoday affiliate program support such as screening affiliates and assisting in any affiliate inquiries
supporting with pr amp events when needed
helping with office administrative work such as filing mailing and preparing for meetings
working with developers to document bugs and suggest improvements to the site
supporting the marketing and executive staff



In [29]:
# Remove extra spaces from text
text_jd = " ".join(text_jd.split())
print(text_jd)

food52 a fastgrowing james beard awardwinning online food community and crowdsourced and curated recipe hub is currently interviewing full and parttime unpaid interns to work in a small team of editors executives and developers in its new york city headquarters reproducing andor repackaging existing food52 content for a number of partner sites such as huffington post yahoo buzzfeed and more in their various content management systems researching blogs and websites for the provisions by food52 affiliate program assisting in daytoday affiliate program support such as screening affiliates and assisting in any affiliate inquiries supporting with pr amp events when needed helping with office administrative work such as filing mailing and preparing for meetings working with developers to document bugs and suggest improvements to the site supporting the marketing and executive staff


In [30]:
# set stopwords 
stop_words = set(stopwords.words("english")) # nltk.download('stopwords') - this is done at the begining
print(stop_words)

{'because', 'myself', 'any', "weren't", 're', "couldn't", 'during', 'are', "she's", 'this', "needn't", 'what', 'nor', 'who', 'was', 'all', 'over', 'didn', 'isn', 'while', 'is', 'him', 'such', 'then', 'own', 'mustn', 'they', 'only', "won't", 'will', 'before', 'below', 'now', "didn't", 'ma', 'you', 'until', "mustn't", 'yours', 'to', 'from', 'theirs', 'had', 'and', 'off', 'that', "it's", 'few', 'he', 'our', 'd', 'these', 'being', 'same', 'a', 'at', 'both', 'having', 'ain', "isn't", 'shan', "you'll", 'weren', 'when', "shan't", 'herself', 'were', 'of', "you've", 'most', 'his', 'too', "aren't", 'has', 'doesn', 'mightn', 'by', "shouldn't", "hasn't", 'wouldn', 'as', 'i', 'been', 'some', "mightn't", "that'll", 'ourselves', "you'd", 'yourself', 'through', 'once', 'am', 'down', "doesn't", 'themselves', 'won', 'my', 'where', 's', 'again', 'further', 'above', 'does', 'under', 'itself', 'an', 'about', 'o', 'just', "should've", 'on', 'doing', 'don', 't', 'your', 'those', 'we', 'each', 'other', 'betwe

In [31]:
# Tokenize : get a list of tokens
# Remove stop words
word_tokens = word_tokenize(text_jd)
text_jd_without_stopwords = [word for word in word_tokens if word not in stop_words]
print(text_jd_without_stopwords)

['food52', 'fastgrowing', 'james', 'beard', 'awardwinning', 'online', 'food', 'community', 'crowdsourced', 'curated', 'recipe', 'hub', 'currently', 'interviewing', 'full', 'parttime', 'unpaid', 'interns', 'work', 'small', 'team', 'editors', 'executives', 'developers', 'new', 'york', 'city', 'headquarters', 'reproducing', 'andor', 'repackaging', 'existing', 'food52', 'content', 'number', 'partner', 'sites', 'huffington', 'post', 'yahoo', 'buzzfeed', 'various', 'content', 'management', 'systems', 'researching', 'blogs', 'websites', 'provisions', 'food52', 'affiliate', 'program', 'assisting', 'daytoday', 'affiliate', 'program', 'support', 'screening', 'affiliates', 'assisting', 'affiliate', 'inquiries', 'supporting', 'pr', 'amp', 'events', 'needed', 'helping', 'office', 'administrative', 'work', 'filing', 'mailing', 'preparing', 'meetings', 'working', 'developers', 'document', 'bugs', 'suggest', 'improvements', 'site', 'supporting', 'marketing', 'executive', 'staff']


In [32]:
# Lemmatize words
text_jd_without_stopwords = [lemmatizer.lemmatize(word, pos ='v') for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

['food52', 'fastgrowing', 'jam', 'beard', 'awardwinning', 'online', 'food', 'community', 'crowdsourced', 'curated', 'recipe', 'hub', 'currently', 'interview', 'full', 'parttime', 'unpaid', 'intern', 'work', 'small', 'team', 'editors', 'executives', 'developers', 'new', 'york', 'city', 'headquarter', 'reproduce', 'andor', 'repackaging', 'exist', 'food52', 'content', 'number', 'partner', 'sit', 'huffington', 'post', 'yahoo', 'buzzfeed', 'various', 'content', 'management', 'systems', 'research', 'blog', 'websites', 'provision', 'food52', 'affiliate', 'program', 'assist', 'daytoday', 'affiliate', 'program', 'support', 'screen', 'affiliate', 'assist', 'affiliate', 'inquiries', 'support', 'pr', 'amp', 'events', 'need', 'help', 'office', 'administrative', 'work', 'file', 'mail', 'prepare', 'meet', 'work', 'developers', 'document', 'bug', 'suggest', 'improvements', 'site', 'support', 'market', 'executive', 'staff']


In [33]:
# Stem words 
text_jd_without_stopwords = [stemmer.stem(word) for word in text_jd_without_stopwords]
print(text_jd_without_stopwords)

['food52', 'fastgrow', 'jam', 'beard', 'awardwin', 'onlin', 'food', 'commun', 'crowdsourc', 'curat', 'recip', 'hub', 'current', 'interview', 'full', 'parttim', 'unpaid', 'intern', 'work', 'small', 'team', 'editor', 'execut', 'develop', 'new', 'york', 'citi', 'headquart', 'reproduc', 'andor', 'repackag', 'exist', 'food52', 'content', 'number', 'partner', 'sit', 'huffington', 'post', 'yahoo', 'buzzfe', 'variou', 'content', 'manag', 'system', 'research', 'blog', 'websit', 'provis', 'food52', 'affili', 'program', 'assist', 'daytoday', 'affili', 'program', 'support', 'screen', 'affili', 'assist', 'affili', 'inquiri', 'support', 'pr', 'amp', 'event', 'need', 'help', 'offic', 'administr', 'work', 'file', 'mail', 'prepar', 'meet', 'work', 'develop', 'document', 'bug', 'suggest', 'improv', 'site', 'support', 'market', 'execut', 'staff']


# Step 2: preparing the data

In [34]:
# get the list of biased words or phrases
df_biased_words = pd.read_excel("./bias_words.xlsx")
print(df_biased_words.shape)
print(df_biased_words.head())
print(df_biased_words['Masculine/Feminine Bias'].value_counts())

(159, 2)
  Biased Words or Phrases Masculine/Feminine Bias
0                  active          Masculine Bias
1             adventurous          Masculine Bias
2                 aggress          Masculine Bias
3                 ambitio          Masculine Bias
4                   analy          Masculine Bias
Masculine Bias    95
Feminine Bias     64
Name: Masculine/Feminine Bias, dtype: int64


In [35]:
# exclude generic he/she and dupulicated words in df_biased_words
words_to_exclude = ['she', 'her', 'hers', 'herself', 'he', 'himself', 'him', 'his']
df_biased_words = df_biased_words[~df_biased_words['Biased Words or Phrases'].isin(words_to_exclude)]
df_biased_words = df_biased_words.drop_duplicates()

print(df_biased_words.shape)
print(df_biased_words.isna().any())# check if there's any empty cells
print(df_biased_words.head())
print(df_biased_words['Masculine/Feminine Bias'].value_counts())

(137, 2)
Biased Words or Phrases    False
Masculine/Feminine Bias    False
dtype: bool
  Biased Words or Phrases Masculine/Feminine Bias
0                  active          Masculine Bias
1             adventurous          Masculine Bias
2                 aggress          Masculine Bias
3                 ambitio          Masculine Bias
4                   analy          Masculine Bias
Masculine Bias    81
Feminine Bias     56
Name: Masculine/Feminine Bias, dtype: int64


In [36]:
# get the list of male_words and female words
male_words = (df_biased_words.loc[df_biased_words['Masculine/Feminine Bias'] == 'Masculine Bias'])['Biased Words or Phrases'].values
female_words = (df_biased_words.loc[df_biased_words['Masculine/Feminine Bias'] == 'Feminine Bias'])['Biased Words or Phrases'].values

In [37]:
print(male_words)

['active' 'adventurous' 'aggress' 'ambitio' 'analy' 'assert' 'athlet'
 'autonom' 'boast' 'challeng' 'compet' 'confident' 'courag' 'decide'
 'decisive' 'decision' 'determin' 'dominant' 'domina' 'force' 'greedy'
 'headstrong' 'hierarch' 'hostil' 'implusive' 'independen' 'individual'
 'intellect' 'lead' 'logic' 'masculine' 'objective' 'opinion' 'outspoken'
 'persist' 'principle' 'reckless' 'stubborn' 'superior' 'self-confiden'
 'self-sufficien' 'self-relian' 'manmade' 'chairman' 'son' 'fireman'
 'freshman' 'man' 'mankind' 'manpower' 'boyfriend' 'husband' 'policeman'
 'walter' 'brother' 'spokesman' 'upperclassman' 'gentleman' 'alumnus'
 'alumni' 'man up' 'Mr.' 'man-made' 'the common man' 'mailman' 'steward'
 'actor' 'congressman' 'acts as a leader' 'aggressive' 'ambitious'
 'analytical' 'assertive' 'athletic' 'competitive' 'defends own beliefs'
 'forceful' 'has leadership abilities' 'independent' 'individualistic'
 'makes decisions easily']


In [38]:
print(female_words)

['affectionate' 'child' 'cheer' 'commit' 'communal' 'compassion' 'connect'
 'considerate' 'cooperat' 'depend' 'emotiona' 'empath' 'feminine'
 'flatterable' 'gentle' 'honest' 'interpersonal' 'interdependen'
 'interpersona' 'kind' 'kinship' 'loyal' 'modesty' 'nag' 'nurtur'
 'pleasant' 'polite' 'quiet' 'respon' 'sensitiv' 'submissive' 'support'
 'sympath' 'tender' 'together' 'trust' 'understand' 'warm' 'whin' 'yield'
 'daughter' 'wife' 'girlfriend' 'waitress' 'sister' 'ladies' 'alumna'
 'alumnae' 'hysterical' 'shrill' 'nagging' 'Mrs.' 'Miss.' 'Ms.'
 'stewardess' 'actress']


In [39]:
'''

df_new = pd.DataFrame()

for index, sentence in df_sentences.iterrows():
    temp_sentence = sentence['sentences']
    tokenized_sentence = clean(temp_sentence)
    category = 'neutral'
    word_in_sentence = 'None'
    word = 'None'
    
    # check for male words
    for male_word in male_words:
        if re.search(r"\b{}\b".format(male_word), temp_sentence.lower().strip()):
            category = 'masculine'
            word_in_sentence = male_word
            word = male_word
        else:
            for token in tokenized_sentence:
                if len(male_word) > 3:

                    if simple_clean(male_word) == token[:len(male_word)]:
                        category = 'masculine'
                        word_in_sentence = token
                        word = male_word
                    elif simple_clean(male_word) == token[-len(male_word):]:
                        category = 'masculine'
                        word_in_sentence = token
                        word = male_word
            
    if category == 'masculine':
        dict = {'sentence': temp_sentence,
                'word_in_Sentence': word_in_sentence,
                'biased_term': word,
                'category': category
               }
        df_new = df_new.append(dict, ignore_index = True)
        
    # check for female words
    for female_word in female_words:
        if re.search(r"\b{}\b".format(female_word), temp_sentence.lower().strip()):
            category = 'feminine'
            word_in_sentence = female_word
            word = female_word
        else:
            for token in tokenized_sentence:
                if len(female_word) > 3:
                    if simple_clean(female_word) == token[:len(female_word)]:
                        category = 'feminine'
                        word_in_sentence = token
                        word = female_word
                    elif simple_clean(female_word) == token[-len(female_word):]:
                        category = 'feminine'
                        word_in_sentence = token
                        word = female_word
                    
    if category == 'feminine':
        dict = {'sentence': temp_sentence,
                'word_in_Sentence': word_in_sentence,
                'biased_term': word,
                'category': category
               }
        df_new = df_new.append(dict, ignore_index = True)
    
    if index%10000 == 0:
        print(index)
        
        
        
'''

'\n\ndf_new = pd.DataFrame()\n\nfor index, sentence in df_sentences.iterrows():\n    temp_sentence = sentence[\'sentences\']\n    tokenized_sentence = clean(temp_sentence)\n    category = \'neutral\'\n    word_in_sentence = \'None\'\n    word = \'None\'\n    \n    # check for male words\n    for male_word in male_words:\n        if re.search(r"\x08{}\x08".format(male_word), temp_sentence.lower().strip()):\n            category = \'masculine\'\n            word_in_sentence = male_word\n            word = male_word\n        else:\n            for token in tokenized_sentence:\n                if len(male_word) > 3:\n\n                    if simple_clean(male_word) == token[:len(male_word)]:\n                        category = \'masculine\'\n                        word_in_sentence = token\n                        word = male_word\n                    elif simple_clean(male_word) == token[-len(male_word):]:\n                        category = \'masculine\'\n                        word_in_

In [40]:
# try the sent_tokenize() function
paragraph = "This is the first sentence. This is the second sentence! This is the third sentence? And this is the fourth sentence."
sentences = sent_tokenize(paragraph)
words = word_tokenize(paragraph)
print(sentences)
print(words)

['This is the first sentence.', 'This is the second sentence!', 'This is the third sentence?', 'And this is the fourth sentence.']
['This', 'is', 'the', 'first', 'sentence', '.', 'This', 'is', 'the', 'second', 'sentence', '!', 'This', 'is', 'the', 'third', 'sentence', '?', 'And', 'this', 'is', 'the', 'fourth', 'sentence', '.']
