# Word Embedding Analysis with scraped data from Indeed Australia

In [None]:
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import nltk
from nltk import word_tokenize 
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

In [None]:
# Load the scraped Indeed data and discard rows that are exact duplicates.
data = pd.read_csv("DF_INDEED_CLEAN.csv", encoding="latin1")
df = pd.DataFrame(data.drop_duplicates())
df.shape

In [None]:
# Tokenise one job description and keep only its informative, lowercased words.
def cleanData(desc):
    """Return the cleaned token list for a single description string.

    A token survives when it is purely alphabetic and longer than two
    characters. Survivors are lowercased and then dropped if they occur in
    the module-level ``stop_words`` or ``stop_words_add`` lists.
    """
    kept = []
    for token in word_tokenize(desc):
        # Length and alphabetic checks run on the raw token, before lowercasing.
        if not token.isalpha() or len(token) <= 2:
            continue
        lowered = token.lower()
        if lowered in stop_words or lowered in stop_words_add:
            continue
        kept.append(lowered)
    return kept

In [None]:
# Stop-word list: NLTK's English defaults followed by our own additions.
# NOTE(review): 'â' looks like a latin1 decoding artefact - confirm.
stop_words_add = ['â']
stop_words = stopwords.words('english') + stop_words_add

# Turn every job description into its list of cleaned tokens.
tags_df = df["Description"].apply(cleanData)

In [None]:
# Hold the cleaned token lists in their own one-column frame named 'des_clean'.
tags = pd.DataFrame({"des_clean": tags_df})
# Attach that column to the original rows by joining on the shared index.
df_tags = df.join(tags, how="outer")

## Creating gender bias dictionary 

In [None]:
# Module-level record of every bias score computed so far (word -> score).
# Each call to calculate_gender_bias_dictionary adds its results here.
unique_words_dic = {}


def calculate_gender_bias_dictionary(df_column, w2vmodel, word1, word2):
    """Score every distinct word by how much closer it is to word1 than word2.

    Args:
        df_column: Iterable of strings (single words or whole texts). The
            strings are joined with spaces and split on spaces again, so
            both formats yield the same set of distinct words.
        w2vmodel: Word-vector model supporting ``word in w2vmodel`` and
            ``w2vmodel.similarity(a, b)`` (e.g. gensim ``KeyedVectors``).
        word1: First gender identifier (e.g. "man").
        word2: Second gender identifier (e.g. "woman").

    Returns:
        A new dict mapping each distinct word to
        ``similarity(word, word1) - similarity(word, word2)``, or to the
        sentinel ``-1000.0`` when the word is not in the model. The same
        entries are also merged into the module-level ``unique_words_dic``.
    """
    distinct_words = set(' '.join(df_column).split(' '))

    # Build a fresh dict so the result only describes this call's words,
    # model and identifiers, not leftovers from earlier calls.
    bias_by_word = {}
    for word in distinct_words:
        # `in` works on both gensim 3.x and 4.x models; the `.vocab`
        # attribute used previously no longer exists in gensim 4.
        if word in w2vmodel:
            male_sim = float(w2vmodel.similarity(word, word1))
            female_sim = float(w2vmodel.similarity(word, word2))
            bias_by_word[word] = float(male_sim - female_sim)
        else:
            bias_by_word[word] = -1000.0  # sentinel: word unknown to the model

    # Keep the module-level dict populated as before.
    unique_words_dic.update(bias_by_word)
    return bias_by_word

In [None]:
# Flatten all cleaned descriptions into one list of distinct tokens, which is
# the input format the bias-dictionary builder is given.
df_kol = list({token for tokens in df_tags.des_clean.tolist() for token in tokens})
df_column = df_kol

In [None]:
# Gender identifiers whose similarity difference defines the bias score.
word1, word2 = "man", "woman"
# Pre-trained Google News word2vec vectors, capped via `limit` at 500000 entries.
w2vmodel = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=500000,
)

In [None]:
# Score every distinct corpus word once: similarity to word1 minus similarity to word2.
gender_bias_dict = calculate_gender_bias_dictionary(df_column, w2vmodel, word1, word2)

In [None]:
# Average the per-word bias scores over one tokenised job description.
def calculate_gender_bias(annonce, gender_bias_dict):
    """Return the mean bias score of the scorable words in one job ad.

    Args:
        annonce: Iterable of word tokens for a single job description.
        gender_bias_dict: Mapping of word -> bias score, where ``-1000.0``
            marks a word that could not be scored.

    Returns:
        The mean score as a float, counting repeated words each time they
        occur. Words marked ``-1000.0`` or absent from the mapping are
        skipped. If no word can be scored (empty description or only
        unknown words) the result is NaN instead of a ZeroDivisionError.
    """
    scores = []
    for word in annonce:
        # Treat a word missing from the mapping like an unscorable one.
        bias = gender_bias_dict.get(word, -1000.0)
        if bias != -1000.0:
            scores.append(bias)
    if not scores:
        return float("nan")
    return float(sum(scores) / len(scores))

In [None]:
# With the model and identifier words fixed, compute the average word score of
# each job description and store it as a new column.
gender_bias = [
    calculate_gender_bias(tokens, gender_bias_dict)
    for tokens in df_tags['des_clean']
]
# Append the scores as a column of the dataframe.
df_tags["man_woman"] = gender_bias
# Save the full dataframe as CSV.
df_tags.to_csv("df_tags_28dec.csv")

In [None]:
# Turn the word -> score dictionary into a frame with one row per word
# (the word is the index). Words the model did not know are included with
# their -1000.0 sentinel score.
pd_bias = pd.DataFrame.from_dict(gender_bias_dict, orient='index')
# Save the per-word gender scores as CSV.
pd_bias.to_csv("gender_bias_dict.csv")

# Normalizing salary rates 

In [None]:
# Text fragments stripped from a salary string, in the order they are removed.
_SALARY_NOISE = ("$", "a month", "a year", ",", "an hour", "a week")

# (keyword, multiplier) pairs used to convert a pay period to a yearly amount.
# NOTE(review): 1710 and 45 are presumably working hours / working weeks per
# year - confirm these assumptions.
_PERIOD_FACTORS = (("month", 12), ("hour", 1710), ("week", 45))


def _strip_salary_text(text):
    """Remove currency symbols, thousands separators and period wording."""
    for fragment in _SALARY_NOISE:
        text = text.replace(fragment, "")
    return text


def _annualise(value, factor):
    """Return ``float(value) * factor``, or NaN when value is not numeric."""
    try:
        return float(value) * factor
    except ValueError:
        return float("nan")


def format_salary(row):
    """Add ``salary_min`` and ``salary_max`` to a row based on ``row["Salary"]``.

    A salary containing "-" is treated as a range: the text before the first
    "-" is the minimum and the text between the first and second "-" is the
    maximum. Otherwise both bounds are the whole string. Both bounds are
    stripped of "$", "," and the period wording.

    When the salary mentions "month", "hour" or "week", the bounds are
    converted to floats and multiplied up to a yearly figure. A bound that is
    not numeric after stripping (e.g. "Up to 25") becomes NaN rather than
    raising, matching the ``errors='coerce'`` treatment that yearly salaries
    receive later. Yearly / unlabelled salaries are left as the stripped
    string.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a string "Salary".

    Returns:
        The same row object with ``salary_min`` and ``salary_max`` set.
    """
    salary = row["Salary"]
    if "-" in salary:
        parts = salary.split("-")
        low, high = parts[0], parts[1]
    else:
        low = high = salary

    row["salary_min"] = _strip_salary_text(low)
    row["salary_max"] = _strip_salary_text(high)

    for keyword, factor in _PERIOD_FACTORS:
        if keyword in salary:
            row["salary_min"] = _annualise(row["salary_min"], factor)
            row["salary_max"] = _annualise(row["salary_max"], factor)

    return row

# Keep only rows that state a salary and have no missing values, then derive
# the min/max salary columns row by row.
# NOTE(review): this starts from `data`, not the de-duplicated `df` - confirm
# that is intended.
has_salary = data["Salary"] != "None"
df_salary = data[has_salary].dropna().apply(format_salary, axis=1)

# Force both bounds to numbers; anything unparseable becomes NaN.
for bound in ("salary_min", "salary_max"):
    df_salary[bound] = pd.to_numeric(df_salary[bound], errors='coerce')

# Midpoint of the range.
df_salary["salary_mean"] = (df_salary["salary_min"] + df_salary["salary_max"]) / 2

df_salary.to_csv("df_salary27dec.csv")

In [None]:
# Analogy probe: subtract the "man" vector from "computer_programmer", add
# "woman", and list the words whose vectors are most similar to the result.
# Uses `w2vmodel`, the model loaded earlier in this notebook; the previous
# name `model_w2v` is not defined anywhere in the visible code.
vec = w2vmodel["computer_programmer"] - w2vmodel["man"] + w2vmodel["woman"]
w2vmodel.most_similar([vec])