# Importing libraries

In [1]:
import pdftotext
import pandas as pd
import string
import re
import os
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import gensim
import numpy as np
from sklearn import metrics

# Load text from documents

In [2]:
CURRENT_PATH = os.getcwd()

# Load text from documents
def loadTextFromFile(directory, filenames, docs):
    trainingDirectory = CURRENT_PATH + "/" + directory
    for filename in os.listdir(trainingDirectory):
        filenames.append(int(filename[:-4]))  # Removes the .txt from the filename
        with open(trainingDirectory + "/" + filename, "r") as file:
            text = file.read()
        docs.append(text)


diversityInclusion = "diversityInclusion"
diversityInclusionFilenames = []
diversityInclusionDocs = []
diversityInclusionTestFilenames = []
diversityInclusionTestDocs = []
loadTextFromFile(
    diversityInclusion + "/" + "training",
    diversityInclusionFilenames,
    diversityInclusionDocs,
)
loadTextFromFile(
    diversityInclusion + "/" + "test",
    diversityInclusionTestFilenames,
    diversityInclusionTestDocs,
)


encourageGenders = "encourageGenders"
encourageGendersFilenames = []
encourageGendersDocs = []
encourageGendersTestFilenames = []
encourageGendersTestDocs = []
loadTextFromFile(
    encourageGenders + "/" + "training",
    encourageGendersFilenames,
    encourageGendersDocs,
)
loadTextFromFile(
    encourageGenders + "/" + "test",
    encourageGendersTestFilenames,
    encourageGendersTestDocs,
)

mentionOrgFeatures = "mentionOrgFeatures"
mentionOrgFeaturesFilenames = []
mentionOrgFeaturesDocs = []
mentionOrgFeaturesTestFilenames = []
mentionOrgFeaturesTestDocs = []
loadTextFromFile(
    mentionOrgFeatures + "/" + "training",
    mentionOrgFeaturesFilenames,
    mentionOrgFeaturesDocs,
)
loadTextFromFile(
    mentionOrgFeatures + "/" + "test",
    mentionOrgFeaturesTestFilenames,
    mentionOrgFeaturesTestDocs,
)

# Preparing a dataframe

In [3]:
# Builds a table with two columns: filename ('id'), and the text from the file (the job ad)

diversityInclusionData = pd.DataFrame()
diversityInclusionData["id"] = diversityInclusionFilenames
diversityInclusionData["text"] = diversityInclusionDocs
diversityInclusionTestData = pd.DataFrame()
diversityInclusionTestData["id"] = diversityInclusionTestFilenames
diversityInclusionTestData["text"] = diversityInclusionTestDocs

encourageGendersData = pd.DataFrame()
encourageGendersData["id"] = encourageGendersFilenames
encourageGendersData["text"] = encourageGendersDocs
encourageGendersTestData = pd.DataFrame()
encourageGendersTestData["id"] = encourageGendersTestFilenames
encourageGendersTestData["text"] = encourageGendersTestDocs

mentionOrgFeaturesData = pd.DataFrame()
mentionOrgFeaturesData["id"] = mentionOrgFeaturesFilenames
mentionOrgFeaturesData["text"] = mentionOrgFeaturesDocs
mentionOrgFeaturesTestData = pd.DataFrame()
mentionOrgFeaturesTestData["id"] = mentionOrgFeaturesTestFilenames
mentionOrgFeaturesTestData["text"] = mentionOrgFeaturesTestDocs

# Cleaning text

In [4]:
def clean_text(text):
    # Remove punctutation
    text = re.sub("[^a-zA-Z]", " ", text)
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Convert to lower
    text = text.lower()
    # Remove whitespaces
    text = " ".join(text.split())
    return text


diversityInclusionData["text"] = diversityInclusionData["text"].apply(
    lambda x: clean_text(x)
)
diversityInclusionTestData["text"] = diversityInclusionTestData["text"].apply(
    lambda x: clean_text(x)
)

encourageGendersTestData["text"] = encourageGendersData["text"].apply(
    lambda x: clean_text(x)
)
encourageGendersTestData["text"] = encourageGendersTestData["text"].apply(
    lambda x: clean_text(x)
)

mentionOrgFeaturesData["text"] = mentionOrgFeaturesData["text"].apply(
    lambda x: clean_text(x)
)
mentionOrgFeaturesTestData["text"] = mentionOrgFeaturesTestData["text"].apply(
    lambda x: clean_text(x)
)

# Remove stopwords

In [5]:
stop = set(stopwords.words("english"))

# A function to remove stopwords and short length words (< 2)
def remove_stopwords(text):
    new = []
    for word in text.split():
        if word not in stop and len(word) > 1:
            new.append(word)

    return " ".join(new)


diversityInclusionData["text"] = diversityInclusionData["text"].apply(
    lambda x: remove_stopwords(x)
)
diversityInclusionTestData["text"] = diversityInclusionTestData["text"].apply(
    lambda x: remove_stopwords(x)
)

encourageGendersData["text"] = encourageGendersData["text"].apply(
    lambda x: remove_stopwords(x)
)
encourageGendersTestData["text"] = encourageGendersTestData["text"].apply(
    lambda x: remove_stopwords(x)
)

mentionOrgFeaturesData["text"] = mentionOrgFeaturesData["text"].apply(
    lambda x: remove_stopwords(x)
)
mentionOrgFeaturesTestData["text"] = mentionOrgFeaturesTestData["text"].apply(
    lambda x: remove_stopwords(x)
)

# Reading pre-labeled target classes

In [6]:
diversityInclusionLabels = pd.read_csv(
    CURRENT_PATH + "/labels/diversityInclusion.csv"
)
diversityInclusionData = pd.merge(diversityInclusionData, diversityInclusionLabels)
diversityInclusionTestLabels = pd.read_csv(
    CURRENT_PATH + "/labels/diversityInclusionTest.csv"
)
diversityInclusionTestData = pd.merge(
    diversityInclusionTestData, diversityInclusionTestLabels
)


encourageGendersLabels = pd.read_csv(CURRENT_PATH + "/labels/encourageGenders.csv")
encourageGendersData = pd.merge(encourageGendersData, encourageGendersLabels)
encourageGendersTestLabels = pd.read_csv(
    CURRENT_PATH + "/labels/encourageGendersTest.csv"
)
encourageGendersTestData = pd.merge(
    encourageGendersTestData, encourageGendersTestLabels
)


mentionOrgFeaturesLabels = pd.read_csv(
    CURRENT_PATH + "/labels/mentionOrgFeatures.csv"
)
mentionOrgFeaturesData = pd.merge(mentionOrgFeaturesData, mentionOrgFeaturesLabels)
mentionOrgFeaturesTestLabels = pd.read_csv(
    CURRENT_PATH + "/labels/mentionOrgFeaturesTest.csv"
)
mentionOrgFeaturesTestData = pd.merge(
    mentionOrgFeaturesTestData, mentionOrgFeaturesTestLabels
)

# Using word2vec for manually training embeddings

In [7]:
# model = gensim.models.Word2Vec(
#         window = 10,
#         min_count = 2,
#         workers = 4
# )

# model.build_vocab(train['tokens'], progress_per=1000)

# model.epochs

# model.corpus_count

# model.train(train['text'], total_examples=model.corpus_count, epochs=model.epochs)

# model.save("./word-2-vec.model")

# model.wv.most_similar("male")

# Using pre-trained Glove word embeddings

In [8]:
from gensim.scripts.glove2word2vec import glove2word2vec

# need to download the model from https://nlp.stanford.edu/projects/glove/
# then add to directory
glove_path = 'glove.twitter.27B.100d.txt'
word2vec_output_file = 'MY_MODEL'+'.word2vec'

glove2word2vec(glove_path, word2vec_output_file)

from gensim.models import KeyedVectors

GloVe model
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)


In [9]:
# from gensim.models import KeyedVectors

# # Lexvec model
# model = KeyedVectors.load_word2vec_format('lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors', binary=False)


In [10]:
#Show a word embedding
print('King: ',model.get_vector('king'))

result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

print('Most similar word to King + Woman: ', result)

King:  [-0.099438 -0.03811  -0.02814  -0.086827 -0.058872 -0.074074 -0.089723
  0.007647 -0.079409 -0.077722  0.012145 -0.087492  0.125281  0.077101
 -0.017215  0.011123  0.011952 -0.002687 -0.119687  0.039859  0.037061
 -0.070333  0.015584 -0.232619 -0.076024 -0.092917 -0.015832  0.024597
  0.022748 -0.081143 -0.002227  0.061346  0.185452  0.161941 -0.021561
  0.006761  0.012353 -0.044067  0.057281 -0.12141  -0.114887  0.096228
  0.107898 -0.173222  0.014622  0.023282  0.026272 -0.203771  0.113475
  0.020428  0.05133   0.032807  0.042183 -0.061823  0.072173  0.065524
 -0.212801  0.071362 -0.080849 -0.019219 -0.018223 -0.255929  0.106209
 -0.002017  0.083529  0.109361 -0.121694  0.097796  0.112664  0.069995
  0.093611 -0.048229  0.094784 -0.035936 -0.063447 -0.056852  0.080355
 -0.013907 -0.102584  0.033997 -0.072398 -0.046312  0.126431 -0.010575
 -0.144504  0.159152 -0.024255 -0.002653 -0.074009  0.009565  0.117276
 -0.007985  0.108225  0.141307  0.12105   0.002244 -0.109612  0.04216


In [11]:
# Find similar words to diversity
model.most_similar('diversity')

[('richness', 0.6089744567871094),
 ('Diversity', 0.5902432799339294),
 ('diverse', 0.5788357853889465),
 ('inclusiveness', 0.5460476279258728),
 ('uniqueness', 0.5407953262329102),
 ('pluralism', 0.5313884019851685),
 ("diversity's", 0.4992821514606476),
 ('diversities', 0.49747776985168457),
 ("'diversity", 0.4915635585784912),
 ('heritage', 0.48739439249038696)]

# Creating a class for vectorizing the sentences

Source: https://edumunozsala.github.io/BlogEms/jupyter/nlp/classification/embeddings/python/2020/08/15/Intro_NLP_WordEmbeddings_Classification.html

In [12]:
model.vector_size

300

In [13]:
class Word2VecVectorizer:
  def __init__(self, model):
    print("Loading in word vectors...")
    self.word_vectors = model
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of vectors
#     v = self.word_vectors.get_vector('king')
#     self.D = v.shape[0]
    self.D = model.vector_size

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      m = 0
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
          m += 1
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

In [14]:
# Set a word vectorizer
vectorizer = Word2VecVectorizer(model)

Loading in word vectors...
Finished loading in word vectors


# Embedding each document and splitting into train test data

In [15]:

diversityInclusionText = vectorizer.fit_transform(diversityInclusionData["text"])
diversityInclusionLabels = np.array(diversityInclusionData[diversityInclusion])
diversityInclusionTextFeatures = diversityInclusionText


diversityInclusionTestText = vectorizer.fit_transform(diversityInclusionTestData["text"])
diversityInclusionTestLabels = np.array(diversityInclusionTestData[diversityInclusion])
diversityInclusionTestTextFeatures = diversityInclusionTestText


encourageGendersText = vectorizer.fit_transform(encourageGendersData["text"])
encourageGendersLabels = np.array(encourageGendersData[encourageGenders])
encourageGendersTextFeatures = encourageGendersText #do we need to make it into an array from numpy

encourageGendersTestText = vectorizer.fit_transform(
    encourageGendersTestData["text"]
)
encourageGendersTestLabels = np.array(encourageGendersTestData[encourageGenders])
encourageGendersTestTextFeatures = encourageGendersTestText


mentionOrgFeaturesText = vectorizer.fit_transform(
    mentionOrgFeaturesData["text"]
)
mentionOrgFeaturesLabels = np.array(mentionOrgFeaturesData[mentionOrgFeatures])
mentionOrgFeaturesTextFeatures = mentionOrgFeaturesText

mentionOrgFeaturesTestText = vectorizer.fit_transform(
    mentionOrgFeaturesTestData["text"]
)
mentionOrgFeaturesTestLabels = np.array(mentionOrgFeaturesTestData[mentionOrgFeatures])
mentionOrgFeaturesTestTextFeatures = mentionOrgFeaturesTestText



trainingData = [
    [diversityInclusionTextFeatures, diversityInclusionLabels],
    [encourageGendersTextFeatures, encourageGendersLabels],
    [mentionOrgFeaturesTextFeatures, mentionOrgFeaturesLabels],
]
print(trainingData)

Numer of samples with no words found: 0 / 7
Numer of samples with no words found: 0 / 3
Numer of samples with no words found: 0 / 5
Numer of samples with no words found: 0 / 4
Numer of samples with no words found: 0 / 8
Numer of samples with no words found: 0 / 4
[[array([[-0.00804881,  0.062328  , -0.0291645 , ..., -0.03117794,
        -0.00888256,  0.06725182],
       [-0.00181986,  0.05755581, -0.01650376, ..., -0.039441  ,
         0.00648553,  0.03903401],
       [-0.00280441,  0.05537559, -0.01449772, ..., -0.03154269,
        -0.01114962,  0.04632628],
       ...,
       [ 0.016755  ,  0.04926629, -0.04735929, ..., -0.03480594,
        -0.04079629,  0.05039847],
       [-0.01481105,  0.03521071, -0.02987215, ..., -0.02515395,
         0.00897514,  0.05121523],
       [-0.00270411,  0.050753  , -0.01389973, ..., -0.02832008,
         0.00645187,  0.01817235]]), array([1, 0, 1, 1, 1, 1, 1])], [array([[ 0.01952   ,  0.0242138 , -0.030083  , ..., -0.0551442 ,
        -0.0313702 ,  0

# Creating and evluating a machine learning model 

In [16]:

models = []

for i in range(len(trainingData)):
    rf = RandomForestClassifier(n_estimators=200)
    models.append(rf.fit(trainingData[i][0], trainingData[i][1])) #words[i][0], labels[i][1]

predictions = []
predictions.append(models[0].predict(diversityInclusionTestTextFeatures))
predictions.append(
    models[1].predict(encourageGendersTestTextFeatures)
)  # Encourage both genders
predictions.append(
    models[2].predict(mentionOrgFeaturesTestTextFeatures)
)  # Mention of work features

# Evaluating precisions
print("\t Classification report for", diversityInclusion, "\n")
print(
    metrics.classification_report(
        diversityInclusionTestLabels, predictions[0], digits=5
    )
)
print("-------------------------------------------------------------------------")

print("\t Classification report for", encourageGenders, "\n")
print(
    metrics.classification_report(encourageGendersTestLabels, predictions[1], digits=5)
)
print("-------------------------------------------------------------------------")

print("\t Classification report for", mentionOrgFeatures, "\n")
print(
    metrics.classification_report(
        mentionOrgFeaturesTestLabels, predictions[2], digits=5
    )
)

	 Classification report for diversityInclusion 

              precision    recall  f1-score   support

           0    1.00000   0.50000   0.66667         2
           1    0.50000   1.00000   0.66667         1

    accuracy                        0.66667         3
   macro avg    0.75000   0.75000   0.66667         3
weighted avg    0.83333   0.66667   0.66667         3

-------------------------------------------------------------------------
	 Classification report for encourageGenders 

              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         3
           1    0.25000   1.00000   0.40000         1

    accuracy                        0.25000         4
   macro avg    0.12500   0.50000   0.20000         4
weighted avg    0.06250   0.25000   0.10000         4

-------------------------------------------------------------------------
	 Classification report for mentionOrgFeatures 

              precision    recall  f1-score   support

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
