# Importing libraries

In [25]:
import pdftotext
import pandas as pd
import string
import re
import os
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import gensim
import numpy as np
from sklearn import metrics

# Load text from documents

In [26]:
CURRENT_PATH = os.getcwd()

# Load text from documents
def loadTextFromFile(directory, filenames, docs):
    trainingDirectory = CURRENT_PATH + "/" + directory
    for filename in os.listdir(trainingDirectory):
        filenames.append(int(filename[:-4]))  # Removes the .txt from the filename
        with open(trainingDirectory + "/" + filename, "r") as file:
            text = file.read()
        docs.append(text)


diversityInclusion = "diversityInclusion"
diversityInclusionFilenames = []
diversityInclusionDocs = []
diversityInclusionTestFilenames = []
diversityInclusionTestDocs = []
loadTextFromFile(
    diversityInclusion + "/" + "training",
    diversityInclusionFilenames,
    diversityInclusionDocs,
)
loadTextFromFile(
    diversityInclusion + "/" + "test",
    diversityInclusionTestFilenames,
    diversityInclusionTestDocs,
)


encourageGenders = "encourageGenders"
encourageGendersFilenames = []
encourageGendersDocs = []
encourageGendersTestFilenames = []
encourageGendersTestDocs = []
loadTextFromFile(
    encourageGenders + "/" + "training",
    encourageGendersFilenames,
    encourageGendersDocs,
)
loadTextFromFile(
    encourageGenders + "/" + "test",
    encourageGendersTestFilenames,
    encourageGendersTestDocs,
)

mentionOrgFeatures = "mentionOrgFeatures"
mentionOrgFeaturesFilenames = []
mentionOrgFeaturesDocs = []
mentionOrgFeaturesTestFilenames = []
mentionOrgFeaturesTestDocs = []
loadTextFromFile(
    mentionOrgFeatures + "/" + "training",
    mentionOrgFeaturesFilenames,
    mentionOrgFeaturesDocs,
)
loadTextFromFile(
    mentionOrgFeatures + "/" + "test",
    mentionOrgFeaturesTestFilenames,
    mentionOrgFeaturesTestDocs,
)

# Preparing a dataframe

In [27]:
# Builds a table with two columns: filename ('id'), and the text from the file (the job ad)

diversityInclusionData = pd.DataFrame()
diversityInclusionData["id"] = diversityInclusionFilenames
diversityInclusionData["text"] = diversityInclusionDocs
diversityInclusionTestData = pd.DataFrame()
diversityInclusionTestData["id"] = diversityInclusionTestFilenames
diversityInclusionTestData["text"] = diversityInclusionTestDocs

encourageGendersData = pd.DataFrame()
encourageGendersData["id"] = encourageGendersFilenames
encourageGendersData["text"] = encourageGendersDocs
encourageGendersTestData = pd.DataFrame()
encourageGendersTestData["id"] = encourageGendersTestFilenames
encourageGendersTestData["text"] = encourageGendersTestDocs

mentionOrgFeaturesData = pd.DataFrame()
mentionOrgFeaturesData["id"] = mentionOrgFeaturesFilenames
mentionOrgFeaturesData["text"] = mentionOrgFeaturesDocs
mentionOrgFeaturesTestData = pd.DataFrame()
mentionOrgFeaturesTestData["id"] = mentionOrgFeaturesTestFilenames
mentionOrgFeaturesTestData["text"] = mentionOrgFeaturesTestDocs

# Cleaning text

In [28]:
def clean_text(text):
    # Remove punctutation
    text = re.sub("[^a-zA-Z]", " ", text)
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Convert to lower
    text = text.lower()
    # Remove whitespaces
    text = " ".join(text.split())
    return text


diversityInclusionData["text"] = diversityInclusionData["text"].apply(
    lambda x: clean_text(x)
)
diversityInclusionTestData["text"] = diversityInclusionTestData["text"].apply(
    lambda x: clean_text(x)
)

encourageGendersTestData["text"] = encourageGendersData["text"].apply(
    lambda x: clean_text(x)
)
encourageGendersTestData["text"] = encourageGendersTestData["text"].apply(
    lambda x: clean_text(x)
)

mentionOrgFeaturesData["text"] = mentionOrgFeaturesData["text"].apply(
    lambda x: clean_text(x)
)
mentionOrgFeaturesTestData["text"] = mentionOrgFeaturesTestData["text"].apply(
    lambda x: clean_text(x)
)

# Remove stopwords

In [29]:
stop = set(stopwords.words("english"))

# A function to remove stopwords and short length words (< 2)
def remove_stopwords(text):
    new = []
    for word in text.split():
        if word not in stop and len(word) > 1:
            new.append(word)

    return " ".join(new)


diversityInclusionData["text"] = diversityInclusionData["text"].apply(
    lambda x: remove_stopwords(x)
)
diversityInclusionTestData["text"] = diversityInclusionTestData["text"].apply(
    lambda x: remove_stopwords(x)
)

encourageGendersData["text"] = encourageGendersData["text"].apply(
    lambda x: remove_stopwords(x)
)
encourageGendersTestData["text"] = encourageGendersTestData["text"].apply(
    lambda x: remove_stopwords(x)
)

mentionOrgFeaturesData["text"] = mentionOrgFeaturesData["text"].apply(
    lambda x: remove_stopwords(x)
)
mentionOrgFeaturesTestData["text"] = mentionOrgFeaturesTestData["text"].apply(
    lambda x: remove_stopwords(x)
)

# Reading pre-labeled target classes

In [30]:
diversityInclusionLabels = pd.read_csv(
    CURRENT_PATH + "/labels/diversityInclusion.csv"
)
diversityInclusionData = pd.merge(diversityInclusionData, diversityInclusionLabels)
diversityInclusionTestLabels = pd.read_csv(
    CURRENT_PATH + "/labels/diversityInclusionTest.csv"
)
diversityInclusionTestData = pd.merge(
    diversityInclusionTestData, diversityInclusionTestLabels
)


encourageGendersLabels = pd.read_csv(CURRENT_PATH + "/labels/encourageGenders.csv")
encourageGendersData = pd.merge(encourageGendersData, encourageGendersLabels)
encourageGendersTestLabels = pd.read_csv(
    CURRENT_PATH + "/labels/encourageGendersTest.csv"
)
encourageGendersTestData = pd.merge(
    encourageGendersTestData, encourageGendersTestLabels
)


mentionOrgFeaturesLabels = pd.read_csv(
    CURRENT_PATH + "/labels/mentionOrgFeatures.csv"
)
mentionOrgFeaturesData = pd.merge(mentionOrgFeaturesData, mentionOrgFeaturesLabels)
mentionOrgFeaturesTestLabels = pd.read_csv(
    CURRENT_PATH + "/labels/mentionOrgFeaturesTest.csv"
)
mentionOrgFeaturesTestData = pd.merge(
    mentionOrgFeaturesTestData, mentionOrgFeaturesTestLabels
)

# Using word2vec for manually training embeddings (NOT used, from suvansh's code)

In [31]:
# model = gensim.models.Word2Vec(
#         window = 10,
#         min_count = 2,
#         workers = 4
# )

# model.build_vocab(train['tokens'], progress_per=1000)

# model.epochs

# model.corpus_count

# model.train(train['text'], total_examples=model.corpus_count, epochs=model.epochs)

# model.save("./word-2-vec.model")

# model.wv.most_similar("male")

# Using pre-trained Glove word embeddings

In [32]:
from gensim.scripts.glove2word2vec import glove2word2vec

# need to download the model from https://nlp.stanford.edu/projects/glove/
# then add to directory
glove_path = 'glove.twitter.27B.100d.txt'
word2vec_output_file = 'MY_MODEL'+'.word2vec'

glove2word2vec(glove_path, word2vec_output_file)

from gensim.models import KeyedVectors

# GloVe model
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)


  glove2word2vec(glove_path, word2vec_output_file)


In [33]:
# from gensim.models import KeyedVectors

# # Lexvec model
# model = KeyedVectors.load_word2vec_format('lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors', binary=False)


In [34]:
#Show a word embedding
print('King: ',model.get_vector('king'))

result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

print('Most similar word to King + Woman: ', result)

King:  [-3.7500e-01 -2.7532e-01  1.2489e-01 -9.2143e-02 -4.3104e-01  2.5268e-02
 -4.1867e-02  1.2848e-01 -7.9363e-02 -1.0011e-01  1.4076e-01  1.0922e-01
 -3.4546e+00 -6.9851e-01  6.6580e-01  5.1494e-01  4.5912e-01 -2.1957e-01
  4.4094e-01 -3.0631e-01  1.2293e-01 -9.9830e-02 -2.5755e-01 -6.1872e-01
  1.0613e+00 -9.4278e-01  1.9284e-01 -8.2089e-02  2.7782e-01 -1.8595e-01
  2.9140e-02 -3.0870e-01 -3.9870e-01 -4.3038e-01  3.8403e-01  3.3243e-01
 -1.4446e-01  1.6682e-01  4.2301e-01 -2.6490e-01 -7.8106e-02 -4.6756e-01
 -3.4039e-01 -1.3690e-01  7.0890e-01 -4.8015e-01  8.9183e-02 -2.3709e-01
  7.5124e-01  2.0507e-01 -5.5263e-01 -3.8105e-01 -7.7082e-02  3.6118e-01
 -8.9840e-01 -5.3537e-01  3.3161e-01 -1.3460e-01 -5.7742e-02  1.9428e-01
  1.8008e-01 -4.0697e-01  2.6654e-03 -7.8771e-02 -2.3616e-01 -9.8115e-01
 -1.6823e-01  1.1459e-01 -2.7011e-01 -2.1435e-02  2.3491e-01 -1.1341e+00
 -3.3837e-01  1.6548e-01  5.3073e-01 -3.0098e-01 -3.6769e-01  4.2092e-01
  1.4201e-01  1.7346e-02  7.8406e-01  3.1441

In [35]:
# Find similar words to diversity
model.most_similar('diversity')

[('inclusion', 0.7886667847633362),
 ('sustainability', 0.744009256362915),
 ('equality', 0.7191735506057739),
 ('empowerment', 0.717718780040741),
 ('unity', 0.6994752883911133),
 ('leadership', 0.6993807554244995),
 ('advocacy', 0.696628749370575),
 ('innovation', 0.6965843439102173),
 ('initiative', 0.686913013458252),
 ('environmental', 0.685417115688324)]

# Creating a class for vectorizing the sentences

Source: https://edumunozsala.github.io/BlogEms/jupyter/nlp/classification/embeddings/python/2020/08/15/Intro_NLP_WordEmbeddings_Classification.html

In [36]:
model.vector_size

100

In [37]:
class Word2VecVectorizer:
  def __init__(self, model):
    print("Loading in word vectors...")
    self.word_vectors = model
    print("Finished loading in word vectors")

  def fit(self, data):
    pass

  def transform(self, data):
    # determine the dimensionality of vectors
#     v = self.word_vectors.get_vector('king')
#     self.D = v.shape[0]
    self.D = model.vector_size

    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.split()
      vecs = []
      m = 0
      for word in tokens:
        try:
          # throws KeyError if word not found
          vec = self.word_vectors.get_vector(word)
          vecs.append(vec)
          m += 1
        except KeyError:
          pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X


  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

In [38]:
# Set a word vectorizer
vectorizer = Word2VecVectorizer(model)

Loading in word vectors...
Finished loading in word vectors


# Embedding each document and splitting into train test data

In [39]:

diversityInclusionText = vectorizer.fit_transform(diversityInclusionData["text"])
diversityInclusionLabels = np.array(diversityInclusionData[diversityInclusion])
diversityInclusionTextFeatures = diversityInclusionText


diversityInclusionTestText = vectorizer.fit_transform(diversityInclusionTestData["text"])
diversityInclusionTestLabels = np.array(diversityInclusionTestData[diversityInclusion])
diversityInclusionTestTextFeatures = diversityInclusionTestText


encourageGendersText = vectorizer.fit_transform(encourageGendersData["text"])
encourageGendersLabels = np.array(encourageGendersData[encourageGenders])
encourageGendersTextFeatures = encourageGendersText #do we need to make it into an array from numpy

encourageGendersTestText = vectorizer.fit_transform(
    encourageGendersTestData["text"]
)
encourageGendersTestLabels = np.array(encourageGendersTestData[encourageGenders])
encourageGendersTestTextFeatures = encourageGendersTestText


mentionOrgFeaturesText = vectorizer.fit_transform(
    mentionOrgFeaturesData["text"]
)
mentionOrgFeaturesLabels = np.array(mentionOrgFeaturesData[mentionOrgFeatures])
mentionOrgFeaturesTextFeatures = mentionOrgFeaturesText

mentionOrgFeaturesTestText = vectorizer.fit_transform(
    mentionOrgFeaturesTestData["text"]
)
mentionOrgFeaturesTestLabels = np.array(mentionOrgFeaturesTestData[mentionOrgFeatures])
mentionOrgFeaturesTestTextFeatures = mentionOrgFeaturesTestText



trainingData = [
    [diversityInclusionTextFeatures, diversityInclusionLabels],
    [encourageGendersTextFeatures, encourageGendersLabels],
    [mentionOrgFeaturesTextFeatures, mentionOrgFeaturesLabels],
]
print(trainingData)

Numer of samples with no words found: 0 / 7
Numer of samples with no words found: 0 / 3
Numer of samples with no words found: 0 / 5
Numer of samples with no words found: 0 / 4
Numer of samples with no words found: 0 / 8
Numer of samples with no words found: 0 / 4
[[array([[ 2.06775069e-02, -8.88405293e-02,  4.78706211e-02,
         2.26312742e-01,  2.44121984e-01,  2.83307433e-01,
         1.01545908e-01, -2.69428372e-01, -1.17022440e-01,
        -3.99107218e-01,  1.11336902e-01, -6.40310466e-01,
        -2.44136882e+00,  2.93573380e-01, -2.83161923e-02,
        -4.26626354e-02, -2.37588555e-01, -3.11152525e-02,
        -2.05242828e-01, -1.78342953e-01,  4.62761462e-01,
         2.81835765e-01,  5.73294982e-02,  1.37503639e-01,
        -3.46926093e-01,  9.49593902e-01,  2.74707619e-02,
         5.04210114e-01, -6.43857867e-02,  4.59671281e-02,
         1.34607792e-01, -9.36821923e-02, -5.01914434e-02,
        -4.27486263e-02,  1.99345365e-01, -6.24155626e-02,
         7.71201253e-02,  

# Creating and evluating a machine learning model 

In [40]:

models = []

for i in range(len(trainingData)):
    rf = RandomForestClassifier(n_estimators=200)
    models.append(rf.fit(trainingData[i][0], trainingData[i][1])) #words[i][0], labels[i][1]

predictions = []
predictions.append(models[0].predict(diversityInclusionTestTextFeatures))
predictions.append(
    models[1].predict(encourageGendersTestTextFeatures)
)  # Encourage both genders
predictions.append(
    models[2].predict(mentionOrgFeaturesTestTextFeatures)
)  # Mention of work features

# Evaluating precisions
print("\t Classification report for", diversityInclusion, "\n")
print(
    metrics.classification_report(
        diversityInclusionTestLabels, predictions[0], digits=5
    )
)
print("-------------------------------------------------------------------------")

print("\t Classification report for", encourageGenders, "\n")
print(
    metrics.classification_report(encourageGendersTestLabels, predictions[1], digits=5)
)
print("-------------------------------------------------------------------------")

print("\t Classification report for", mentionOrgFeatures, "\n")
print(
    metrics.classification_report(
        mentionOrgFeaturesTestLabels, predictions[2], digits=5
    )
)

	 Classification report for diversityInclusion 

              precision    recall  f1-score   support

           0    1.00000   0.50000   0.66667         2
           1    0.50000   1.00000   0.66667         1

    accuracy                        0.66667         3
   macro avg    0.75000   0.75000   0.66667         3
weighted avg    0.83333   0.66667   0.66667         3

-------------------------------------------------------------------------
	 Classification report for encourageGenders 

              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000       3.0
           1    0.00000   0.00000   0.00000       1.0

    accuracy                        0.00000       4.0
   macro avg    0.00000   0.00000   0.00000       4.0
weighted avg    0.00000   0.00000   0.00000       4.0

-------------------------------------------------------------------------
	 Classification report for mentionOrgFeatures 

              precision    recall  f1-score   support