In [1]:
import ast
import os

import gensim
import numpy as np
import pandas as pd
import torch
from gensim.scripts.glove2word2vec import glove2word2vec

## NOTE: this notebook was run in Google Colab.

In [2]:
# Read the train and test splits; both paths are relative to the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/train_data.csv", "../Data/test_data.csv")
)

In [3]:
def _as_tokens(text):
    """Coerce one cell value into a sequence of word tokens."""
    # Missing cells come back from pandas as None or float NaN.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return []
    if isinstance(text, str):
        stripped = text.strip()
        # A token list round-tripped through CSV is stored as its repr, e.g. "['a', 'b']".
        if stripped.startswith("[") and stripped.endswith("]"):
            try:
                parsed = ast.literal_eval(stripped)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return [str(token) for token in parsed]
        # Otherwise treat the string as whitespace-separated words.
        return stripped.split()
    return text


# helper that computes the embedding vector of each text
def get_embeddings(vectors, text, k, generate_missing=False):
    """Return the mean word vector of ``text``, using the first ``k`` components.

    Parameters
    ----------
    vectors : mapping-like
        Object supporting ``word in vectors`` and ``vectors[word]`` (for
        example the gensim ``KeyedVectors`` loaded in this notebook).
    text : sequence of str, str, None or NaN
        Tokens of one document. A string is split into tokens first: a list
        literal such as ``"['a', 'b']"`` is parsed, anything else is split on
        whitespace. Iterating the raw string would look up single characters
        instead of words. ``None`` and NaN are treated as empty text.
    k : int
        Number of leading vector components to keep. It should not exceed the
        length of the stored vectors, because shorter vectors are not padded.
    generate_missing : bool, default False
        If True, unseen words get a random vector from ``np.random.rand(k)``;
        otherwise they contribute a zero vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k,)``; all zeros when the text has no tokens.
    """
    tokens = _as_tokens(text)
    # dealing with empty text
    if len(tokens) < 1:
        return np.zeros(k)
    # unseen words: random vector if generate_missing is True, zero vector otherwise
    make_missing = np.random.rand if generate_missing else np.zeros
    vectorized = [
        vectors[word][:k] if word in vectors else make_missing(k)
        for word in tokens
    ]
    # each text is represented by averaging the vectors of its constituent words
    return np.mean(vectorized, axis=0)

In [4]:
def check_corr(dataframe, labels=None, n_features=200, threshold=0.2):
    """Select embedding features that correlate with the fraud label.

    For each of the first ``n_features`` feature columns of ``dataframe``,
    compute ``Series.corr`` against the ``fraudulent`` label and against its
    complement ``none`` (1 where ``fraudulent == 0``, else 0).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Embedding features whose columns are labelled ``0 .. n_features-1``,
        either as integers (frames built with ``pd.DataFrame(list)``) or as
        strings. Rows are matched to ``labels`` by index.
    labels : pandas.DataFrame, optional
        Frame holding a ``fraudulent`` column. Defaults to the notebook-level
        ``train_df``.
    n_features : int, default 200
        Number of feature columns to examine.
    threshold : float, default 0.2
        A feature is kept when the absolute value of its correlation with
        ``fraudulent`` is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        One row per selected feature, indexed by integer feature number, with
        columns ``fraudulent`` and ``none``.

    Neither ``dataframe`` nor ``labels`` is modified.
    """
    if labels is None:
        labels = train_df  # notebook-level training frame
    fraudulent = labels['fraudulent']
    # Build the targets locally instead of writing label columns into the inputs.
    targets = {
        'fraudulent': fraudulent,
        'none': (fraudulent == 0).astype(int),
    }
    rows = []
    for feature in range(n_features):
        # In-memory frames use integer labels; string labels are accepted too.
        column = str(feature) if str(feature) in dataframe.columns else feature
        values = dataframe[column]
        rows.append({name: values.corr(target) for name, target in targets.items()})
    correlations = pd.DataFrame(rows, index=list(range(n_features)), columns=list(targets))
    return correlations[correlations['fraudulent'].abs() > threshold]

In [None]:
# Convert the GloVe text file to word2vec format so gensim can load it.
# Per the file name these are 300-dimensional vectors (not 200d).
glove_input_file = "../glove.42B.300d.txt"
word2vec_output_file = "../glove.42B.300d.txt.word2vec"
# The conversion rewrites the whole embedding file, so skip it when the output
# already exists. Delete the output file to force a fresh conversion.
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
# Load the converted GloVe vectors as gensim KeyedVectors.
# The file is text (binary=False) and `limit` is passed through to gensim as-is.
glove_model = gensim.models.KeyedVectors.load_word2vec_format(
    "../glove.42B.300d.txt.word2vec",
    binary=False,
    limit=1190000,
)

In [None]:
# Embed every tokenized text column of the training set.
# get_embeddings keeps the first 200 components of each word vector and
# averages them per row; each result frame has one column per component.
# side note: map function outperforms for loop
cols_to_run = ["tokenized_company_profile", "tokenized_description", "tokenized_requirements", "tokenized_benefits"]

# One pipeline for all columns instead of four copy-pasted ones.
train_embeddings_glove = {
    col: pd.DataFrame(
        train_df[col].map(lambda tokens: get_embeddings(glove_model, tokens, 200)).tolist()
    )
    for col in cols_to_run
}

# Per-section names used by the later cells.
req_train_embeddings_glove = train_embeddings_glove["tokenized_requirements"]
prof_train_embeddings_glove = train_embeddings_glove["tokenized_company_profile"]
desc_train_embeddings_glove = train_embeddings_glove["tokenized_description"]
ben_train_embeddings_glove = train_embeddings_glove["tokenized_benefits"]

In [None]:
# Embed the same tokenized text columns for the test set, with the same
# settings as the training set (first 200 components, averaged per row).
# `cols_to_run` is the column list defined in the training-embedding cell.
test_embeddings_glove = {
    col: pd.DataFrame(
        test_df[col].map(lambda tokens: get_embeddings(glove_model, tokens, 200)).tolist()
    )
    for col in cols_to_run
}

# Per-section names used by the later cells.
req_test_embeddings_glove = test_embeddings_glove["tokenized_requirements"]
prof_test_embeddings_glove = test_embeddings_glove["tokenized_company_profile"]
desc_test_embeddings_glove = test_embeddings_glove["tokenized_description"]
ben_test_embeddings_glove = test_embeddings_glove["tokenized_benefits"]

In [None]:
# Correlation of every embedding component with the fraud label and its complement.
# NOTE(review): this cell used an `embeddings_glove` that no cell defines, and a
# train_df['none'] column that does not exist yet when run top to bottom.
# The description embeddings are a stand-in; confirm which section was intended.
# Work on a copy so the frame exported to CSV does not gain label columns.
embeddings_glove = desc_train_embeddings_glove.copy()
embeddings_glove['fraudulent'] = train_df['fraudulent']
embeddings_glove['none'] = (train_df['fraudulent'] == 0).astype(int)
columns = ['fraudulent', 'none']
features_glove = list(range(200))
rows_glove = [{c: embeddings_glove[f].corr(embeddings_glove[c]) for c in columns} for f in features_glove]
train_correlations_glove = pd.DataFrame(rows_glove, index=features_glove)
train_correlations_glove

In [None]:
# Write each training embedding frame to its own CSV, without the row index.
for csv_name, embedding_frame in (
    ('req_train_embeddings_glove.csv', req_train_embeddings_glove),
    ('prof_train_embeddings_glove.csv', prof_train_embeddings_glove),
    ('desc_train_embeddings_glove.csv', desc_train_embeddings_glove),
    ('ben_train_embeddings_glove.csv', ben_train_embeddings_glove),
):
    embedding_frame.to_csv(csv_name, index=False)

In [None]:
# Write each test embedding frame to its own CSV, without the row index.
for csv_name, embedding_frame in (
    ('req_test_embeddings_glove.csv', req_test_embeddings_glove),
    ('prof_test_embeddings_glove.csv', prof_test_embeddings_glove),
    ('desc_test_embeddings_glove.csv', desc_test_embeddings_glove),
    ('ben_test_embeddings_glove.csv', ben_test_embeddings_glove),
):
    embedding_frame.to_csv(csv_name, index=False)

In [None]:
# Run the correlation filter on each section's training embeddings, in order.
req_train, prof_train, desc_train, ben_train = map(
    check_corr,
    (
        req_train_embeddings_glove,
        prof_train_embeddings_glove,
        desc_train_embeddings_glove,
        ben_train_embeddings_glove,
    ),
)

In [None]:
# Number of selected components per section, one per line:
# requirements, company profile, description, benefits.
print(*map(len, (req_train, prof_train, desc_train, ben_train)), sep="\n")