## Import Modules

In [23]:
import pandas as pd
import numpy as np 
from gensim import models
from lib import weat
import os
import scipy
import matplotlib.pyplot as plt
import importlib

## Load Word2Vec and Glove Models

In [2]:
# import and load word2vec model using Google News data
def loadWord2VecModel(file):
    print("Loading word2vec model...")
    w = models.KeyedVectors.load_word2vec_format(file, binary=True)
    print("Finished.")
    return w

In [3]:
# import and load glove model
def loadGloveModel(file):
    print("Loading glove model...")
    f = open(file,'r', encoding='utf-8')
    gloveModel = {}
    for line in f:
        splitLines = line.split(' ')
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [None]:
# load word2vec model
word2vec = loadWord2VecModel('data/GoogleNews-vectors-negative300.bin')

In [4]:
# load Glove model
glove = loadGloveModel('data/glove.840B.300d.txt')

Loading glove model...
2196017  words loaded!


In [5]:
def get_tar_att_arrays(model, t1, t2, a1, a2):
    tar1 = np.array([model[vector] for vector in t1])
    tar2 = np.array([model[vector] for vector in t2])
    att1 = np.array([model[vector] for vector in a1])
    att2 = np.array([model[vector] for vector in a2])
    
    return tar1, tar2, att1, att2

In [176]:
def get_matrices(filepath, model):

    data_file = pd.read_csv(filepath, sep=',\s*', engine='python',  header=None, index_col=0)
    
    # get targets and attribute labels
    target_names = list(data_file.index)[:2]
    attribute_names = list(data_file.index)[2:]
    
    # get targets and attribute sets
    targets = data_file.loc[target_names]
    attributes = data_file.loc[attribute_names]
    
    # get arrays, one for each set of target and attribute
    tar1 = targets.loc[target_names[0]]
    tar2 = targets.loc[target_names[1]]
    att1 = attributes.loc[attribute_names[0]]
    att2 = attributes.loc[attribute_names[1]]
    
    # remove any NaN values that have been read due to mismatch of columns 
    tar1 = tar1[~pd.isna(tar1)]
    tar2 = tar2[~pd.isna(tar2)]
    att1 = att1[~pd.isna(att1)]
    att2 = att2[~pd.isna(att2)]
        
    # give numpy array of glove word embeddings for targets and attributes
    tar1, tar2, att1, att2 = get_tar_att_arrays(model, tar1, tar2, att1, att2)
    
    return target_names, attribute_names, tar1, tar2, att1, att2

def output_table(model):
    directory = 'targets_attributes_data/'
    
    # instantiate the weta object
    weat_object = weat.Weat()
    effect_size = list()
    p_value = list()
    targets = list()
    attributes = list()
    
    print('Reading files...\n')

    # loop over all files in the directory
    for filename in os.listdir(directory):
        if not filename.startswith('.'):  # ignore hidden files
            filepath = directory + filename
            print (filepath)

            # retrieve target names and attributes to form the table
            target_names, attribute_names, _, _, _, _ = get_matrices(filepath, model)
            targets.append(target_names)
            attributes.append(attribute_names)

            # retrive the word embeddings for the targets and attributes
            _, _, t1, t2, a1, a2 = get_matrices(filepath, model)
            
            # calculate the effect size 
            effect_size.append(weat_object.effect_size(t1, t2, a1, a2))
            
            # calculate the p-value
            p_value.append(weat_object.p_value(t1, t2, a1, a2))
            
            break
        
    # create a daraframe with the targets, attributes, and effect size
    output_df = pd.DataFrame(data = list(zip(targets, attributes, effect_size, p_value)),
                             columns = ['Targets', 'Attributes', 'Effect Size', 'P-Value'])
    return output_df

In [185]:
importlib.reload(weat)

<module 'lib.weat' from '/Users/adimaini/Documents/GW/Machine Learning/Research/CODE/WEAT-WEFAT/lib/weat.py'>

In [186]:
df_glove = output_table(glove)

Reading files...

targets_attributes_data/instruments vs weapons.csv


IndexError: tuple index out of range

In [150]:
df_glove

Unnamed: 0,Targets,Attributes,Effect Size,P-Value
0,"[Instruments, Weapons]","[Pleasant, Unpleasant]",2.167833,0.0


In [112]:
A = np.random.randint(2, size=(25, 300))
B = np.random.randint(2, size=(25, 300))

In [113]:
A

array([[0, 1, 1, ..., 1, 0, 1],
       [1, 1, 1, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 1, 1, ..., 0, 0, 1]])

In [114]:
B

array([[1, 1, 0, ..., 0, 1, 1],
       [0, 1, 0, ..., 1, 0, 1],
       [0, 1, 1, ..., 1, 0, 1],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 1, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 1, 0]])

In [115]:
A.shape

(25, 300)

In [159]:
B.shape

(25, 300)

In [117]:
mag_a = A/np.linalg.norm(A)
mag_b = B/np.linalg.norm(B)

In [168]:
mag_a.shape

(25, 300)

In [169]:
np.sum(np.matmul(mag_a, mag_b.T), axis=1)

array([0.45241789, 0.57356745, 0.47540385, 0.49893067, 0.48486866,
       0.5403054 , 0.51813071, 0.50136447, 0.51028844, 0.40211919,
       0.46729116, 0.50136447, 0.47053624, 0.54544344, 0.45593339,
       0.51867155, 0.404553  , 0.49135882, 0.49622643, 0.52029409,
       0.56275052, 0.48243486, 0.44809112, 0.47864893, 0.49757855])

TypeError: only integer scalar arrays can be converted to a scalar index

In [78]:
df_glove['P-Value'][0]

0.018540048095223367

In [None]:
len(df_glove['P-Value'][0])

In [None]:
df_word2vec

In [None]:
df_word2vec = output_table(word2vec)

In [None]:
_, _, t1, t2, a1, a2 = get_matrices('targets_attributes_data/instruments vs weapons.csv', glove)

In [None]:
attributes = np.concatenate([a1, a2])

In [None]:
# calculate the zscores
z_scores = scipy.stats.zscore(attributes, axis=0)

# calculate the p-values
p_values = scipy.stats.norm.pdf(abs(z_scores))

In [None]:
p_values.max()

In [None]:
attributes

In [None]:
# df_word2vec.to_csv('output/weat_score_word2vec.csv')

In [None]:
import numpy as np

In [None]:
from sympy.utilities.iterables import multiset_partitions

In [None]:
    idx = np.zeros(6)
    idx[:6 // 2] = 1

In [None]:
idx

In [None]:
for i in multiset_partitions(idx):
    print(i)

In [None]:
import itertools 
for permutation in itertools.islice(itertools.permutations(['cat', 'dog', 'pillow']), 0, 2):
    print(np.array(permutation))

In [None]:
2 / 2

In [None]:
test = ['cat', 'dog', 'fat']
test = np.array(test[:-1])
test

In [None]:
if (test.shape[0] % 2 != False):
    print('yay')

In [None]:
test.shape[0] % 2 == 0