## Data Loading

In [None]:


#data loading: read the transfers dataset into the DataFrame that the preprocessing cells transform
import pandas as pd

#NOTE(review): low_memory=False presumably avoids mixed-dtype inference on large columns - confirm
predata = pd.read_csv("../data/2transfers_balanced_smorerund.csv", low_memory=False)

In [None]:
#list the columns available in the dataset (candidates to be used for training)
predata.columns

In [None]:
#visualize the whole output: never truncate DataFrame columns/rows or NumPy arrays when printing
import sys
import numpy as np

pd.options.display.max_columns = None
pd.options.display.max_rows = None

np.set_printoptions(threshold=sys.maxsize)

## Pre Processing

In [None]:
#begin of preprocessing: wall-clock timestamp, subtracted from the end timestamp once preprocessing is done
import time

start = time.time()

In [None]:
#convert amount and accountbalance to classes and assign a word to each interval  
import numpy as np
#automatic labels
import string


class LabelCategorizer:
    """Produce a deterministic sequence of labels derived from a base word.

    The first label is the base word itself. Each later label is either the
    previous label rotated right by one character or, when no rotations are
    pending, the previous label with the next lowercase ASCII letter appended
    (after which ``len(label) - 1`` rotations become pending).
    """

    def __init__(self, base_word='cat'):
        self.base_word = base_word
        self.current_word = base_word
        self.initial = 1  # truthy until the first label has been handed out
        self.shift = 0  # rotations still pending for the current word
        self._alphabet_index = 0  # index of the next letter to append

    def __str__(self):
        header = 'Class: Label Categorizer\nBase word: ' + self.base_word
        return header + '\nCurrent Word: ' + self.current_word

    def get_next_word(self):
        """Return the next label of the sequence and advance the internal state."""
        if self.initial:
            # the very first call returns the base word unchanged
            self.initial = 0
            return self.current_word

        if self.shift <= 0:
            # no rotation pending: grow the word by one letter (wrapping after 'z')
            letters = string.ascii_lowercase
            self.current_word += letters[self._alphabet_index]
            self._alphabet_index = (self._alphabet_index + 1) % len(letters)
            self.shift = len(self.current_word) - 1
            return self.current_word

        # rotate right: the last character moves to the front
        word = self.current_word
        self.current_word = word[-1] + word[:-1]
        self.shift -= 1
        return self.current_word


#replacement of the old columns with the new ones with classes
def cutter(target_col, number, word, words_map):
    """Replace a numeric column of ``predata`` with word labels for log-spaced intervals.

    ``number`` right-closed intervals are built from a geometric progression
    between the column minimum and maximum (both clamped to at least 1). Each
    interval gets a label generated by ``LabelCategorizer`` from ``word``.
    Values that fall in no interval become NaN in the resulting column.

    Args:
        target_col: name of the column of the global ``predata`` to convert.
        number: number of intervals (and labels) to create.
        word: base word handed to ``LabelCategorizer``.
        words_map: dict updated in place with ``label -> interval``.

    Side effects:
        Prints the intervals, overwrites ``predata[target_col]`` and fills
        ``words_map``.
    """
    #make sure that only positives are assigned an interval
    target_col_min = max(predata[target_col].min(), 1)
    target_col_max = max(predata[target_col].max(), 1)

    edges = np.geomspace(float(target_col_min), float(target_col_max), num=number)
    #intervals are open on the left, so lower the first edge to keep the minimum inside the first interval
    edges[0] -= 1
    #upper bounds: the following edge for each interval, plus one extra interval just above the maximum
    upper_edges = np.append(edges[1:], edges[-1] + 1)

    bins = pd.IntervalIndex.from_tuples(list(zip(edges, upper_edges)))

    #range of the intervals made
    print(bins)

    labeler = LabelCategorizer(base_word=word)
    labels_a = [labeler.get_next_word() for _ in range(number)]

    binned = pd.cut(predata[target_col].to_list(), bins=bins)
    #rename_categories replaces assigning to .categories, which current pandas no longer allows
    predata[target_col] = binned.rename_categories(labels_a)

    for label, interval in zip(labels_a, bins):
        words_map[label] = interval


#columns to apply the conversion
columns = ['amount', 'accountbalance']

#number of intervals for each column
number_bins = [40, 40]

#base words assigned to each column on columns to apply the conversion
base_words = ['pink', 'red']

#gives access to the range of an interval from the word that appears in the data
values_map = {}

#the three lists are parallel: one (column, number of bins, base word) triple per conversion
for column_name, bin_count, base_word in zip(columns, number_bins, base_words):
    cutter(column_name, bin_count, base_word, values_map)

In [None]:
#assign the word negaccount to the accountbalance entries that show up as nan
#(the original note describes these as the negative balances)
aux = predata['accountbalance'].values
vacc = ['negaccount' if str(elm) == 'nan' else elm for elm in aux]

predata['accountbalance'] = vacc

In [None]:
#interval that a word corresponds to
#values_map['red']

In [None]:
#convert hours to classes and assign a word to each interval
bins_hour = [0, 4, 8, 12, 16, 20, 24]

#labels assigned to each interval
labels_hour = ['dawn', 'earlymorning', 'morning', 'afternoon', 'dusk', 'night']

#include_lowest=True makes the first interval contain its lower edge (hour 0)
predata['hour'] = pd.cut(predata['hour'], bins=bins_hour, labels=labels_hour, include_lowest=True).tolist()

In [None]:
#convert trusted_indicator to classes and assign a word to each interval
bins_ti = [0.0, 0.5, 1.0]

#labels assigned to each interval
labels_ti = ['ntrusted', 'trusted']

#include_lowest=True makes the first interval contain its lower edge (0.0)
predata['trusted_indicator'] = pd.cut(
    predata['trusted_indicator'], bins=bins_ti, labels=labels_ti, include_lowest=True).tolist()

In [None]:
#add letter before number to distinguish between similar numbers from different columns
cols = ['entity', 'reference', 'iban_orig', 'iban_dest', 'ipaddress', 'clientid', 'week']

identifier = ['e', 'r', 'io', 'id', 'ip', 'c', 'w']

#cols and identifier are parallel lists: each column gets its own prefix
for column_name, prefix in zip(cols, identifier):
    predata[column_name] = predata[column_name].apply(lambda value: prefix + str(value))

In [None]:
#convert binary and chain of numbers to specific words
def apply_map(df, target_col, target_map):
    """Replace every value of ``df[target_col]`` by its word in ``target_map``.

    Values are looked up by their string form; a value without an entry in
    ``target_map`` becomes ``None``. ``df`` is modified in place and nothing
    is returned.
    """

    def translate(value):
        return target_map.get(str(value))

    df[target_col] = df[target_col].apply(translate)


#one (column, value -> word) pair per column to translate; keys are the string form of the raw values
cols_maps = [
    ('is_fraud', {'0': 'nfraud', '1': 'fraud'}),
    ('weekday', {'0': 'mon', '1': 'tue', '2': 'wed', '3': 'thu', '4': 'fri', '5': 'sat', '6': 'sun'}),
    ('month', {'1': 'jan', '2': 'feb', '3': 'mar', '4': 'apr', '5': 'may', '6': 'jun',
               '7': 'jul', '8': 'aug', '9': 'sep', '10': 'oct', '11': 'nov', '12': 'dec'}),
]

for target_col, target_map in cols_maps:
    apply_map(predata, target_col, target_map)

In [None]:
#make fraud column as the center column
new_order = [
    'canal', 'operativa', 'clientid', 'entity', 'reference', 'trusted_indicator', 'iban_orig', 'iban_dest',
    'amount',
    'is_fraud',
    'accountbalance', 'ipaddress', 'browser_family', 'os_family', 'hour', 'week', 'weekday', 'month',
    'device',
]

#keep only the listed columns, in the listed order
predata = predata.loc[:, new_order]

In [None]:
#select data for train and test  
from sklearn.model_selection import train_test_split

#target column
y = predata['is_fraud']

#train and test: 80/20 split stratified on the target
#predata itself is passed as the features, so X_train and X_test still contain the is_fraud column
#NOTE(review): no random_state is given, so the split presumably changes between runs - confirm if reproducibility matters
X_train, X_test, y_train, y_test = train_test_split(predata, y, stratify=y, test_size=0.2)

In [None]:
#for entering the model: every training row (is_fraud included) becomes one sentence, i.e. a list of words
sentences = X_train.to_numpy()
sentences_aux = list(map(list, sentences))
sentences_series = pd.Series(sentences_aux)

In [None]:
#for the test metrics: same rows as X_test but without the target column
X_test_np = X_test.drop(columns='is_fraud')

sentences_np_test = X_test_np.to_numpy()
sentences_aux_np_test = list(map(list, sentences_np_test))
sentences_series_np_test = pd.Series(sentences_aux_np_test)

In [None]:
#for the train metrics: same rows as X_train but without the target column
X_train_np = X_train.drop(columns='is_fraud')

sentences_np_train = X_train_np.to_numpy()
sentences_aux_np_train = list(map(list, sentences_np_train))
sentences_series_np_train = pd.Series(sentences_aux_np_train)

In [None]:
#get size of the corpus: total number of words over all training sentences
token_count = sum(map(len, sentences_series))
print("This corpus contains {} tokens".format(token_count))

In [None]:
#end of preprocessing: report the elapsed wall-clock time since `start`
stop = time.time()
print(f"Preprocessing time: {stop - start}s")

## Training

In [None]:
#begin of training: wall-clock timestamp, subtracted from the end timestamp when training is reported
begin = time.time()

In [None]:
#callback to print loss after each epoch
import gensim.models.word2vec as w2v
from gensim.models.callbacks import CallbackAny2Vec


class MyGensimCallback(CallbackAny2Vec):
    """Print the loss of every training epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as
    a running total, so the reading of the previous epoch is subtracted to
    obtain the loss of the current one.
    """

    def __init__(self):
        #number of the epoch that is about to be reported
        self.epoch = 0
        #loss reading at the end of the previous epoch
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Print the loss accumulated since the previous call."""
        cumulative_loss = model.get_latest_training_loss()
        epoch_loss = cumulative_loss - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))

        self.loss_to_be_subed = cumulative_loss
        self.epoch += 1

In [None]:
#inicialization and training word2vec - put equal to grid search
import multiprocessing


def training(sentences, cycles, dim, window, sample, negative, exponent, alpha, min_alpha):
    """Create a skip-gram Word2Vec model with negative sampling and train it.

    Args:
        sentences: iterable of sentences (lists of words) used for the vocabulary and the training.
        cycles: number of training epochs.
        dim: dimension of the embedding space.
        window: words before and after the center word.
        sample: subsampling value passed to Word2Vec.
        negative: number of noise words for negative sampling.
        exponent: exponent used to shape the negative sampling distribution.
        alpha: initial learning rate.
        min_alpha: final learning rate.

    Returns:
        The trained Word2Vec model. The loss of each epoch is printed by ``MyGensimCallback``.
    """
    hyperparameters = dict(
        sg=1,  #skip-gram - fixed
        workers=multiprocessing.cpu_count(),  #use all cores - fixed
        min_count=1,  #use every word - fixed
        hs=0,  #negative sampling instead of hierarchical softmax - fixed
        vector_size=dim,
        window=window,
        sample=sample,
        negative=negative,
        ns_exponent=exponent,
        alpha=alpha,
        min_alpha=min_alpha,
    )
    model = w2v.Word2Vec(**hyperparameters)

    #vocabulary creation
    model.build_vocab(sentences)

    #model training
    model.train(
        sentences,
        total_examples=model.corpus_count,
        epochs=cycles,
        compute_loss=True,
        callbacks=[MyGensimCallback()],
    )

    return model


#model creation
model = training(
    sentences_series,
    cycles=5,
    dim=5,
    window=9,
    sample=0,
    negative=5,
    exponent=0.75,
    alpha=0.025,
    min_alpha=0.0001,
)

In [None]:
#print the summary of the trained model
print(model)

In [None]:
#update z - dictionary that saves the word and its position
def refresh_z(model, k, missingw):
    """Store the vocabulary position of ``missingw`` in ``k`` and return ``k``.

    The vocabulary (``model.wv.index_to_key``) is scanned from the last entry
    down to the first one, index 0 included. ``k`` is updated in place and is
    returned unchanged when the word is not in the vocabulary.

    Args:
        model: object exposing the vocabulary as ``model.wv.index_to_key``.
        k: dict mapping words to vocabulary positions.
        missingw: word whose position must be recorded.

    Returns:
        The same dict ``k``.
    """
    vocabulary = model.wv.index_to_key

    #looks for the missing word starting from the bottom; the stop value -1 makes position 0 part of the scan
    for position in range(len(vocabulary) - 1, -1, -1):
        if vocabulary[position] == missingw:
            #assigns the new position of the missing word in the vocabulary
            k[missingw] = position
            break

    return k

In [None]:
#update probv - probability vector over the vocabulary for one clientid
def refresh_probv(model, cid):
    """Return the softmax of the clientid vector multiplied by the output matrix.

    Args:
        model: object exposing ``wv.get_vector`` and the output matrix ``syn1neg``.
        cid: clientid word whose vector is used.

    Returns:
        Vector with one probability per row of ``model.syn1neg``.
    """
    #score per vocabulary entry: representative vector of the clientid times the decode matrix (M2)
    scores = np.dot(model.wv.get_vector(cid), model.syn1neg.T)

    #softmax turns the scores into probabilities
    return softmax(scores)

In [None]:
#prediction method
from scipy.special import softmax

def predict(model, X, threshold, verbose1, verbose2):
    """Predict fraud (1) / not fraud (0) for every transaction in ``X``.

    For each row, the clientid is read from position 2, its probability
    vector over the vocabulary is obtained from ``refresh_probv`` and the
    probabilities of the other words of the row are added up. The row is
    labelled 1 when that sum is at least ``threshold`` and 0 otherwise.

    Words the model does not know yet (the clientid included) are added to
    the vocabulary with ``model.build_vocab(..., update=True)``, so ``model``
    is modified in place. A row with a previously unknown clientid is still
    scored, using whatever vector the vocabulary update assigned to it, so
    the result always has one entry per row of ``X``.

    Args:
        model: trained gensim Word2Vec model exposing ``wv`` and ``syn1neg``.
        X: iterable of transactions, each an indexable sequence of words.
        threshold: minimum summed probability for a row to be labelled 1.
        verbose1: print every transaction before scoring it.
        verbose2: print the selected probabilities and their sum.

    Returns:
        list of 0/1 predictions, one per row of ``X``, in order.
    """
    #save the predictions made by the model in a list
    predictions = []

    #for each transaction (eval_row)
    for eval_row in X:

        #print transaction parameters if verbose1 True
        if verbose1:
            print(eval_row)

        #cid is always in position 2 of the array
        curr_cid = eval_row[2]

        #z maps every known word to its position in the vocabulary (dict lookup instead of a list scan)
        z = model.wv.key_to_index

        #add all the words of this row that the model does not know yet in a single vocabulary update
        unseen = [word for word in dict.fromkeys(eval_row) if word not in z]
        if unseen:
            model.build_vocab([unseen], update=True)
            #read the mapping again in case the update replaced it
            z = model.wv.key_to_index

        #positions of the parameters of the transaction, leaving out the clientid itself
        curr = [z[word] for word in eval_row if word != curr_cid]

        #sum each value on the array to obtain the final probability
        probv = refresh_probv(model, curr_cid)
        selected = probv[curr]
        fprob = sum(selected)

        #print results if verbose2 True
        if verbose2:
            print(f"Array content: {selected} \t Sum: {fprob} \n")

        #convert the values to binary and append to predictions
        predictions.append(0 if fprob < threshold else 1)

    return predictions

In [None]:
#grid-search
#save results
import sys
import itertools

#everything printed while the search runs is written to results.log; the with-block closes the
#file and restores stdout even when a combination raises
from contextlib import redirect_stdout

#candidate values for the hyperparameters that are going to vary
cycles = [5, 10, 50, 100]
dim = [5, 50, 150, 300]  #[2, 5, 10, 50, 150, 300]
window = [2, 5, 9]  #[1 - 9]
#NOTE(review): training() fixes hs=0, so negative=0 presumably leaves the model without syn1neg and
#predict would fail for those combinations - confirm before running the full grid
negative = [0, 5, 10, 20]  #[0, 5, 10, 15, 20]
#1.0 samples exactly in proportion to the frequencies, 0.0 samples all words equally, while a negative value
#samples low-frequency words more than high-frequency words. The 0.75 was chosen by the original Word2Vec paper.
#In https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that other values may
#perform better for recommendation applications
exponent = [-1, -0.75, 0, 0.75, 1]
alpha = [0.015, 0.025, 0.035]
min_alpha = [0.0001, 0.0006]
sample = [0, 0.001, 0.00001]

#can try sg = 0 /// hs = 1 (leads to negative = 0)

with open("results.log", "w") as log_file, redirect_stdout(log_file):
    #train one model per combination of hyperparameters
    for (c, d, w, n, e, a, m, s) in itertools.product(cycles, dim, window, negative, exponent, alpha, min_alpha,
                                                      sample):
        #header so the lines that follow in the log can be attributed to a combination
        print(f"cycles={c} dim={d} window={w} negative={n} exponent={e} alpha={a} min_alpha={m} sample={s}")

        curr_model = training(sentences_series, cycles=c, dim=d, window=w, negative=n,
                              exponent=e, alpha=a, min_alpha=m, sample=s)

        #result for each combination: how many training rows are predicted as fraud
        curr_predictions = predict(curr_model, sentences_series_np_train, 0.5, verbose1=False, verbose2=False)
        print(f"predicted frauds: {sum(curr_predictions)} of {len(curr_predictions)}")

In [None]:
#words in the vocabulary
#model.wv.index_to_key

In [None]:
#model's memory consuming members with their size in bytes
#model.estimate_memory()

In [None]:
#saving the model
#model.save(r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Datasets/3transfers_word2vec_matrix_originalcols.w2v')

In [None]:
#end of training: report the elapsed wall-clock time since `begin`
end = time.time()
print(f"Training time: {end - begin}s")

In [None]:
#option 1
#python 3transfers_word2vec_matrix_originalcols_saving.py &> results.txt


#option 2
# def fprint(output):
#     print output
#     with open("somefile.txt", "a") as f:
#         f.write("{}\n".format(output))


#option 3
# from contextlib import redirect_stdout
#
# with open('results.log', 'w') as f:
#     with redirect_stdout(f):
#         print('generating')
# the rest of your code or main function goes here

In [None]:
#------------------------------------------probability vector for a client----------------------------------------------------#

#weight matrices of the trained model
m1 = model.wv.vectors  #word vectors exposed by model.wv
m2 = model.syn1neg  #output weights used with negative sampling
#m2 = model.syn1       #alternative to use when the model is trained with hierarchical softmax

In [None]:
#shape of the output weight matrix m2
m2.shape

In [None]:
#apply the prediction method to the train set, printing every transaction and its probabilities
predict(model, sentences_series_np_train, 0.5, verbose1=True, verbose2=True)

In [None]:
#true values of the train set
y_train

In [None]:
#convert true values in train set to binary: 1 for "fraud", 0 for anything else
y_train = [int(elem == "fraud") for elem in y_train]

In [None]:
#metrics for the train set
import matplotlib.pyplot as plt
from imblearn.metrics import geometric_mean_score
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef, confusion_matrix, precision_recall_curve

#predict once and reuse the result, so that every metric below describes the same predictions
train_pred = predict(model, X_train_np.values, 0.5, verbose1=False, verbose2=False)

#confusion matrix, computed up front because specificity needs tn and fp
#labels=[0, 1] keeps the matrix 2x2 even if a class is absent, so it always unpacks into four counts
train_cm = confusion_matrix(y_train, train_pred, labels=[0, 1])

#true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = train_cm.ravel()

#accuracy
accuracy = accuracy_score(y_train, train_pred)
print('accuracy: {}'.format(accuracy))

#precision, recall, f-score
#NOTE(review): average='micro' pools both classes, so the three values are presumably all equal to the
#accuracy - confirm whether average='binary' (scores of the fraud class) was intended
precision, recall, fscore, support = precision_recall_fscore_support(y_train, train_pred, average='micro')

#precision - ratio tp / (tp + fp) - ability not to label a negative sample as positive
print('precision: {}'.format(precision))

#recall - ratio tp / (tp + fn) - ability to find all the positive samples - best is 1, worst is 0
print('recall: {}'.format(recall))

#fscore - weighted harmonic mean of the precision and recall - best is 1, worst is 0
print('fscore: {}'.format(fscore))

#matthews correlation coefficient - measure of the quality of binary classifications
#can be used even if the classes are of very different sizes - is in essence a correlation coefficient between -1 and +1
#+1 means perfect prediction, 0 an average random prediction, -1 an inverse prediction
mcc = matthews_corrcoef(y_train, train_pred)
print('mcc: {}'.format(mcc))

#g-mean - square root of the product of the sensitivity and specificity - best is 1, worst is 0
print('G-mean:', (geometric_mean_score(y_train, train_pred, average='micro')))

#specificity - ability to predict true negatives of each available category - recall of the negative class
specifity = tn / (tn + fp)
print('specifity: {}'.format(specifity))

#sensitivity - ability to predict true positives of each available category = recall

#confusion matrix
print(train_cm)

#true negatives
print('true negatives: {}'.format(tn))

#false positives
print('false positives: {}'.format(fp))

#false negatives
print('false negatives: {}'.format(fn))

#true positives
print('true positives: {}'.format(tp))

#error rate
error_rate = 1 - accuracy
print('error rate: {}'.format(error_rate))

#precision-recall curve - compute precision-recall pairs for different probability thresholds
#NOTE(review): predict returns hard 0/1 labels, not scores, so the curves below presumably collapse to a
#single operating point - confirm whether the summed probability should be used instead
print(precision_recall_curve(y_train, train_pred))

#roc curve
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_pred)

roc_auc = metrics.auc(fpr, tpr)

#named roc_display so that the notebook's display() function is not overwritten
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='Word2vec')
roc_display.plot()
plt.show()

In [None]:
#apply the prediction method to the test set, printing every transaction and its probabilities
#(sentences_series_np_test holds the test rows without the is_fraud column)
predict(model, sentences_series_np_test, 0.5, verbose1=True, verbose2=True)

In [None]:
#true values of the test set
y_test

In [None]:
#convert true values in test set to binary: 1 for "fraud", 0 for anything else
y_test = [1 if elem == "fraud" else 0 for elem in y_test]

In [None]:
#metrics for the test set
import matplotlib.pyplot as plt
from imblearn.metrics import geometric_mean_score
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef, confusion_matrix, precision_recall_curve

#predict once and reuse the result, so that every metric below describes the same predictions
#(X_test_np holds the test rows without the is_fraud column)
test_pred = predict(model, X_test_np.values, 0.5, verbose1=False, verbose2=False)

#confusion matrix, computed up front because specificity needs tn and fp
#labels=[0, 1] keeps the matrix 2x2 even if a class is absent, so it always unpacks into four counts
test_cm = confusion_matrix(y_test, test_pred, labels=[0, 1])

#true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = test_cm.ravel()

#accuracy
accuracy = accuracy_score(y_test, test_pred)
print('accuracy: {}'.format(accuracy))

#precision, recall, f-score
#NOTE(review): average='micro' pools both classes, so the three values are presumably all equal to the
#accuracy - confirm whether average='binary' (scores of the fraud class) was intended
precision, recall, fscore, support = precision_recall_fscore_support(y_test, test_pred, average='micro')

#precision - ratio tp / (tp + fp) - ability not to label a negative sample as positive
print('precision: {}'.format(precision))

#recall - ratio tp / (tp + fn) - ability to find all the positive samples - best is 1, worst is 0
print('recall: {}'.format(recall))

#fscore - weighted harmonic mean of the precision and recall - best is 1, worst is 0
print('fscore: {}'.format(fscore))

#matthews correlation coefficient - measure of the quality of binary classifications
#can be used even if the classes are of very different sizes - is in essence a correlation coefficient between -1 and +1
#+1 means perfect prediction, 0 an average random prediction, -1 an inverse prediction
mcc = matthews_corrcoef(y_test, test_pred)
print('mcc: {}'.format(mcc))

#g-mean - square root of the product of the sensitivity and specificity - best is 1, worst is 0
print('G-mean:', (geometric_mean_score(y_test, test_pred, average='micro')))

#specificity - ability to predict true negatives of each available category - recall of the negative class
specifity = tn / (tn + fp)
print('specifity: {}'.format(specifity))

#sensitivity - ability to predict true positives of each available category = recall

#confusion matrix
print(test_cm)

#true negatives
print('true negatives: {}'.format(tn))

#false positives
print('false positives: {}'.format(fp))

#false negatives
print('false negatives: {}'.format(fn))

#true positives
print('true positives: {}'.format(tp))

#error rate
error_rate = 1 - accuracy
print('error rate: {}'.format(error_rate))

#precision-recall curve - compute precision-recall pairs for different probability thresholds
#NOTE(review): predict returns hard 0/1 labels, not scores, so the curves below presumably collapse to a
#single operating point - confirm whether the summed probability should be used instead
print(precision_recall_curve(y_test, test_pred))

#roc curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_pred)

roc_auc = metrics.auc(fpr, tpr)

#named roc_display so that the notebook's display() function is not overwritten
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='Word2vec')
roc_display.plot()
plt.show()

In [None]:
#save the threshold and the ratio in a list
def thres_numberf():
    """Compute, for a fixed grid of thresholds, the ratio detected frauds / real frauds on the test set.

    Reads the globals ``model``, ``X_test_np`` (test rows without the target
    column) and ``y_test``, which is summed and therefore expected to already
    hold 0/1 values.

    Returns:
        tuple ``(threshold, ratio)`` of two lists with the same length.

    Raises:
        ZeroDivisionError: if ``y_test`` contains no fraud.
    """
    threshold = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    ratio = []

    #the number of real frauds does not depend on the threshold
    real_frauds = sum(y_test)

    for thresh in threshold:
        #assign to a current variable the prediction result
        curr = predict(model, X_test_np.values, thresh, verbose1=False, verbose2=False)

        #ratio of the number of detected frauds per the number of real frauds
        ratio.append(sum(curr) / real_frauds)

    return threshold, ratio

In [None]:
#plot the threshold vs ratio (number of detected frauds/number of real frauds)
import matplotlib.pyplot as plt

#assign the values to plot (two plain lists of the same length)
thresh, goal = thres_numberf()

plt.plot(thresh, goal, 'b-')
plt.xlabel('threshold')
plt.ylabel('#frauds detected/#real frauds')

#save to csv: one row per threshold, columns = threshold, ratio
thresh_ratio = np.column_stack((thresh, goal))
np.savetxt('threshold_ratio.csv', thresh_ratio, delimiter=',')

#log scale
plt.yscale('log')

plt.show()

#plt.savefig()

In [None]:
#---------------------------------------------------density matrix------------------------------------------------------------#

#product of the two weight matrices: m1 times the transpose of m2
mproduct = np.matmul(m1, m2.T)
mproduct.shape

In [None]:
#apply softmax to obtain a matrix with conditional probabilities
#NOTE(review): softmax is called without an axis, so presumably (scipy default axis=None) the whole matrix is
#normalised to sum to 1 instead of each row - confirm this is intended; refresh_probv normalises one row at a time
conditional_probs = softmax(mproduct)

In [None]:
#save density matrix 

#option 1
#np.savetxt('3transfers_word2vec_matrix_originalcols_density_matrix.csv', conditional_probs, delimiter = ',')

#option 2
#pd.DataFrame(conditional_probs).to_csv("3transfers_word2vec_matrix_originalcols_density_matrix.csv", header = None)

In [None]:
#sanity check: sum of all the entries of the matrix (expected to be 1)
conditional_probs.sum()

In [None]:
#largest entry of the conditional probabilities matrix
np.max(conditional_probs)

In [None]:
#smallest entry of the conditional probabilities matrix
np.min(conditional_probs)

In [None]:
#visualize the 1st 10 lines and the last 10 lines of the density matrix
#display is imported explicitly: the name is rebound elsewhere in this notebook (RocCurveDisplay objects)
from IPython.display import HTML, display

#1st 10 lines followed by the last 10 lines (np.concatenate builds a new array, no explicit copies needed)
conditional_aux = np.concatenate((conditional_probs[:10], conditional_probs[-10:]))

#display setting: stop <pre> output from wrapping
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [None]:
#shape of the reduced matrix (first and last rows of the density matrix)
conditional_aux.shape

In [None]:
#print the compressed conditional probabilities matrix: one line per row, cells separated by tabs
print('\n'.join('\t'.join(map(str, row)) for row in conditional_aux))

In [None]:
#-------------------------------------------------eigenvalues-----------------------------------------------------------------#

#eigenvalues of the square matrix before softmax (w has the eigenvalues and v the eigenvectors)
w, v = np.linalg.eig(mproduct)  #mproduct is an array of arrays

#separate real and imaginary parts of the eigenvalues
x = w.real  #array

#note: this rebinds y, which earlier held the is_fraud target column
y = w.imag  #array

In [None]:
#show the eigenvalues
w

In [None]:
#eigenvalues plot for the several vector sizes - square of the numbers in mproduct (product of the matrices before softmax)  
import matplotlib.pyplot as plt


#plot the eigenvalues of a model
def plot_eigenvalues(model, vec_size):
    """Plot and save the squared real parts of the eigenvalues of m1 x m2^T for ``model``.

    The eigenvalues are computed from the weight matrices of the model that
    is passed in (``model.wv.vectors`` and ``model.syn1neg``), so every model
    gets its own curve. The figure is saved as
    ``vec_size_<vec_size>_best_comb.png``.

    Args:
        model: trained Word2Vec model (negative sampling).
        vec_size: vector size of the model, used only in the file name (int or str).

    Returns:
        Array with the squared real part of every eigenvalue.
    """
    #square matrix before softmax, as in the density matrix section
    product = np.matmul(model.wv.vectors, model.syn1neg.T)
    eigenvalues, _ = np.linalg.eig(product)

    x_r = np.square(eigenvalues.real)

    #NOTE(review): only the squared real parts are plotted (against their index), while the axis labels say
    #Real/Imaginary - confirm whether a real-vs-imaginary plot was intended
    plt.plot(x_r, 'b-')
    plt.ylabel('Imaginary')
    plt.xlabel('Real')

    #plt.ylim(0, 250e6)
    #plt.xlim(0, 15)

    #str() so that an integer vector size can be used in the file name
    fig_name = "vec_size_" + str(vec_size) + "_best_comb.png"
    plt.savefig(fig_name)

    return x_r

In [None]:
#train the model with the optimized parameters for different vector sizes

#dictionary of results: vector size -> squared real parts of the eigenvalues
result_dic = {}

#vector sizes to try
vec_sizes = [3, 4]

#train one model per vector size
for curr_vec in vec_sizes:
    #a fresh model is built and trained for each size; the main `model` is left untouched
    curr_model = training(
        sentences_series,
        cycles=5,
        dim=curr_vec,  #dimension of the embedding space
        window=9,  #words before and after the center word
        sample=0,  #without subsampling
        negative=5,  #noise-words
        exponent=0,  #exponent to shape negative sampling
        alpha=0.025,  #initial learning rate
        min_alpha=0.0001,  #final learning rate
    )

    #save results in the dictionary, one entry per vector size
    result_dic[curr_vec] = plot_eigenvalues(curr_model, curr_vec)

In [None]:
#option 1 - export result_dic and plot in excel
#one row per entry of result_dic, one column per eigenvalue; result_dic itself stays a dictionary
result_df = pd.DataFrame(result_dic).T
result_df.to_excel('3transfers_word2vec_matrix_originalcols_eingenvalues.xlsx')

#option 2 - plot result_dic with matplotlib, one curve per entry labelled with its key
for result_key, squared_eigenvalues in result_dic.items():
    plt.plot(squared_eigenvalues, label=str(result_key))
plt.legend(loc='upper left')
plt.savefig('3transfers_word2vec_matrix_originalcols_eingenvalues.png')

In [None]:
#------------------------------------------operations with word vectors-------------------------------------------------------#

#words most similar to 'fraud', keeping at most the first 10 results
model.wv.most_similar('fraud')[:10]

#another alternative - same output
#model.wv.similar_by_word('fraud', topn = 10)

In [None]:
#word from the given list that is most similar to the 1st word given
#NOTE(review): the list contains 'c29814' itself, which presumably always wins - confirm the intended list
model.wv.most_similar_to_given('c29814', ['windows', 'c29814'])

In [None]:
#cosine similarity between the words 'nfraud' and 'windows'
model.wv.similarity('nfraud', 'windows')

In [None]:
#cosine similarity between two sets of words
model.wv.n_similarity(['pc', 'windows'], ['tablet', 'android'])

In [None]:
#cosine similarities between one vector (c) and a set of other vectors (the rows of d)
#the example vectors are hand-written and have 5 components each
c = np.array([0.15340006, 0.2575258, 0.94247705, 0.27604532, -0.5088184])
d = np.array([[0.15340006, 0.2575258, 0.94247705, 0.27604532, -0.5088185],
              [0.15340006, 0.2575258, 0.94247705, 0.27604532, -0.5088186],
              [0.15340006, 0.2575258, 0.94247705, 0.27604532, -1.5088186]])

model.wv.cosine_similarities(c, d)

In [None]:
#cosine distances from the given word or vector to all the words in other_words
#if other_words is empty, the distances to all the words in the vocabulary are returned
model.wv.distances('pc', other_words=('tablet', 'mobile'))

In [None]:
#cosine distance between the words 'nfraud' and 'windows'
model.wv.distance('nfraud', 'windows')

In [None]:
#positive keys contribute positively towards the similarity and negative keys negatively
#cosine similarity
model.wv.most_similar(positive=['pc', 'windows'], negative=['android'])

In [None]:
#topn most similar words using the multiplicative combination objective
#additional positive or negative examples contribute to the numerator or denominator respectively
#with a single positive example the result is the same as most_similar()
model.wv.most_similar_cosmul(positive=['pc', 'windows'], negative=['android'])

In [None]:
#relative cosine similarity between two words given the topn similar words
#1st word - word whose topn most similar words are looked up
#2nd word - word whose relative cosine similarity with the 1st word is evaluated
model.wv.relative_cosine_similarity('windows', 'c29814')

In [None]:
#word from the given list that doesn't go with the others
#NOTE(review): several of these words look like they come from a different feature set - confirm they are in the vocabulary
model.wv.doesnt_match(['nbe', 'trfint', 'e3', 'r3', 'ntrusted', 'io4', 'id875448', 'ueabbl', 'pinka', 'ip17218224251',
                       'nmobile', 'ntablet', 'pc', 'ntouch', 'nbot', 'chrome', 'windows', 'ncd1', 'nid2796', 'niod1',
                       'nidd0',
                       'ctd690877', 'idtd-1', 'cdarbe', 'eyagbr', 'ncid1', 'nciod1', 'ncidd1', 'd20190814', 'h0',
                       'cfi1'])

In [None]:
#topn most similar keys
#when topn is None the similarities to all the words are returned as a one-dimensional numpy array with the size of the vocabulary
model.wv.similar_by_key('windows', topn=None)

In [None]:
#topn most similar keys by vector
#the example vector is hand-written and has 5 components
a = np.array([0.15340006, 0.2575258, 0.94247705, 0.27604532, -0.5088185])
model.wv.similar_by_vector(a, topn=None)

In [None]:
#rank of the distance of word2 from word1 in relation to the distances of all words from word1
model.wv.rank('c29814', 'windows')

In [None]:
#rank the given words by similarity to the centroid of all the given words
model.wv.rank_by_centrality(['windows', 'c29814'])