# Importing Libraries

In [1]:
import networkx as nx
import os
import pandas as pd
from Mat_Package import Grapher

# Make Giant-Graph

In [2]:
lab_files_path = "./New-Dataset/labels/"
# os.listdir returns entries in arbitrary order; sort them so the same files
# are selected, and composed in the same order, on every run.
files = sorted(os.listdir(lab_files_path))
if not files:
    raise FileNotFoundError("No label files found in " + lab_files_path)

# At most 100 label files are merged; cap by what is actually on disk so a
# smaller dataset does not raise IndexError.
num_graph_files = min(100, len(files))

# Seed the giant graph with the graph built from the first label file.
df = pd.read_csv(lab_files_path+files[0])
G = Grapher.makeGraph(df)
print("Iteration 0")
print("= = = = = = = = = =")

# Fold the graph of every remaining file into G.
for i in range(1, num_graph_files):
    print("Iteration ",i)
    print("= = = = = = = = = =")
    df1 = pd.read_csv(lab_files_path+files[i])
    I_Graph = Grapher.makeGraph(df1)
    G = nx.compose(G, I_Graph)
    


Iteration 0
= = = = = = = = = =
Iteration  1
= = = = = = = = = =
Iteration  2
= = = = = = = = = =
Iteration  3
= = = = = = = = = =
Iteration  4
= = = = = = = = = =
Iteration  5
= = = = = = = = = =
Iteration  6
= = = = = = = = = =
Iteration  7
= = = = = = = = = =
Iteration  8
= = = = = = = = = =
Iteration  9
= = = = = = = = = =
Iteration  10
= = = = = = = = = =
Iteration  11
= = = = = = = = = =
Iteration  12
= = = = = = = = = =
Iteration  13
= = = = = = = = = =
Iteration  14
= = = = = = = = = =
Iteration  15
= = = = = = = = = =
Iteration  16
= = = = = = = = = =
Iteration  17
= = = = = = = = = =
Iteration  18
= = = = = = = = = =
Iteration  19
= = = = = = = = = =
Iteration  20
= = = = = = = = = =
Iteration  21
= = = = = = = = = =
Iteration  22
= = = = = = = = = =
Iteration  23
= = = = = = = = = =
Iteration  24
= = = = = = = = = =
Iteration  25
= = = = = = = = = =
Iteration  26
= = = = = = = = = =
Iteration  27
= = = = = = = = = =
Iteration  28
= = = = = = = = = =
Iteration  29
= = = = = =

# Printing Graph Info

In [3]:
# Report the size of the composed graph.
edge_total = G.number_of_edges()
node_total = G.number_of_nodes()
print("No. of Edges in graph are: ", edge_total)
print("No. of Nodes in graph are: ", node_total)

No. of Edges in graph are:  15543
No. of Nodes in graph are:  3318


# Random-Walk

In [4]:
import random
# Seed the module-level generator of `random` so that calls made through
# random.* are repeatable. Separately constructed random.Random() instances
# keep their own state and are not affected by this call.
random.seed(666)

def random_walk(G, start=None, path_length=20, alpha=0, rand=None):
    '''Return a random walk over ``G`` as a list of nodes.

    Parameters:
        G: graph object; must support ``G.nodes()``, ``G[node]`` and
            ``nx.all_neighbors(G, node)``.
        start: node to start from. ``None`` picks a random node of ``G``.
            Falsy node ids such as ``0`` are valid start nodes.
        path_length: maximum number of nodes in the returned walk.
        alpha: probability of jumping back to the first node of the walk
            instead of moving to a neighbor.
        rand: object providing ``choice`` and ``random`` (for example a
            ``random.Random`` instance). ``None`` uses the module-level
            ``random`` generator, so ``random.seed(...)`` makes walks
            repeatable.

    Returns:
        List of nodes. It is shorter than ``path_length`` when the walk
        reaches a node whose adjacency ``G[node]`` is empty.
    '''
    # A default of random.Random() would be created once, unseeded, at
    # definition time and would ignore random.seed(); resolve it lazily.
    if rand is None:
        rand = random

    # Compare against None explicitly so that node ids like 0 are honoured.
    if start is not None:
        path = [start]
    else:
        path = [rand.choice(list(G.nodes()))]

    while len(path) < path_length:
        cur = path[-1]
        # Find its neighbors; a node without adjacency entries ends the walk.
        if len(G[cur]) > 0:
            if rand.random() >= alpha:
                path.append(rand.choice(list(nx.all_neighbors(G, cur))))
            else:
                # Restart: jump back to the first node of the walk.
                path.append(path[0])
        else:
            break
    return path


# Sample one walk from a randomly chosen start node and show it.
sample_walk = random_walk(G, start=None)
print(sample_walk)


['1.41', '10', 'ORDER', 'PBLBW103271', '|', '51144', '51143', 'CODE', 'BARCODE', 'DESCRIPTION', 'BILLION', 'SHOPPING', ':', '0.7MM', '2B', '/(226721)', '4', 'OLD', 'COST', '0']


In [5]:
def build_deepwalk_corpus(G, num_paths, rand=None, path_length=20, alpha=0):
    '''Build a corpus of random walks, ``num_paths`` walks per node.

    Parameters:
        G: graph object; must support ``G.nodes()`` and everything
            ``random_walk`` needs.
        num_paths: number of passes over the node list; each pass starts one
            walk from every node, in a freshly shuffled order.
        rand: object providing ``shuffle``, ``choice`` and ``random`` (for
            example a ``random.Random`` instance). ``None`` uses the
            module-level ``random`` generator so ``random.seed(...)`` applies.
        path_length: maximum length of each walk, forwarded to ``random_walk``.
        alpha: restart probability, forwarded to ``random_walk``.

    Returns:
        List of walks, each a list of nodes; ``num_paths * len(nodes)`` walks
        in total.
    '''
    # A default of random.Random() would be an unseeded instance created at
    # definition time; resolve the generator lazily instead.
    if rand is None:
        rand = random

    walks = []
    nodes = list(G.nodes())
    for _ in range(num_paths):
        rand.shuffle(nodes)
        # Forward the same generator so shuffling and walking share one
        # random stream.
        walks.extend(
            random_walk(G, start=node, path_length=path_length,
                        alpha=alpha, rand=rand)
            for node in nodes
        )
    return walks


# print(build_deepwalk_corpus(G, num_paths=2))


In [6]:
from gensim.models import Word2Vec
# One walk per node per pass; each walk is a "sentence" of node tokens.
corpus = build_deepwalk_corpus(G, num_paths=20)
# Create the model WITHOUT a corpus. Passing the corpus to the constructor
# would build the vocabulary and run a full training pass immediately, and the
# explicit build_vocab() below would then re-initialise the weights, so that
# work would be thrown away. Training happens once, in the explicit
# model.train() call.
model = Word2Vec(window=2, min_count=1, sg=1)
model.build_vocab(corpus)


# Training Model

In [7]:
# Train on every walk in the corpus; the call stays last so that its return
# value is shown as the cell output.
walk_count = len(corpus)
model.train(
    corpus,
    total_examples=walk_count,
    epochs=30,
    report_delay=1,
)


(38689429, 39816000)

# Saving Model

In [9]:

# Make sure the target directory exists; saving fails if './Model' is missing.
os.makedirs('./Model', exist_ok=True)
model.save('./Model/Word2Vec_Model.bin')


# Checking Word-Similarity

In [10]:
# Tokens whose embeddings are closest to that of the query token.
query_token = 'Fax'
model.wv.most_similar(query_token)

[('ax', 0.6476998925209045),
 ('563347', 0.6237249970436096),
 ('Vax', 0.6149076223373413),
 ('lax', 0.6008296012878418),
 ('O3-56334718', 0.5693498849868774),
 ('MEHMOOD', 0.5587036609649658),
 ('SELANGOR', 0.5458548665046692),
 ('TABLE', 0.5451949238777161),
 ('BANDAR', 0.5308387279510498),
 ('03-56334718', 0.5241894721984863)]

# Making Word-Embeddings

In [11]:
_embeddings = []
all_Text = []

# Gather the 'Object' column of each label file. Cap the count by the number
# of files on disk so a smaller dataset does not raise IndexError.
num_text_files = min(100, len(files))
for i in range(num_text_files):
    df = pd.read_csv(lab_files_path+files[i])
    Text = df['Object'].to_list()
    all_Text.extend(Text)

print("Total text is: ", len(all_Text))
print("Making Embeddings..")

Nodes = G.nodes()

# Look up one embedding per graph node. Only a missing vocabulary entry is
# expected here, so catch KeyError alone and let any other error surface.
for N in Nodes:
    try:
        _embeddings.append(model.wv[str(N)])
    except KeyError:
        print("Node is: ", str(N))

print(model.wv["Date:"])
# print(_embeddings)

Total text is:  19322
Making Embeddings..
[ 3.85832548e-01  2.67722726e-01  2.59979218e-01 -1.17228609e-02
  6.39384031e-01  3.97768110e-01  1.87548235e-01  7.96595633e-01
 -8.27624261e-01 -9.44695592e-01  1.38199255e-01 -8.79616141e-01
 -1.95675511e-02  5.10129929e-01  4.85848248e-01 -8.04328859e-01
  7.38525927e-01  2.81736344e-01  4.15675014e-01 -1.40328753e+00
  3.35746318e-01 -7.14599490e-01 -8.45685452e-02 -8.02061558e-01
  3.09692472e-01 -6.92559406e-02  3.73226821e-01 -1.34814650e-01
 -7.26907775e-02  6.91083789e-01  9.67317581e-01 -6.73907936e-01
  6.78816974e-01 -7.53034115e-01 -1.39748946e-01  3.58180106e-01
  6.76310062e-01 -5.72651505e-01  5.35904877e-02 -5.68011105e-01
  3.93850684e-01 -3.44660223e-01 -7.74662614e-01 -5.80773473e-01
  2.09576219e-01 -2.92009234e-01 -8.04687589e-02 -6.40103221e-02
  8.44844997e-01  1.25090107e-01 -2.08381861e-02 -9.26100314e-01
  2.12526154e-02 -5.98977983e-01 -3.43913585e-01 -4.38746899e-01
  4.03856367e-01 -1.09733963e+00 -6.67095065e-01

# Testing Saved Model

In [13]:
# Same path that the model was saved to earlier in this notebook.
Emb_Model = "./Model/Word2Vec_Model.bin"
# Reload the model from disk into a separate object.
Loaded_model = Word2Vec.load(Emb_Model)
# Show the vector stored for the token "Date:" so it can be compared with the
# one printed from the in-memory model.
Loaded_model.wv["Date:"]

array([ 3.85832548e-01,  2.67722726e-01,  2.59979218e-01, -1.17228609e-02,
        6.39384031e-01,  3.97768110e-01,  1.87548235e-01,  7.96595633e-01,
       -8.27624261e-01, -9.44695592e-01,  1.38199255e-01, -8.79616141e-01,
       -1.95675511e-02,  5.10129929e-01,  4.85848248e-01, -8.04328859e-01,
        7.38525927e-01,  2.81736344e-01,  4.15675014e-01, -1.40328753e+00,
        3.35746318e-01, -7.14599490e-01, -8.45685452e-02, -8.02061558e-01,
        3.09692472e-01, -6.92559406e-02,  3.73226821e-01, -1.34814650e-01,
       -7.26907775e-02,  6.91083789e-01,  9.67317581e-01, -6.73907936e-01,
        6.78816974e-01, -7.53034115e-01, -1.39748946e-01,  3.58180106e-01,
        6.76310062e-01, -5.72651505e-01,  5.35904877e-02, -5.68011105e-01,
        3.93850684e-01, -3.44660223e-01, -7.74662614e-01, -5.80773473e-01,
        2.09576219e-01, -2.92009234e-01, -8.04687589e-02, -6.40103221e-02,
        8.44844997e-01,  1.25090107e-01, -2.08381861e-02, -9.26100314e-01,
        2.12526154e-02, -