# Merging Nodes


In [1]:
import pandas as pd
from collections import defaultdict

### Read in Data

In [2]:
sirenNodes = pd.read_csv("siren_network_nodes 121520.csv")
# only need the labels here
sirenNodes = sirenNodes["label"].tolist()
sirenEdges = pd.read_csv("siren_network_edges 121520.csv")
pairs = pd.read_csv("sims_threshold_0999.csv")
pairs = pairs[['index_A', 'index_B', 'simVal']]

In [3]:
sirenNodes[:10]

['whole wheat crispbread',
 'USDA SR sweets (1900)',
 'gruenland cheese',
 'citron melon food product',
 'blueflag plant',
 'CCFAC beverages; excluding dairy products',
 'independent continuant',
 'habanero pepper plant',
 'bullhead',
 'lisita (raw)']

In [4]:
# lets make sure our pairs make sense.
print(sirenNodes[pairs['index_A'][772]])
print(sirenNodes[pairs['index_B'][772]])
# check

rabbit meat (frozen)
rabbit meat food product


In [5]:
pairs[:5]

Unnamed: 0,index_A,index_B,simVal
0,3,18072,1.0
1,8,17720,1.0
2,10,17899,1.0
3,11,2543,1.0
4,14,6717,1.0


# Merging 

In [6]:
# I want to just make pairs a list of tuples
pairs = zip(pairs['index_A'].tolist(), pairs['index_B'].tolist())
pairs = list(pairs)
# now it looks like:
pairs[:5]

[(3, 18072), (8, 17720), (10, 17899), (11, 2543), (14, 6717)]

## Merging Nodes
So  were just making a new set of nodes and a new set of edges and tracking their sources from the old tables.



In [7]:
# the df were gunna write too will have colNames = 'Merge DB Source Node ID', 'Merged Nodes Old ID list', 'Merged_lext, 'Merged Nodes Old Label List'
MergedNodes = []
# so we need to track which nodes we have visited so that we can go back and add the nonVisited ones at the end.
visited = defaultdict(lambda: None)

# first we'll loop through the merged suckers
    # im going to use the shorter label, b/c a lot of them will prob just be one of the labels having info in the 
    # parenthesis, so i will choose to use the one with less info 
def getShorter(i, j, labels):
    if len(labels[i]) < len(labels[j]):
        return i
    else : 
        return j

NodeId = 0
for pair in pairs:
    # get shortest one. 
    # so what happens if we have multiple nodes all trying to merge (WHICH WE WILL)
    # if we find a pair where one has already been merged then we just need to take the one which hasnt. 
    # and merge that.
    # This is quick and dirty (were not checking if both have been merged...)
    if visited[pair[0]] != None:
        # then the first node has already been merged, we just need to add pair[1] to that list of old nodes. 
        MergedNodes[visited[pair[0]]][1].append(pair[1])
        
        # need to also add the label
        MergedNodes[visited[pair[0]]][3].append(sirenNodes[pair[1]])
        
        # mark the other node as visited
        visited[pair[1]] = visited[pair[0]]
    elif visited[pair[1]] != None:
        # add node to list of old nodes
        MergedNodes[visited[pair[1]]][1].append(pair[0])
        
        # need to also add the label
        MergedNodes[visited[pair[1]]][3].append(sirenNodes[pair[0]])
        
        # mark the other node as visited
        visited[pair[0]] = visited[pair[1]]
    else:
        # neither have been visited
        MergedNodes.append([NodeId, list(pair), sirenNodes[getShorter(pair[0], pair[1], sirenNodes)],[sirenNodes[pair[0]],sirenNodes[pair[1]]] ])
        # mark nodes as visited:
        visited[pair[0]] = NodeId
        visited[pair[1]] = NodeId
        # increase our NodeId
        NodeId +=1
                        
# now we should just need to iterate through our labels, and if they havent been visited just add them to our list
# using the same indexing or NodeId.
for i in range(len(sirenNodes)):
    # wthe index is its id number here.
    if visited[i] == None:
        # then we need to add it.
        MergedNodes.append([NodeId, [i], sirenNodes[i], [sirenNodes[i]]])
        # we want to update the visited list here too b/c we will us it as a mapping from old -> new for our edges
        visited[i] = NodeId
        
        NodeId +=1                    


# okay I need to make sure my lists of old nodes are all unique.
def ensureUniqueOldNodes(record):
    return [record[0], list(set(record[1])), record[2], list(set(record[3]))]

MergedNodes = map(ensureUniqueOldNodes, MergedNodes)
                            
mergedNodesDF = pd.DataFrame(MergedNodes, columns=['Merge DB Node ID', 'Merged Nodes Old ID list', 'Merged Label', 'Merged Nodes Old Label List'])    


In [8]:
mergedNodesDF.head(25)

Unnamed: 0,Merge DB Node ID,Merged Nodes Old ID list,Merged Label,Merged Nodes Old Label List
0,0,"[18072, 3]",citron melon plant,"[citron melon food product, citron melon plant]"
1,1,"[8, 17720]",bullhead,"[bullhead, bullhead (raw)]"
2,2,"[10, 17899]",light cream,"[light cream, obsolete: light cream]"
3,3,"[11, 2543]",sangria,"[33770 - sangria (efsa foodex2), sangria]"
4,4,"[6717, 14, 12607]",vegetable shortening,"[vegetable shortening, vegetable shortening (a..."
5,5,"[15968, 14470, 1392, 16784, 21, 11320]",guava (dried),"[guava food product, guava (paste), guava plan..."
6,6,"[24, 8997, 3509]",soursop,"[soursop (whole; raw), soursop plant, soursop]"
7,7,"[2256, 25]",papaya concentrate,[papaya concentrate (nonnutritively sweetened)...
8,8,"[6602, 26, 8277]",cuttlefish,"[cuttlefish, cuttlefish (sliced; seasoned; dri..."
9,9,"[14376, 31]",09660 - cassava roots (efsa foodex2),[09650 - cassava roots and similar- (efsa food...


In [9]:
# Lets see how many we removed:
print("removed : ", len(sirenNodes) - len(mergedNodesDF))
percent = ((len(sirenNodes) - len(mergedNodesDF))/len(sirenNodes)) * 100
print("or : ", percent, ' %')

removed :  5972
or :  27.44611425157406  %


In [10]:
# we can look at some of the merged nodes here.
mergedNodesDF['Merged Nodes Old Label List'][9]

['09650 - cassava roots and similar- (efsa foodex2)',
 '09660 - cassava roots (efsa foodex2)']

In [11]:
# lets find the merged Node with the most old nodes:
mergedNodesDF['mergedCount'] = list(map(lambda x: len(x), mergedNodesDF['Merged Nodes Old Label List']))
mergedNodesDF.sort_values('mergedCount', ascending = False)

Unnamed: 0,Merge DB Node ID,Merged Nodes Old ID list,Merged Label,Merged Nodes Old Label List,mergedCount
559,559,"[8070, 5255, 18567, 10377, 21387, 3854, 4735, ...",fish (preserved),"[fish (raw; breaded), fish product (fresh), fi...",60
88,88,"[1926, 7816, 14861, 4367, 9617, 14866, 13461, ...",beverage food product,"[beverage (carbonated; frozen), beverage (frui...",57
317,317,"[4736, 9473, 9090, 7683, 14208, 3591, 7563, 18...",meat (raw),"[meat product (preserved), meat (mold-fermente...",55
113,113,"[6017, 10372, 2693, 7429, 16263, 7432, 21381, ...",cheese (whipped),"[cheese (sliced), cheese (muenster; for manufa...",49
447,447,"[2178, 11653, 17934, 12303, 15504, 8852, 1429,...",shrimp (breaded),"[shrimp (jumbo; french-fried), shrimp (frozen)...",47
...,...,...,...,...,...
6810,6810,[7197],ceramic or earthenware jar; uncoloured,[ceramic or earthenware jar; uncoloured],1
6811,6811,[7201],25360 - chilean mussel (efsa foodex2),[25360 - chilean mussel (efsa foodex2)],1
6812,6812,[7203],diciandiamide-formaldehyde container,[diciandiamide-formaldehyde container],1
6813,6813,[7205],hard roll,[hard roll],1


In [12]:
mergedNodesDF['Merged Nodes Old Label List'][559]

['fish (raw; breaded)',
 'fish product (fresh)',
 'fish (preserved)',
 'fish food product',
 'fish (defrosted)',
 'fish (kippered)',
 'fish (grilled)',
 'fish (sliced; vegetable added; raw)',
 'fish (canned)',
 'fish (smoked; dried)',
 'fish product (in jelly)',
 'fish (semi-preserved)',
 'fish (grilled; vacuum packed)',
 'fish (fermented)',
 'fish product (fully preserved)',
 'fish (freshwater; raw)',
 'fish (raw; dried)',
 'fish (unprocessed; frozen)',
 'fish product (low sodium)',
 'fish product (preserved)',
 'fish (minced; raw)',
 'fish product (pickled)',
 'fish (gutted; deep-frozen)',
 'fish product (semi-preserved)',
 'fish (salted; dried)',
 'fish (unprocessed; deep-frozen)',
 'fish product (asian; containing spice)',
 'fish (deboned)',
 'fish product (marinated)',
 'fish (raw)',
 '22120 - fish (meat) (efsa foodex2)',
 'fish (smoke-flavored)',
 'fish (prepared)',
 'fish (quick-frozen)',
 'fish (hot process; smoked)',
 'fish product (unspecified species)',
 'fish (pickled)',
 '

In [13]:
# wow thats a lot of fish.
# we dont really want that topic though. 
mergedNodesDF = mergedNodesDF.drop(columns=['mergedCount'])

# Merge Edges

In [14]:
sirenEdges.head(5) # NOTE:  these id's will start at 0, in the xlsx file they start at 1

Unnamed: 0,source,target,source label,label,predicate_type,predicate uri
0,0,18178,whole wheat crispbread,whole- shape achieved by forming- thickness 0....,has quality,
1,0,21456,whole wheat crispbread,fully heat-treated,has quality,
2,0,21457,whole wheat crispbread,seed- skin present- germ present,derives from,
3,2,21458,gruenland cheese,solid,has quality,
4,2,21459,gruenland cheese,partially heat-treated,has quality,


In [15]:
'''
okay so im going to go through the edges.. for each source and target node I need to update the nodeIds.

    I also need to check if that edge already exists, if it does then I dont add it, 
    I will just add that edge id to the list of old edge ids.
'''

'\nokay so im going to go through the edges.. for each source and target node I need to update the nodeIds.\n\n    I also need to check if that edge already exists, if it does then I dont add it, \n    I will just add that edge id to the list of old edge ids.\n'

In [16]:
mergedEdges = [] # ['MergedDB EdgeId', Merge DB Source Node ID', 'Merge DB Target Node ID', 'Old Edge Id List', 'Source Label', 'Target Label']

edgesVisited = defaultdict(lambda: None)
edgeId = 0

for i in range(len(sirenEdges)):
    
    sourceIdMapped = visited[sirenEdges['source'][i]]
    targetIdMapped = visited[sirenEdges['target'][i]]
    if sourceIdMapped == None or targetIdMapped == None:
        raise KeyError('Error! Every Node should be mapped here.')
    # check if edge has aleady been visited:
    if edgesVisited[(sourceIdMapped, targetIdMapped)] != None:
        # then all we need to do is add this edgeId to the list of old edges. 
        mergedEdges[edgesVisited[(sourceIdMapped, targetIdMapped)]][3].append(i)
    else:
        # then we need to create a new edge:
        mergedEdges.append([edgeId, sourceIdMapped, targetIdMapped, [i], mergedNodesDF['Merged Label'][sourceIdMapped], mergedNodesDF['Merged Label'][targetIdMapped]])
        # mark this edge as visited
        edgesVisited[(sourceIdMapped, targetIdMapped)] = edgeId
        # increment edgeId
        edgeId += 1
        
# because we are incrementing through each edge this should be it.

In [17]:
mergedEdgesDF = pd.DataFrame(mergedEdges, columns=['MergedDB EdgeId', 'Merge DB Source Node ID', 'Merge DB Target Node ID', 'Old Edge Id List', 'Source Label', 'Target Label'])    
mergedEdgesDF.head(10)


Unnamed: 0,MergedDB EdgeId,Merge DB Source Node ID,Merge DB Target Node ID,Old Edge Id List,Source Label,Target Label
0,0,2319,809,[0],whole wheat crispbread,whole; shape achieved by forming; thickness <0...
1,1,2319,15560,[1],whole wheat crispbread,fully heat-treated
2,2,2319,15561,[2],whole wheat crispbread,seed; skin present; germ present
3,3,2321,2278,[3],gruenland cheese,solid
4,4,2321,15562,[4],gruenland cheese,partially heat-treated
5,5,2321,15563,[5],gruenland cheese,curd
6,6,2321,5330,[6],gruenland cheese,lactic acid-other agent fermentation process
7,7,2321,15564,[7],gruenland cheese,pasteurization by heating
8,8,2326,2278,[8],lisita (raw),solid
9,9,2326,15565,[9],lisita (raw),not heat-treated


In [18]:
# so we essentially shortened it by:
print("removed : ", len(sirenEdges) - len(mergedEdgesDF))
percent = ((len(sirenEdges) - len(mergedEdgesDF))/len(sirenEdges)) * 100
print("or : ", percent, ' %')

removed :  9416
or :  23.359547495596516  %


### Save our merged DB

In [19]:
# mergedNodesDF.to_csv("MergedDB_Nodes.csv")
# mergedEdgesDF.to_csv("MergedDB_Edges.csv")