In [3]:
""" uri.py
URIs are Unicode strings that represent the canonical name for any object in
ConceptNet. These can be used with the ConceptNet Web API, or referred to in a
Semantic Web application, by attaching the prefix:

    http://api.conceptnet.io

For example, the English concept "book" has the URI '/c/en/book'. This concept
can be referred to, or retrieved, using this complete URI:

    http://api.conceptnet.io/c/en/book
"""


def standardize_text(text, lowercase=True):
    raise NotImplementedError(
        "This function has been superseded by "
        "conceptnet5.nodes.preprocess_and_tokenize_text."
    )


def join_uri(*pieces):
    """
    `join_uri` builds a URI from constituent pieces that should be joined with
    slashes (/).

    Leading and trailing on the pieces are acceptable, but will be ignored. The
    resulting URI will always begin with a slash and have its pieces separated
    by a single slash.

    The pieces do not have `preprocess_and_tokenize_text` applied to them; to
    make sure your URIs are in normal form, run `preprocess_and_tokenize_text`
    on each piece that represents arbitrary text.

    >>> join_uri('/c', 'en', 'cat')
    '/c/en/cat'

    >>> join_uri('c', 'en', ' spaces ')
    '/c/en/ spaces '

    >>> join_uri('/r/', 'AtLocation/')
    '/r/AtLocation'

    >>> join_uri('/test')
    '/test'

    >>> join_uri('test')
    '/test'

    >>> join_uri('/test', '/more/')
    '/test/more'
    """
    joined = '/' + ('/'.join([piece.strip('/') for piece in pieces]))
    return joined


def concept_uri(lang, text, *more):
    """
    `concept_uri` builds a representation of a concept, which is a word or
    phrase of a particular language, which can participate in relations with
    other concepts, and may be linked to concepts in other languages.

    Every concept has an ISO language code and a text. It may also have a part
    of speech (pos), which is typically a single letter. If it does, it may
    have a disambiguation, a string that distinguishes it from other concepts
    with the same text.

    This function should be called as follows, where arguments after `text`
    are optional:

        concept_uri(lang, text, pos, disambiguation...)

    `text` and `disambiguation` should be strings that have already been run
    through `preprocess_and_tokenize_text`.

    This is a low-level interface. See `standardized_concept_uri` in nodes.py for
    a more generally applicable function that also deals with special
    per-language handling.

    >>> concept_uri('en', 'cat')
    '/c/en/cat'
    >>> concept_uri('en', 'cat', 'n')
    '/c/en/cat/n'
    >>> concept_uri('en', 'cat', 'n', 'feline')
    '/c/en/cat/n/feline'
    >>> concept_uri('en', 'this is wrong')
    Traceback (most recent call last):
        ...
    AssertionError: 'this is wrong' is not in normalized form
    """
    assert ' ' not in text, "%r is not in normalized form" % text
    if len(more) > 0:
        if len(more[0]) != 1:
            # We misparsed a part of speech; everything after the text is
            # probably junk
            more = []
        for dis1 in more[1:]:
            assert ' ' not in dis1, "%r is not in normalized form" % dis1

    return join_uri('/c', lang, text, *more)


def compound_uri(op, args):
    """
    Some URIs represent a compound structure or operator built out of a number
    of arguments. Some examples are the '/and' and '/or' operators, which
    represent a conjunction or disjunction over two or more URIs, which may
    themselves be compound URIs; or the assertion structure, '/a', which takes
    a relation and two URIs as its arguments.

    This function takes the main 'operator', with the slash included, and an
    arbitrary number of arguments, and produces the URI that represents the
    entire compound structure.

    These structures contain square brackets as segments, which look like
    `/[/` and `/]/`, so that compound URIs can contain other compound URIs
    without ambiguity.

    >>> compound_uri('/nothing', [])
    '/nothing/[/]'
    >>> compound_uri('/a', ['/r/CapableOf', '/c/en/cat', '/c/en/sleep'])
    '/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]'
    """
    items = [op]
    first_item = True
    items.append('[')
    for arg in args:
        if first_item:
            first_item = False
        else:
            items.append(',')
        items.append(arg)
    items.append(']')
    return join_uri(*items)


def split_uri(uri):
    """
    Get the slash-delimited pieces of a URI.

    >>> split_uri('/c/en/cat/n/animal')
    ['c', 'en', 'cat', 'n', 'animal']
    >>> split_uri('/')
    []
    """
    if not uri.startswith('/'):
        return [uri]
    uri2 = uri.lstrip('/')
    if not uri2:
        return []
    return uri2.split('/')


def uri_prefix(uri, max_pieces=3):
    """
    Strip off components that might make a ConceptNet URI too detailed. Only
    the first `max_pieces` components will be kept.

    By default, `max_pieces` is 3, making this function useful for converting
    disambiguated concepts into their more general ambiguous forms.

    If the URI is actually a fully qualified URL, no components are removed.

    >>> uri_prefix('/c/en/cat/n/animal')
    '/c/en/cat'
    >>> uri_prefix('/c/en/cat/n')
    '/c/en/cat'
    >>> uri_prefix('/c/en/cat')
    '/c/en/cat'
    >>> uri_prefix('/c/en')
    '/c/en'
    >>> uri_prefix('/c/en/cat', 2)
    '/c/en'
    >>> uri_prefix('http://en.wikipedia.org/wiki/Example')
    'http://en.wikipedia.org/wiki/Example'
    """
    if is_absolute_url(uri):
        return uri
    pieces = split_uri(uri)[:max_pieces]
    return join_uri(*pieces)


def uri_prefixes(uri, min_pieces=2):
    """
    Get URIs that are prefixes of a given URI: that is, they begin with the
    same path components. By default, the prefix must have at least 2
    components.

    If the URI has sub-parts that are grouped by square brackets, then
    only complete sub-parts will be allowed in prefixes.

    >>> list(uri_prefixes('/c/en/cat/n/animal'))
    ['/c/en', '/c/en/cat', '/c/en/cat/n', '/c/en/cat/n/animal']
    >>> list(uri_prefixes('/test/[/group/one/]/[/group/two/]'))
    ['/test/[/group/one/]', '/test/[/group/one/]/[/group/two/]']
    >>> list(uri_prefixes('http://en.wikipedia.org/wiki/Example'))
    ['http://en.wikipedia.org/wiki/Example']
    """
    if is_absolute_url(uri):
        return [uri]
    pieces = []
    prefixes = []
    for piece in split_uri(uri):
        pieces.append(piece)
        if len(pieces) >= min_pieces:
            if pieces.count('[') == pieces.count(']'):
                prefixes.append(join_uri(*pieces))
    return prefixes


def parse_compound_uri(uri):
    """
    Given a compound URI, extract its operator and its list of arguments.

    >>> parse_compound_uri('/nothing/[/]')
    ('/nothing', [])
    >>> parse_compound_uri('/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]')
    ('/a', ['/r/CapableOf', '/c/en/cat', '/c/en/sleep'])
    >>> parse_compound_uri('/or/[/and/[/s/one/,/s/two/]/,/and/[/s/three/,/s/four/]/]')
    ('/or', ['/and/[/s/one/,/s/two/]', '/and/[/s/three/,/s/four/]'])
    """
    pieces = split_uri(uri)
    if pieces[-1] != ']':
        raise ValueError("Compound URIs must end with /]")
    if '[' not in pieces:
        raise ValueError(
            "Compound URIs must contain /[/ at the beginning of the argument list"
        )
    list_start = pieces.index('[')
    op = join_uri(*pieces[:list_start])

    chunks = []
    current = []
    depth = 0

    # Split on commas, but not if they're within additional pairs of brackets.
    for piece in pieces[(list_start + 1) : -1]:
        if piece == ',' and depth == 0:
            chunks.append('/' + ('/'.join(current)).strip('/'))
            current = []
        else:
            current.append(piece)
            if piece == '[':
                depth += 1
            elif piece == ']':
                depth -= 1

    assert depth == 0, "Unmatched brackets in %r" % uri
    if current:
        chunks.append('/' + ('/'.join(current)).strip('/'))
    return op, chunks


def parse_possible_compound_uri(op, uri):
    """
    The AND and OR conjunctions can be expressed as compound URIs, but if they
    contain only one thing, they are returned as just that single URI, not a
    compound.

    This function returns the list of things in the compound URI if its operator
    matches `op`, or a list containing the URI itself if not.

    >>> parse_possible_compound_uri(
    ...    'or', '/or/[/and/[/s/one/,/s/two/]/,/and/[/s/three/,/s/four/]/]'
    ... )
    ['/and/[/s/one/,/s/two/]', '/and/[/s/three/,/s/four/]']
    >>> parse_possible_compound_uri('or', '/s/contributor/omcs/dev')
    ['/s/contributor/omcs/dev']
    """
    if uri.startswith('/' + op + '/'):
        return parse_compound_uri(uri)[1]
    else:
        return [uri]


def conjunction_uri(*sources):
    """
    Make a URI representing a conjunction of sources that work together to provide
    an assertion. The sources will be sorted in lexicographic order.

    >>> conjunction_uri('/s/contributor/omcs/dev')
    '/s/contributor/omcs/dev'

    >>> conjunction_uri('/s/rule/some_kind_of_parser', '/s/contributor/omcs/dev')
    '/and/[/s/contributor/omcs/dev/,/s/rule/some_kind_of_parser/]'
    """
    if len(sources) == 0:
        # Logically, a conjunction with 0 inputs represents 'True', a
        # proposition that cannot be denied. This could be useful as a
        # justification for, say, mathematical axioms, but when it comes to
        # ConceptNet, that kind of thing makes us uncomfortable and shouldn't
        # appear in the data.
        raise ValueError("Conjunctions of 0 things are not allowed")
    elif len(sources) == 1:
        return sources[0]
    else:
        return compound_uri('/and', sorted(set(sources)))


def assertion_uri(rel, start, end):
    """
    Make a URI for an assertion, as a compound URI of its relation, start node,
    and end node.

    >>> assertion_uri('/r/CapableOf', '/c/en/cat', '/c/en/sleep')
    '/a/[/r/CapableOf/,/c/en/cat/,/c/en/sleep/]'
    """
    assert rel.startswith('/r'), rel
    return compound_uri('/a', (rel, start, end))


def is_concept(uri):
    """
    >>> is_concept('/c/sv/klänning')
    True
    >>> is_concept('/x/en/ly')
    False
    >>> is_concept('/a/[/r/Synonym/,/c/ro/funcția_beta/,/c/en/beta_function/]')
    False
    """
    return uri.startswith('/c/')


def is_relation(uri):
    """
    >>> is_relation('/r/IsA')
    True
    >>> is_relation('/c/sv/klänning')
    False
    """
    return uri.startswith('/r/')


def is_term(uri):
    """
    >>> is_term('/c/sv/kostym')
    True
    >>> is_term('/x/en/ify')
    True
    >>> is_term('/a/[/r/RelatedTo/,/c/en/cake/,/c/en/flavor/]')
    False
    """
    return uri.startswith('/c/') or uri.startswith('/x/')


def is_absolute_url(uri):
    """
    We have URLs pointing to Creative Commons licenses, starting with 'cc:',
    which for Linked Data purposes are absolute URLs because they'll be resolved
    into full URLs.

    >>> is_absolute_url('http://fr.wiktionary.org/wiki/mįkká’e_uxpáðe')
    True
    >>> is_absolute_url('/c/fr/nouveau')
    False
    """
    return uri.startswith('http') or uri.startswith('cc:')


def get_uri_language(uri):
    """
    Extract the language from a concept URI. If the URI points to an assertion,
    get the language of its first concept.

    >>> get_uri_language('/a/[/r/RelatedTo/,/c/en/orchestra/,/c/en/symphony/]')
    'en'
    >>> get_uri_language('/c/pl/cześć')
    'pl'
    >>> get_uri_language('/x/en/able')
    'en'
    """
    if uri.startswith('/a/'):
        return get_uri_language(parse_possible_compound_uri('a', uri)[1])
    elif is_term(uri):
        return split_uri(uri)[1]
    else:
        return None


def uri_to_label(uri):
    """
    Convert a ConceptNet uri into a label to be used in nodes. This function
    replaces an underscore with a space, so while '/c/en/example' will be
    converted into 'example', '/c/en/canary_islands' will be converted into
    'canary islands'.

    >>> uri_to_label('/c/en/example')
    'example'
    >>> uri_to_label('/c/en/canary_islands')
    'canary islands'
    >>> uri_to_label('/c/en')
    ''
    >>> uri_to_label('/r/RelatedTo')
    'RelatedTo'
    >>> uri_to_label('http://wikidata.dbpedia.org/resource/Q89')
    'Q89'
    """
    if is_absolute_url(uri):
        return uri.split('/')[-1].replace('_', ' ')
    if is_term(uri):
        uri = uri_prefix(uri)
    parts = split_uri(uri)
    if len(parts) < 3 and not is_relation(uri):
        return ''
    return parts[-1].replace('_', ' ')


class Licenses:
    cc_attribution = 'cc:by/4.0'
    cc_sharealike = 'cc:by-sa/4.0'

In [4]:
from scipy import sparse
import numpy as np
import re 
from collections import defaultdict
import csv


class SparseMatrixBuilder:
    """
    SparseMatrixBuilder is a utility class that helps build a matrix of
    unknown shape.
    """

    def __init__(self):
        self.row_index = []
        self.col_index = []
        self.values = []

    def __setitem__(self, key, val):
        row, col = key
        self.add(row, col, val)

    def add(self, row, col, val):
        self.row_index.append(row)
        self.col_index.append(col)
        self.values.append(val)

    def tocsr(self, shape, dtype=float):
        return sparse.coo_matrix(
            (self.values, (self.row_index, self.col_index)), shape=shape, dtype=dtype
        ).tocsr()
DOUBLE_DIGIT_RE = re.compile(r'[0-9][0-9]')
DIGIT_RE = re.compile(r'[0-9]')
def replace_numbers(s):
    """
    Replace digits with # in any term where a sequence of two digits appears.

    This operation is applied to text that passes through word2vec, so we
    should match it.
    """
    if DOUBLE_DIGIT_RE.search(s):
        return DIGIT_RE.sub('#', s)
    else:
        return s
 
def concept_is_bad(uri):
    """
    Skip concepts that are unlikely to be useful.

    A concept containing too many underscores is probably a long, overly
    specific phrase, possibly mis-parsed. A concept with a colon is probably
    detritus from a wiki.
    """
    return (
        ':' in uri
        or uri.count('_') >= 3
        or uri.startswith('/a/')
        or uri.count('/') <= 2
    )
def is_negative_relation(rel):
    """
    Negative relations describe ways that concepts are different or unrelated.
    In cases where we our goal is to determine how related concepts are, such
    as conceptnet5.builders.reduce_assoc, we should disregard negative
    relations.
    """
    return rel.startswith('/r/Not') or rel == '/r/Antonym' or rel == '/r/DistinctFrom'
class ConceptNetAssociationGraph:
    '''
    Class to hold the concept-association edge graph.
    '''

    def __init__(self):
        '''Construct a graph with no vertices or edges.'''
        self.vertex_to_neighbors = defaultdict(set)

    def add_edge(self, left, right, value, dataset, relation):
        '''Insert an edge in the graph.'''
        self.vertex_to_neighbors[left].add(right)
        self.vertex_to_neighbors[right].add(left)
        return

    def vertices(self):
        '''Returns an iterator over the vertices of the graph.'''
        return self.vertex_to_neighbors.keys()

    def find_components(self):
        '''
        Returns a dict mapping the vertices of the graph to labels,
        such that two vertices map to the same label if and only if
        they belong to the same connected component of the undirected
        graph obtained by adding the reversal of every edge to the
        graph.  (But note that this function does not modify the graph,
        i.e. it does not add any edges.)
        '''

        component_labels = {vertex: -1 for vertex in self.vertices()}
        vertices_to_examine = set(self.vertices())
        new_label = -1
        while len(vertices_to_examine) > 0:
            new_label += 1
            vertex = vertices_to_examine.pop()
            assert component_labels[vertex] == -1
            stack = [vertex]
            component_labels[vertex] = new_label
            while len(stack) > 0:
                v = stack.pop()
                for neighbor in self.vertex_to_neighbors[v]:
                    if component_labels[neighbor] != new_label:
                        assert component_labels[neighbor] == -1
                        component_labels[neighbor] = new_label
                        vertices_to_examine.discard(neighbor)
                        stack.append(neighbor)

        return component_labels

    @classmethod
    def from_csv(cls, filename, filtered_concepts=None, reject_negative_relations=True):
        """
        Reads an association file and builds an (undirected) graph from it.

        If filtered_concepts isn't None, it should be a collection of concepts,
        and only vertices from this collection and edges that link two such
        vertices will be added to the graph.  If it _is_ None (the default),
        however, please note that no such filtering will be done (i.e. the
        effective filter collection is then the universal set of concepts, not
        the empty set).

        If reject_negative_relations is True (the default), only edges not
        corresponding to negative relations will be added to the graph.
        """
        graph = cls()

        if filtered_concepts is None:
            filter_concepts = False
        else:
            filter_concepts = True

        with open(filename, encoding='utf-8') as file:
            for line in file:
                for row in csv.reader([line]):
                    try:
                        left = row[0].strip()
                        right = row[1].strip()
                        value = float(row[2].strip())
                        dataset = row[3].strip()
                        rel = row[4].strip()
                    except (ValueError, IndexError):
                        continue

                if concept_is_bad(left) or concept_is_bad(right):
                    continue
                if reject_negative_relations and is_negative_relation(rel):                    
                    continue
                
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if fvalue == 0:
                    continue
                if gleft == gright:
                    continue
                if filter_concepts and gleft not in filtered_concepts:
                    continue
                if filter_concepts and gright not in filtered_concepts:
                    continue
                graph.add_edge(gleft, gright, value, dataset, rel)

        return graph
       
class ConceptNetAssociationGraphForPropagation(ConceptNetAssociationGraph):
    """
    Subclass of ConceptNetAssociationGraph specialized for use in making
    the full graph of a set of associations as required for propagation.
    """

    def __init__(self):
        super().__init__()
        self.edges = set()

    def add_edge(self, left, right, value, dataset, relation):
        """
        In addition to the superclass's handling of a new edge,
        saves the edges as a set of (left, right) pairs.
        """
        # Use URIs that have the additional standardization for vector-space labels,
        # replacing sequences of digits with the # sign.
        left = replace_numbers(left)
        right = replace_numbers(right)
        super().add_edge(left, right, value, dataset, relation)
        self.edges.add((left, right))
        self.edges.add((right, left))  # save undirected edges

def make_adjacency_matrix(assoc_filename, embedding_vocab):
    """
    Build a sparse adjacency matrix for the ConceptNet graph presented
    in the given assoc file, including all terms from the given embedding
    vocabulary and removing all terms from connected components of the graph
    that do not overlap that vocabulary.

    Also builds an index giving all terms from the resulting joined
    graph+embedding vocabulary in the order corresponding to the rows and
    columns of the matrix.  Note that it is guaranteed that the terms from
    the embedding vocabulary will preceed the remaining terms in that index,
    and that among the remaining terms the terms in English will follow all
    the others.

    Returns the matrix and index, and the number of new English terms.
    """
    # First eliminate all connected components of the graph that don't
    # overlap the vocabulary of the embedding; we can't do anything with
    # those terms.

    graph = ConceptNetAssociationGraphForPropagation.from_csv(
        assoc_filename, reject_negative_relations=False
    )
    component_labels = graph.find_components()

    # Get the labels of components that overlap the embedding vocabulary.
    good_component_labels = set(
        label for term, label in component_labels.items() if term in embedding_vocab
    )

    # Now get the concepts in those components.
    good_concepts = set(
        term
        for term, label in component_labels.items()
        if label in good_component_labels
    )

    del component_labels, good_component_labels

    # Put terms from the embedding first, then terms from the good part
    # of the graph neither from the embedding nor in English, then terms
    # from the good part of the graph in English but not from the embedding.
    #
    # (In the corner case where either of these addtional sets of terms is
    # empty, construction of a pandas index will fail using generator rather
    # than list comprehensions.)
    new_vocab = good_concepts - set(embedding_vocab)
    good_concepts = embedding_vocab.append(
        pd.Index([term for term in new_vocab if get_uri_language(term) != 'en'])
    )
    n_good_concepts_not_new_en = len(good_concepts)
    good_concepts = good_concepts.append(
        pd.Index([term for term in new_vocab if get_uri_language(term) == 'en'])
    )
    del new_vocab
    n_new_english = len(good_concepts) - n_good_concepts_not_new_en

    good_concepts_map = {term: i for i, term in enumerate(good_concepts)}

    # Convert the good part of the graph to an adjacency matrix representation.

    # Note: the edges added differ slightly from the way it is done in (e.g.)
    # build_from_conceptnet_table (in sparse_matrix_builder.py), in that we
    # do not add edges linking specific senses of terms to their more general
    # forms (as defined by uri_prefixes).  Currently no such specific senses
    # show up in the input to retrofitting (i.e. the output of
    # build_from_conceptnet_table), so it doesn't matter, but in the future
    # we may want to add such edges here as well.

    builder = SparseMatrixBuilder()
    for v, w in graph.edges:
        try:
            index0 = good_concepts_map[v]
            index1 = good_concepts_map[w]
            builder[index0, index1] = 1
        except KeyError:
            pass  # one of v, w wasn't good
    del graph

    adjacency_matrix = builder.tocsr(
        shape=(len(good_concepts), len(good_concepts)), dtype=np.int8
    )

    return adjacency_matrix, good_concepts, n_new_english


In [5]:
import numpy as np
from scipy.sparse import diags

def propagate(
    combined_index, embedding, adjacency_matrix, n_new_english, iterations=20
):
    """
    For as many non-English terms as possible in the ConceptNet graph whose
    edges are presented in the given adjacency matrix (with corresponding term
    labels in the given index), find a vector in the target space of the vector
    embedding presented in the given embedding file.
    """

    # Propagate the vectors from the embeddings to the remaining
    # terms, following the edges of the graph.

    embedding_dimension = embedding.values.shape[1]
    new_vocab_size = len(combined_index) - embedding.values.shape[0]
    vectors = np.vstack(
        [
            embedding.values,
            np.zeros(
                (new_vocab_size, embedding_dimension), dtype=embedding.values.dtype
            ),
        ]
    )

    for iteration in range(iterations):
        zero_indicators = np.abs(vectors).sum(1) == 0
        if not np.any(zero_indicators):
            break
        # Find terms with zero vectors having neighbors with nonzero vectors.
        nonzero_indicators = np.logical_not(zero_indicators)
        fringe = adjacency_matrix.dot(nonzero_indicators.astype(np.int8)) != 0
        fringe = np.logical_and(fringe, zero_indicators)
        # Update each as the average of its nonzero neighbors
        adjacent_nonzeros = adjacency_matrix[fringe, :].dot(
            diags([nonzero_indicators.astype(np.int8)], [0], format='csc')
        )
        n_adjacent_nonzeros = adjacent_nonzeros.sum(axis=1).A[:, 0]
        weights = 1.0 / n_adjacent_nonzeros
        vectors[fringe, :] = adjacency_matrix[fringe, :].dot(vectors)
        vectors[fringe, :] = diags([weights], [0], format='csr').dot(vectors[fringe, :])

    n_old_plus_new_non_en = len(combined_index) - n_new_english
    result = pd.DataFrame(
        index=combined_index[0:n_old_plus_new_non_en],
        data=vectors[0:n_old_plus_new_non_en, :],
    )
    return result

In [None]:
import json
import re
import csv
import pandas as pd

input_file_path = '../../conceptnet-assertions-5.7.0.csv/assertions.csv'
output_file_path = 'filtered_edges.csv'

with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['left', 'right', 'value', 'dataset', 'rel'])
    
    # Read each line of the file
    for line in infile:
        parts = line.split("	") 
        if len(parts) < 2:
            continue  
        
        uri = parts[0] 
        relation = parts[1]  
        start = parts[2] 
        end =  parts[3] 
        jsonStruct =  parts[4]      
        
        if "/c/en/" in start or "/c/en/" in start:
            data = json.loads(jsonStruct)
            writer.writerow([start, end, data['weight'], data['dataset'], relation])

In [8]:
import pandas as pd

df = pd.read_csv("filtered_edges.csv",  on_bad_lines='skip')

df

Unnamed: 0,left,right,value,dataset,rel
0,/c/en/0/n,/c/en/1,1.0,/d/wiktionary/fr,/r/Antonym
1,/c/en/12_hour_clock/n,/c/en/24_hour_clock,1.0,/d/wiktionary/en,/r/Antonym
2,/c/en/24_hour_clock/n,/c/en/12_hour_clock,1.0,/d/wiktionary/en,/r/Antonym
3,/c/en/5/n,/c/en/3,1.0,/d/wiktionary/en,/r/Antonym
4,/c/en/a.c/n,/c/en/d.c,1.0,/d/wiktionary/fr,/r/Antonym
...,...,...,...,...,...
6356315,/c/en/xerox,/c/en/projector,0.5,/d/dbpedia/en,/r/dbpedia/product
6356316,/c/en/zanella,/c/en/moped,0.5,/d/dbpedia/en,/r/dbpedia/product
6356317,/c/en/zanella,/c/en/motorcycle,0.5,/d/dbpedia/en,/r/dbpedia/product
6356318,/c/en/zara/n/wp/retailer,/c/en/clothing,0.5,/d/dbpedia/en,/r/dbpedia/product


In [9]:
df.to_csv('out.csv', index=False)

In [28]:
def load_hdf(filename):
    """
    Load a semantic vector space from an HDF5 file.

    HDF5 is a complex format that can contain many instances of different kinds
    of data. The convention we use is that the file contains one labeled
    matrix, named "mat".
    """
    return pd.read_hdf(filename, 'mat', encoding='utf-8')

embedding = load_hdf("../test_retrofitted") 
embedding_vocab = embedding.index 


assoc_filename = "out.csv"
adjacency_matrix, combined_index, n_new_english = make_adjacency_matrix(assoc_filename, embedding_vocab)
    

In [29]:
adjacency_matrix

<2323143x2323143 sparse matrix of type '<class 'numpy.int8'>'
	with 8681854 stored elements in Compressed Sparse Row format>

In [31]:
embedding.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/chair_meeting,-2.129197e-16,-2.004567e-07,3.139843e-07,-6.492713e-08,1.334435e-16,-3.828865e-16,-6.465064000000001e-17,-2.960055e-16,4.2038440000000005e-17,-1.789314e-17,...,0.029592,-0.028524,0.038507,-0.051363,0.052692,-0.054144,0.052064,-0.045782,-0.061692,-0.096738
/c/en/chairperson,-2.069472e-16,-1.889602e-07,2.959767e-07,-6.120344e-08,1.127873e-16,-3.864099e-16,-5.23309e-17,-3.366037e-16,3.7787550000000004e-17,-2.999062e-17,...,0.027895,-0.026888,0.036298,-0.048417,0.04967,-0.051038,0.049078,-0.043156,-0.058154,-0.09119
/c/en/chair,-2.000913e-16,-1.983455e-07,3.106773e-07,-6.424331e-08,-8.366849000000001e-17,-3.044844e-16,2.7266750000000003e-17,-5.378111e-16,2.993259e-17,-9.740842e-17,...,0.029281,-0.028224,0.038101,-0.050822,0.052137,-0.053573,0.051516,-0.0453,-0.061042,-0.09572
/c/en/chairperson/n,-2.004085e-16,-1.867657e-07,2.925394e-07,-6.049266e-08,-5.062227e-17,-3.278182e-16,9.696096e-18,-4.912406e-16,3.229945e-17,-7.964047000000001e-17,...,0.027571,-0.026576,0.035877,-0.047855,0.049093,-0.050446,0.048508,-0.042655,-0.057479,-0.090131
/c/en/president/n/wn/person,-8.483401e-18,-1.530221e-07,2.396853e-07,-4.956326e-08,-7.436574e-16,-6.092042e-15,-8.714433e-16,-4.394232e-15,6.218402e-16,4.450388e-15,...,0.02259,-0.021774,0.029395,-0.039209,0.040223,-0.041332,0.039744,-0.034948,-0.047094,-0.073847


In [32]:
result = propagate(
    combined_index, embedding, adjacency_matrix, n_new_english, iterations=20
)
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/chair_meeting,-2.129197e-16,-2.004567e-07,3.139843e-07,-6.492713e-08,1.334435e-16,-3.828865e-16,-6.465064000000001e-17,-2.960055e-16,4.2038440000000005e-17,-1.789314e-17,...,0.029592,-0.028524,0.038507,-0.051363,0.052692,-0.054144,0.052064,-0.045782,-0.061692,-0.096738
/c/en/chairperson,-2.069472e-16,-1.889602e-07,2.959767e-07,-6.120344e-08,1.127873e-16,-3.864099e-16,-5.23309e-17,-3.366037e-16,3.7787550000000004e-17,-2.999062e-17,...,0.027895,-0.026888,0.036298,-0.048417,0.04967,-0.051038,0.049078,-0.043156,-0.058154,-0.09119
/c/en/chair,-2.000913e-16,-1.983455e-07,3.106773e-07,-6.424331e-08,-8.366849000000001e-17,-3.044844e-16,2.7266750000000003e-17,-5.378111e-16,2.993259e-17,-9.740842e-17,...,0.029281,-0.028224,0.038101,-0.050822,0.052137,-0.053573,0.051516,-0.0453,-0.061042,-0.09572
/c/en/chairperson/n,-2.004085e-16,-1.867657e-07,2.925394e-07,-6.049266e-08,-5.062227e-17,-3.278182e-16,9.696096e-18,-4.912406e-16,3.229945e-17,-7.964047000000001e-17,...,0.027571,-0.026576,0.035877,-0.047855,0.049093,-0.050446,0.048508,-0.042655,-0.057479,-0.090131
/c/en/president/n/wn/person,-8.483401e-18,-1.530221e-07,2.396853e-07,-4.956326e-08,-7.436574e-16,-6.092042e-15,-8.714433e-16,-4.394232e-15,6.218402e-16,4.450388e-15,...,0.02259,-0.021774,0.029395,-0.039209,0.040223,-0.041332,0.039744,-0.034948,-0.047094,-0.073847


In [33]:
diff1 = pd.concat([result, embedding]).drop_duplicates(keep=False)
diff1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/de/verflechten,1.165642e-16,-1.401300e-06,-1.964500e-07,1.257201e-07,1.929672e-16,-1.420060e-16,-1.816806e-16,-9.991165e-18,7.599815e-18,6.863060e-17,...,0.030874,-0.029182,0.022136,-0.040591,0.045733,-0.055873,0.054786,-0.048652,-0.064684,-0.101985
/c/gd/tost_geal,1.987298e-16,3.377577e-05,-1.186720e-05,1.642879e-05,-7.791531e-15,5.764029e-15,6.711658e-15,7.940348e-15,-1.166029e-15,3.713360e-15,...,-0.051047,-0.042544,0.031218,0.114923,0.018571,-0.042055,0.052759,-0.043115,-0.061147,-0.036493
/c/ru/осторожный,-6.373484e-17,-7.143095e-07,2.947653e-06,7.169659e-07,3.828752e-16,-4.195293e-17,-2.410384e-16,-5.321990e-16,3.760020e-16,-5.890107e-16,...,-0.004821,-0.034773,0.035106,0.027656,0.046567,-0.000695,0.053960,0.195646,-0.041995,-0.098684
/c/tr/acil,8.302191e-17,4.772473e-06,4.615884e-06,-2.601526e-07,-2.876905e-15,1.288548e-15,2.358437e-15,-9.990318e-16,4.368428e-16,-1.655957e-15,...,0.031403,0.003131,0.033343,0.048443,0.039941,-0.045000,0.048132,0.006675,-0.027617,-0.066860
/c/ga/iondúil,1.075489e-17,-7.396627e-07,-7.441649e-07,3.864260e-07,-1.202492e-16,-1.132471e-17,7.708410e-18,-4.189398e-17,-9.059036e-17,-8.593039e-17,...,0.029731,0.053838,0.033407,0.062806,0.034938,-0.049869,0.052862,-0.041310,0.003734,-0.074966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/c/gl/ler,9.624583e-17,-2.845818e-06,-8.588077e-07,2.685360e-07,4.596128e-16,-2.523832e-16,-4.270765e-16,-8.168780e-17,-8.408516e-17,7.169426e-17,...,0.034329,-0.019947,0.031885,0.107254,0.026175,-0.049850,0.053438,0.014593,-0.030902,-0.086152
/c/de/schieben,-9.004515e-17,-1.810347e-07,1.314632e-06,-7.960571e-07,7.706738e-17,1.748083e-17,-9.435811e-17,4.441733e-17,1.606452e-16,-6.052502e-17,...,0.033939,0.017956,0.041299,0.156516,0.023907,-0.011715,0.053101,-0.040555,-0.059441,-0.095432
/c/enm/teme,7.016783e-17,1.623994e-06,-1.047271e-06,1.852308e-06,-8.741855e-16,6.423836e-16,7.569734e-16,7.656732e-16,-9.475655e-17,3.190124e-16,...,0.027047,-0.016603,0.017813,0.080133,0.026272,-0.048739,0.054882,-0.035379,-0.055348,-0.022302
/c/pt/fragmento,4.829102e-17,5.132485e-05,-1.301653e-05,2.417272e-05,-1.563875e-14,9.338159e-15,1.298411e-14,6.888228e-15,-1.575646e-15,1.279317e-15,...,0.036802,-0.022231,0.040601,-0.029773,0.052640,-0.071455,0.068558,-0.028323,-0.127860,0.118185


In [None]:
result.to_csv('test_propagate.csv', index=False)