In [None]:
import pandas as pd
import os
import numpy as np
import time

lang_dict = {
    'english': 'en'
}

In [None]:
'''Source: https://github.com/michaal94/ConceptNet-offline-API'''

class ConceptNet():
    def __init__(self, data_path, language=None, save_language=False):
        # Check extenstion and load data to pandas dataframe
        base, extention = os.path.splitext(data_path)
        if extention == '.csv':
            df = pd.read_csv(data_path, sep='\t', header=None,
                             names=['URI', 'relation', 'start', 'end', 'JSON'])
        elif extention == '.pkl':
            df = pd.read_pickle(data_path)

        # strip non-relevant languages (a lot of free memory)
        if language is not None:
            if not isinstance(language, list):
                language = [language]
            lang_abbr = ['/' + get_language_abbr(l) + '/' for l in language]
            query = '|'.join(lang_abbr)
            index = df[~df.start.str.contains(query)].index
            df.drop(index, inplace=True)
            index = df[~df.end.str.contains(query)].index
            df.drop(index, inplace=True)
            # save if needed for further use
            if save_language:
                df.to_pickle(base + '_' + language[0] + '.pkl')

        self.df = df

    # Functions to query relevant fields
    def get_edges_from_start(self, start_tokens, dataframe=None):
        start_tokens = self.process_tokens(start_tokens)
        if dataframe is None:
            edges = self.df[self.df.start.str.contains('|'.join(start_tokens))]
        else:
            edges = dataframe[dataframe.start.str.contains('|'.join(start_tokens))]
        return edges

    def get_edges_to_end(self, end_tokens, dataframe=None):
        end_tokens = self.process_tokens(end_tokens)
        if dataframe is None:
            edges = self.df[self.df.end.str.contains('|'.join(end_tokens))]
        else:
            edges = dataframe[dataframe.end.str.contains('|'.join(end_tokens))]
        return edges

    def get_edges_by_relation(self, relation_tokens, dataframe=None):
        relation_tokens = self.process_tokens(relation_tokens, relation=True)
        if dataframe is None:
            edges = self.df[self.df.relation.str.contains('|'.join(relation_tokens))]
        else:
            edges = dataframe[dataframe.relation.str.contains('|'.join(relation_tokens))]
        return edges

    # Full query for all possible fields
    def get_query(self, start=None, end=None, relation=None, timing=False):
        if timing:
            start_time = time.time()
        edges = self.df
        if start is not None:
            # if not isinstance(start, list):
            #     start = [start]
            edges = self.get_edges_from_start(start, dataframe=edges)
        if end is not None:
            # if not isinstance(end, list):
            #     end = [end]
            edges = self.get_edges_to_end(end, dataframe=edges)
        if relation is not None:
            # if not isinstance(end, list):
            #     end = [end]
            edges = self.get_edges_by_relation(relation, dataframe=edges)
        # make a copy of small portion of data
        # you can then work on and change small queries without changing main
        edges = edges.copy()
        # reset indices - mainly because it looks much nicer
        edges.reset_index(drop=True, inplace=True)
        if timing:
            time_passed = time.time() - start_time
            print("Query returned in %.4f" % time_passed)
        return EdgeFrame(edges)

    def process_tokens(self, token_list, relation=False):
        if not isinstance(token_list, list):
            token_list = [token_list]
        processed_list = []
        for token in token_list:
            new_token = token
            # lower case as the concept net is
            if not relation:
                new_token = token.lower().replace(' ', '_')
                # Put regex such that word starts with / (like /c/en/word)
                # and ends up with / or nothing - in order to match exact words
                # Basically mach the exact word after two preceeding symbols
                # beginning with /
                new_token = ('^\\/[^\\/]*\\/[^\\/]*\\/' + new_token +
                             '\\/|^\\/[^\\/]*\\/[^\\/]*\\/' + new_token + '$')
            else:
                new_token = ('^\\/[^\\/]*\\/' + new_token +
                             '\\/|^\\/[^\\/]*\\/' + new_token + '$')
            processed_list.append(new_token)
        # print(processed_list)
        return processed_list

    def __len__(self):
        return len(self.df)


class EdgeFrame(ConceptNet):
    def __init__(self, dataframe):
        self.df = dataframe

    def get_raw_dataframe(self):
        return self.df

    def process_data(self):
        self.processed_df = self.df.copy()
        self.processed_df = self.processed_df.reindex(columns=(list(self.processed_df.columns.values) + ['startPoS', 'endPoS', 'startHypernym', 'endHypernym', 'startSurface', 'endSurface', 'surfaceText', 'weight']))
        # Deal with empty query case
        if len(self.processed_df) != 0:
            self.processed_df[['start', 'startPoS', 'startHypernym']] = self.processed_df[['start', 'startPoS', 'startHypernym']].apply(process_node_tokens, axis=1)
            self.processed_df[['end', 'endPoS', 'endHypernym']] = self.processed_df[['end', 'endPoS', 'endHypernym']].apply(process_node_tokens, axis=1)
            self.processed_df[['startSurface', 'endSurface', 'surfaceText', 'weight']] = self.processed_df[['JSON']].apply(process_JSON, axis=1)
            self.processed_df['relation'] = self.processed_df['relation'].map(process_relation)
        self.processed_df.drop(columns=['URI', 'JSON'], inplace=True)


def process_node_tokens(cols):
    # Check for URL and leave it as it is if exists in node
    if 'http' in cols[0]:
        split = cols[0]
    else:
        # Strip leading '/' and split by '/'s
        split = cols[0].strip('/').split('/')[2:]
    if not isinstance(split, list):
        split = [split]
    name, pos, hypernym = np.nan, np.nan, np.nan
    # Extract PoS tag or 'family' word based on how nodes are constructed
    if len(split) > 2:
        if split[2] in ['wp', 'wn']:
            hypernym = split[3]
            split = split[:-2]
    if len(split) > 1:
        pos = split[1]
    name = split[0]
    return pd.Series([name, pos, hypernym])


def process_relation(relation):
    # Relation is much easier cause of 2 possibilities
    split = relation.strip('/').split('/')
    if len(split) > 2:
        return split[2]
    else:
        return split[1]


def process_JSON(json_col):
    # Extract JSON data from database
    # Eval to go from str to dict
    json_col = eval(json_col.values[0])
    startSurface, endSurface, surfaceText, weight = np.nan, np.nan, np.nan, np.nan
    if 'surfaceStart' in json_col:
        startSurface = json_col['surfaceStart']
    if 'surfaceEnd' in json_col:
        endSurface = json_col['surfaceEnd']
    if 'surfaceText' in json_col:
        surfaceText = json_col['surfaceText']
    if 'weight' in json_col:
        weight = json_col['weight']

    series = [startSurface, endSurface, surfaceText, weight]

    return pd.Series(series)


def get_language_abbr(language):
    if language in lang_dict:
        return lang_dict[language]
    else:
        raise NotImplementedError('Language not implemented or not present')


assertion_dir = "/home/jack/Desktop/NN/clean/datasets/conceptnet/"
conceptnet = ConceptNet(assertion_dir+'assertions.csv', language='english', save_language=True)
conceptnet_pkl = ConceptNet(assertion_dir+'assertions_english.pkl')
query_result = conceptnet.get_query(start=['start'], end=['end1', 'end2'], relation='relation')


In [None]:
def get_antonym(start, end = [], relation =  "Antonym"):
    start_end = conceptnet.get_query(start=start, end=end, relation=relation)
    #print('Raw form of data:')
    #print(start_end.get_raw_dataframe().head())
    start_end.process_data()
    #print(start_end.processed_df.to_string())
    #print('You can call query on query, so that just by calling results on the previous answer, you obtain:')
    #print('You are now querying the database with only %d entries' % len(start_end))
    full_query = start_end.get_query(relation=relation, timing=True)
    full_query.process_data()
    #print(full_query.processed_df.to_string())
    #print(".. ", full_query.processed_df.groupby(['start','end', 'weight'])['weight'])
    max_weight_index =  None
    max_weight_index = full_query.processed_df.groupby(['start','end', 'weight'])['weight'].max().argmax()
    return full_query.processed_df['endSurface'][max_weight_index]

get_antonym("dog")

In [164]:
import requests

def get_antonyms_from_api(search_term, relation = "/r/Antonym"):
    response = requests.get('http://api.conceptnet.io/query?node=/c/en/<SEARCH_TERM>&rel=<RELATION_TERM>&limit=1000'.replace("<SEARCH_TERM>", search_term).replace("<RELATION_TERM>", relation))
    obj = response.json()
    return [(x["start"]["label"], x["weight"]) for x in obj["edges"]]



def select_max_weight_antonym(edge_list):
    if len(edge_list)>0:
        index_of_max = max([(x[1],index) for index,x in enumerate(edge_list)])[1]
        return edge_list[index_of_max][0]
    else:
        return None

def get_antonym(search_term, fallback = False):
    antonym_resp = get_antonyms_from_api(search_term)
    antonyms = select_max_weight_antonym(antonym_resp)
    if antonyms != None:
        return antonyms

    # Fallback
    if fallback:
        derived_resp = get_antonyms_from_api(search_term, "/r/DerivedFrom")
        derived = select_max_weight_antonym(derived_resp)
        if derived != None:
            return derived

    return None




In [150]:
print(get_antonym("said"))

resaid


In [80]:
def read_file(path):
    with open(path) as fp:
        lines = fp.read().splitlines()
    return lines

def get_attrs(text):
    return text.split("<CON_START>")[0].replace("<ATTR_WORDS>","").strip().split()

[]


In [172]:
''' Load Atts '''
data_dir = "/home/jack/Desktop/NN/clean/datasets/yelp"

ref0_own_Att = read_file(data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/reference_0.txt") # Reference data for delete_generate model
ref1_own_Att = read_file(data_dir+"/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/reference_1.txt") # Reference data for delete_generate model

ref_out_path = data_dir + "/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/"

print(get_attrs(ref0_own_Att[1]))

ref0_att_antonyms = []
ref1_att_antonyms = []

for atts in ref0_own_Att:
    att_antonyms = []
    for word in get_attrs(atts):
        antonym = get_antonym(word, True)
        if antonym != None:
            att_antonyms.append(antonym)
    ref0_att_antonyms.append(att_antonyms)

for atts in ref1_own_Att:
    att_antonyms = []
    for word in get_attrs(atts):
        antonym = get_antonym(word, True)
        if antonym != None:
            att_antonyms.append(antonym)
    ref1_att_antonyms.append(att_antonyms)


print(ref0_att_antonyms[:10])
print(ref1_att_antonyms[:10])

['definitely', 'venue']
[['better', 'better'], ['approximately', 'multivenue'], ['untasted', 'overwatered'], ['resaid'], ['ca', 'nonpharmacy'], ['right', 'tookest'], ['wonderful', 'much'], ['approximately', 'glad'], ['owner', 'heardest'], ['enough']]
[['large', 'destroy'], ['enjoyingly', 'mediocre'], ['affordable', 'evil'], ['sandwichy', 'preloved'], ['unsigned', 'fell'], ['approximately', 'recommend'], ['highly', 'recommend'], ['otherwise', 'mediocre'], ['evil', 'evil'], ['soot oh', 'popular']]


In [171]:
with open(ref_out_path+"reference_conceptnet_0.txt", 'w') as outfile:
    for index,words in enumerate(ref0_att_antonyms):
        if len(words) > 0:
            outfile.write("<ATTR_WORDS> "+ " ".join(words)+ " <CON_START>" +ref0_own_Att[index].split("<CON_START>")[1] +"\n")
        else: 
            outfile.write("<ATTR_WORDS> "+ "<CON_START>" +ref0_own_Att[index].split("<CON_START>")[1] +"\n")


with open(ref_out_path+"reference_conceptnet_1.txt", 'w') as outfile:
    for index,words in enumerate(ref1_att_antonyms):
        if len(words) > 0:
            outfile.write("<ATTR_WORDS> "+ " ".join(words)+ " <CON_START>" +ref1_own_Att[index].split("<CON_START>")[1] +"\n")
        else: 
            outfile.write("<ATTR_WORDS> " +"<CON_START>" +ref1_own_Att[index].split("<CON_START>")[1] +"\n")
        