In [None]:
from exprel.dataset.semeval_dataset import SemevalDataset
from dotenv import load_dotenv 
load_dotenv()

In [None]:
import os

# Location of the SemEval training file. Set SEMEVAL_TRAIN_PATH in the
# environment (for example via the .env file read by load_dotenv() in the
# first cell) to run the notebook on another machine; the original
# hard-coded path is kept as the default so existing setups are unaffected.
SEMEVAL_TRAIN_PATH = os.environ.get(
    "SEMEVAL_TRAIN_PATH",
    "/home/kovacs/projects/exp-relation-extraction/data/semeval_train.txt",
)
data = SemevalDataset(SEMEVAL_TRAIN_PATH)

In [None]:
df = data.to_dataframe()

In [None]:
len(df)

In [None]:
df.groupby("label").size()

In [None]:
from exprel.feature_extractor.extract import FeatureExtractor

extractor = FeatureExtractor()



In [None]:
one_versus_rest = data.one_versus_rest(df, "Entity-Destination(e1,e2)")

In [None]:
#one_versus_rest = one_versus_rest.sample(frac=0.2, random_state=1234).sample(frac=1.0, random_state=1234)

In [None]:
one_versus_rest

In [None]:
ids = one_versus_rest.sen_id.tolist()
sentences = one_versus_rest.sentence.tolist()
labels = one_versus_rest.one_versus_rest.tolist()

In [None]:
labels.count(1)

In [None]:
labels.count(0)

In [None]:
import os
import pickle

PIK = "pickle.dat"

# Load previously extracted graphs when the cache file exists. On a fresh
# checkout the file is absent; opening it unconditionally would raise
# FileNotFoundError and stop a top-to-bottom run before the extraction cell
# gets a chance to create it.
# SECURITY: pickle.load can execute arbitrary code -- only load a pickle.dat
# that was written by this notebook.
if os.path.exists(PIK):
    with open(PIK, "rb") as f:
        graphs = pickle.load(f)
else:
    print("{} not found; run the extraction cell below to build it".format(PIK))

In [None]:
import os
import pickle

from tqdm import tqdm

PIK = "pickle.dat"
# Set to True to ignore an existing cache file and re-run the extractor.
REBUILD_GRAPHS = False

graphs = None
if not REBUILD_GRAPHS and os.path.exists(PIK):
    # SECURITY: pickle.load can execute arbitrary code -- only load a
    # pickle.dat that was written by this notebook.
    with open(PIK, "rb") as f:
        graphs = pickle.load(f)
    # A cache built from a different sentence list cannot be paired with the
    # current ids/labels, so discard it and extract again.
    if len(graphs) != len(sentences):
        graphs = None

if graphs is None:
    graphs = []
    for sen in tqdm(sentences):
        # extract() returns a pair; only the first element is kept (the
        # second was bound to an otherwise unused name in the original).
        output, _ = extractor.extract(sen)
        graphs.append(output)

    with open(PIK, "wb") as f:
        pickle.dump(graphs, f)

In [None]:
len(graphs)

In [None]:
from exprel.models.model import GraphModel
model = GraphModel()

In [None]:
from tqdm import tqdm

# zip() silently stops at the shortest input, so a graph list of the wrong
# length (for instance one loaded from an out-of-date pickle.dat) would drop
# examples without any error. Fail loudly instead.
assert len(ids) == len(graphs) == len(labels), \
    "ids, graphs and labels must have the same length"

# total= is passed explicitly because a zip object has no len(), which
# otherwise leaves tqdm without a percentage or ETA.
# The loop variable is sen_id rather than id so the builtin id() is not shadowed.
for sen_id, graph, label in tqdm(zip(ids, graphs, labels), total=len(ids)):
    model.featurize_sen_graph(sen_id, graph, label, 3)

In [None]:
model.select_n_best(7000)

In [None]:
X, Y = model.get_x_y(one_versus_rest.one_versus_rest.tolist())

In [None]:
len(X[0])

In [None]:
from sklearn.model_selection import train_test_split as split

tr_data,tst_data,tr_labels,tst_labels = split(X,Y, test_size=0.3, random_state=1234)

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(tr_data, tr_labels)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

lr_pred = clf.predict(tst_data)

# Per the scikit-learn documentation, average=None yields one per-class array
# for each of precision, recall, F-score and support (in that order).
# Index 1 selects the second class of each array.
per_metric_arrays = precision_recall_fscore_support(
    tst_labels, lr_pred, average=None
)
for per_class_values in per_metric_arrays:
    print(per_class_values[1])

In [None]:
list(tr_labels).count(1)

In [None]:
list(tst_labels).count(1)

In [None]:
list(lr_pred).count(1)

In [None]:
import eli5

In [None]:
weights_df = eli5.explain_weights_df(clf)

In [None]:
weights_df

In [None]:
feature_graphs = model.get_feature_graphs()

In [None]:
model.inverse_relabel[55]

In [None]:
from graphviz import Digraph
from graphviz import Source
# Serialize one feature graph to DOT source text (to_dot returns a string,
# not a Digraph). The position in feature_graphs is looked up through
# model.inverse_relabel[734].
# NOTE(review): to_dot is defined in a later cell of this notebook, so on a
# fresh top-to-bottom run this cell raises NameError unless that cell has
# already been executed -- consider moving the definition above this cell.
# NOTE(review): Digraph is imported here but not used in this cell.
dot = to_dot(feature_graphs[model.inverse_relabel[734]])

In [None]:
Source(dot)

In [None]:
import re

# Single-pass character map used by d_clean. The replacement texts contain
# none of the mapped characters, so one translate() call gives the same
# result as applying the replacements one after another.
_D_CLEAN_TABLE = str.maketrans({
    **{c: '_' for c in '\\=@-,\'".!:;<>/{}[]()#^?'},
    '$': '_dollars',
    '%': '_percent',
    '|': ' ',
    '*': ' ',
})

# Words that get an "X" prefix so they are not emitted verbatim as node ids.
_D_CLEAN_KEYWORDS = ("graph", "node", "strict", "edge")


def d_clean(string):
    """Rewrite a label so it can be used as a node name in DOT output.

    Args:
        string: the raw label text.

    Returns:
        The cleaned label:
        * a label that is exactly ``'#'`` becomes ``'_number'``;
        * the punctuation characters ``\\=@-,'".!:;<>/{}[]()#^?`` become ``'_'``;
        * ``'$'`` becomes ``'_dollars'`` and ``'%'`` becomes ``'_percent'``;
        * ``'|'`` and ``'*'`` become a space;
        * a result that starts with an ASCII digit, or equals one of
          ``graph``, ``node``, ``strict``, ``edge``, is prefixed with ``'X'``.
    """
    # This check has to run before the punctuation is rewritten: '#' is one
    # of the characters mapped to '_', so testing afterwards (as the previous
    # version did) could never match and a bare '#' came out as '_'.
    if string == '#':
        return '_number'

    s = string.translate(_D_CLEAN_TABLE)
    if re.match('^[0-9]', s) or s in _D_CLEAN_KEYWORDS:
        s = "X" + s
    return s

def to_dot(graph, marked_nodes=None, integ=False):
    """Serialize a graph to Graphviz DOT source text.

    Args:
        graph: object exposing ``nodes(data=True)`` and ``edges(data=True)``
            (the notebook passes networkx graphs). Unless ``integ`` is true,
            every node's data dict must contain a ``"name"`` entry.
        marked_nodes: container of cleaned node names to highlight. ``None``
            (the default) highlights nothing.
        integ: when true, the DOT identifier of a node is derived from
            ``str(node)`` instead of the node's ``"name"`` attribute.

    Returns:
        The DOT document as one string. Node lines and edge lines are each
        sorted so the output is deterministic. Only edges whose data contains
        a ``"color"`` key are emitted; that value is used as the edge label.
    """
    # None sentinel instead of a mutable set() default shared by every call.
    if marked_nodes is None:
        marked_nodes = ()

    # The original multi-line literals used a backslash continuation, which
    # embeds the continuation line's 20-space indent in the emitted text.
    # It is reproduced explicitly so the DOT output stays byte-identical.
    wide_sep = u', ' + u' ' * 20

    node_lines = []
    node_to_name = {}
    for node, n_data in graph.nodes(data=True):
        d_node = d_clean(str(node)) if integ else d_clean(n_data["name"])
        node_to_name[node] = d_node
        is_marked = d_node in marked_nodes

        # First matching rule wins, in the same order as the original chain.
        if n_data.get('expanded') and is_marked:
            extra = wide_sep + u'style=filled, fillcolor=purple'
        elif n_data.get('expanded'):
            extra = wide_sep + u'style="filled"'
        elif n_data.get('fourlang'):
            extra = wide_sep + u'style="filled", fillcolor=red'
        elif n_data.get('substituted'):
            extra = wide_sep + u'style="filled"'
        elif is_marked:
            extra = u', style=filled, fillcolor=lightblue'
        else:
            extra = u''

        # No '-' -> '_' pass is needed on the finished line: d_clean already
        # maps '-' to '_' and the fixed text around it contains no '-'.
        node_lines.append(
            u'\t{0} [shape = circle, label = "{0}"{1}];'.format(d_node, extra))

    edge_lines = [
        u'\t{0} -> {1} [ label = "{2}" ];'.format(
            node_to_name[u], node_to_name[v], edata['color'])
        for u, v, edata in graph.edges(data=True)
        if 'color' in edata
    ]

    lines = [u'## First\ndigraph finite_state_machine {', '\tdpi=70;']
    lines += sorted(node_lines)
    lines += sorted(edge_lines)
    lines.append('}')
    return u'\n'.join(lines)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
chi2_selector = SelectKBest(chi2, k=10)
X_kbest = chi2_selector.fit_transform(X, Y)

In [None]:
chi2_selector.get_support(indices=True)

In [None]:
# Print the edge count of every feature graph that does not have exactly one edge.
for feature_graph in feature_graphs:
    edge_count = len(feature_graph.edges())
    if edge_count != 1:
        print(edge_count)

In [None]:
dot = to_dot(feature_graphs[2450])
Source(dot)

In [None]:
dot = to_dot(graphs[0])
Source(dot)

In [None]:
tests = []

# Every item yielded for the first sentence graph is printed in full; the
# element at index 1 of each item is collected so later cells can render it.
for lex_item in model.lexgraphs.gen_lex_subgraphs(graphs[0], 1):
    print(lex_item)
    tests.append(lex_item[1])

In [None]:
dot = to_dot(tests[3])
Source(dot)

In [None]:
from networkx.convert import from_dict_of_dicts as fdd
from networkx.convert import to_dict_of_dicts as tdd

H_dict = tdd(graphs[0])
H_dict

In [None]:
import networkx as nx
G = nx.MultiDiGraph()

fdd(H_dict,create_using=nx.MultiDiGraph()).edges(data=True)

In [None]:
dot = to_dot(graphs[0], integ=True)
Source(dot)

In [None]:
def gen_subgraphs(M, no_edges):
    """Yield subgraphs of M, grown one edge at a time.

    M must be dict of dicts, see networkx.convert.to_dict_of_dicts.
    Generates dicts of dicts, use networkx.convert.from_dict_of_dicts
    to turn them back into graphs.

    For ``no_edges == 0`` one single-node graph ``{v: {}}`` is yielded per
    node of M. Otherwise every subgraph generated for ``no_edges - 1`` is
    extended by one edge ``node -> neighbor`` of M that is not already in it
    and whose ``node`` or ``neighbor`` is already a key of the subgraph.
    For ``no_edges >= 2`` the smaller subgraphs from the recursion are
    yielded as well. The same subgraph can be yielded more than once.

    Every yielded graph owns its adjacency dicts, so a caller may keep or
    modify it without affecting later results. The edge data objects
    themselves are shared with M.
    """
    if no_edges == 0:
        yield from ({v: {}} for v in M)
        return

    for s_graph in gen_subgraphs(M, no_edges - 1):
        if no_edges >= 2:
            yield s_graph
        for node, neighbors in M.items():
            for neighbor, edge in neighbors.items():
                # Edge is already part of this subgraph.
                if node in s_graph and neighbor in s_graph[node]:
                    continue
                # Edge does not touch the subgraph.
                # NOTE(review): only dict keys count as "in the subgraph", so
                # a node reached solely as an edge target is not seen here --
                # confirm whether edges leaving such a node should be allowed.
                if node not in s_graph and neighbor not in s_graph:
                    continue

                # Copy the inner adjacency dicts too. A shallow s_graph.copy()
                # shares them, so adding the edge would also modify s_graph
                # and every graph yielded from it earlier, which made later
                # extensions accumulate extra edges.
                new_graph = {
                    src: dict(targets) for src, targets in s_graph.items()
                }
                new_graph.setdefault(node, {})[neighbor] = edge
                yield new_graph

In [None]:
# Show each generated subgraph twice: as the raw dict of dicts, and as the
# edge list of the MultiDiGraph rebuilt from it.
for subgraph_dict in gen_subgraphs(H_dict, 1):
    print(subgraph_dict)
    sgraph = fdd(subgraph_dict, create_using=nx.MultiDiGraph())
    print(sgraph.edges(data=True))