In [None]:
from exprel.dataset.semeval_dataset import SemevalDataset
import exprel.utils
from dotenv import load_dotenv 
load_dotenv()

In [None]:
import os

# Location of the SemEval training file. The default is the original
# machine-specific path, but it can be overridden with the SEMEVAL_TRAIN_PATH
# environment variable (for example from the .env file read by load_dotenv()
# in the first cell) so the notebook can run on other machines.
SEMEVAL_TRAIN_PATH = os.environ.get(
    "SEMEVAL_TRAIN_PATH",
    "/home/kovacs/projects/exp-relation-extraction/data/semeval_train.txt",
)
data = SemevalDataset(SEMEVAL_TRAIN_PATH)

In [None]:
# Load the graphs stored in "pickle.dat" into the dataset object.
data.load_graphs("pickle.dat")

In [None]:
# Convert the dataset to a DataFrame; the columns read further down are
# sen_id, sentence, label, label_id and graph.
df = data.to_dataframe()

In [None]:
df

In [None]:
# Build the one-vs-rest frame for the "Entity-Destination(e1,e2)" relation.
# NOTE(review): presumably all other relations are collapsed into one negative
# class -- confirm against SemevalDataset.one_versus_rest.
one_versus_rest = data.one_versus_rest(df, "Entity-Destination(e1,e2)")

In [None]:
# Number of rows per label value.
one_versus_rest.groupby("label").size()

In [None]:
# Distinct label values. NOTE: `labels` is rebound to the per-row label ids in
# the next cell, so this list is not used afterwards.
labels = one_versus_rest.groupby("label").size().index.tolist()

In [None]:
import pandas as pd

# Parallel per-row lists taken from the one-vs-rest frame; they are zipped
# together by the featurization loop below.
ids = pd.to_numeric(one_versus_rest.sen_id).tolist()
sentences = one_versus_rest.sentence.tolist()
labels = one_versus_rest.label_id.tolist()
postprocessed_graphs = one_versus_rest.graph.tolist()

In [None]:
from exprel.feature_extractor.extract import FeatureExtractor
from exprel.models.model import GraphModel

# `extractor` is used near the end of the notebook to match individual rules
# against the validation graphs; `model` collects the graph features below.
extractor = FeatureExtractor()
model = GraphModel()

In [None]:
from tqdm import tqdm

# Feed every (sentence id, graph, label id) triple to the model.
# zip() has no length, so tqdm is given the total explicitly; without it the
# progress bar cannot show a percentage or an ETA.
# NOTE(review): the trailing 3 is passed through unchanged -- presumably the
# maximum feature-subgraph size; confirm against GraphModel.featurize_sen_graph.
for ind, graph, label in tqdm(zip(ids, postprocessed_graphs, labels), total=len(ids)):
    model.featurize_sen_graph(ind, graph, label, 3)

In [None]:
# Inspect the model's vocabulary size after featurization.
model.vocab_size

In [None]:
# Feature graphs collected by the model; indexed by position in later cells.
feature_graphs = model.get_feature_graphs()

In [None]:
# NOTE(review): judging by the name this keeps the 400 best features of each
# class -- confirm against GraphModel.select_n_best_from_each_class.
model.select_n_best_from_each_class(400, feature_graphs)

In [None]:
one_versus_rest.label

In [None]:
# Feature matrix and targets built from the one-vs-rest labels.
X, Y = model.get_x_y(one_versus_rest.label.tolist())

In [None]:
from sklearn.model_selection import train_test_split as split

# 70/30 train/test split with a fixed seed. The same test_size and
# random_state are reused later to split the one_versus_rest frame.
tr_data,tst_data,tr_labels,tst_labels = split(X,Y, test_size=0.3, random_state=1234)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the training split. The decision-tree variant
# below is kept commented out as an alternative.
clf = LogisticRegression(random_state=0).fit(tr_data, tr_labels)
#clf = DecisionTreeClassifier(random_state=0, max_depth=4).fit(tr_data, tr_labels)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
# Label names known to the model; also used by the feature-collection cell below.
keys = model.label_vocab.word_to_id.keys()
# NOTE: `labels_to_result` and `prf` are assigned here but not read by any
# cell visible in this notebook.
labels_to_result = {}
lr_pred = clf.predict(tst_data)
# average=None returns one precision/recall/F-score/support value per class.
prf = precision_recall_fscore_support(tst_labels, lr_pred, average=None)
# NOTE(review): target_names=keys assumes the iteration order of
# label_vocab.word_to_id matches the sorted label values in tst_labels -- confirm.
s = classification_report(tst_labels, lr_pred, target_names=keys, output_dict=True)

In [None]:
s

In [None]:
"""
import numpy as np

importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print(indices)
"""

In [None]:
"""
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))

from sklearn.tree import plot_tree
t = plot_tree(clf, filled=True, fontsize=14)
"""

In [None]:
import eli5

In [None]:
# Classifier weights as a DataFrame; the `target` and `feature` columns are
# read below. Feature names are stripped of "x" before being used as integer
# column indices, and a "<BIAS>" row is special-cased.
weights_df = eli5.explain_weights_df(clf)

In [None]:
weights_df.target.unique()

In [None]:
# String form of the feature graphs, looked up by the same position as
# `feature_graphs` in the next cell.
feature_graph_strings = model.get_feature_graph_strings()

In [None]:
from collections import defaultdict

# label name -> list of (feature graph string, label word) pairs, built from
# the first five weight rows of every target in weights_df.
features = defaultdict(list)

# NOTE: the loop variable is deliberately still called `target`; a later cell
# reads it after this loop has finished.
for target in weights_df.target.unique():
    top_rows = weights_df[weights_df.target == target].iloc[:5]
    for feature_name in top_rows.feature.str.strip("x").tolist():
        # The intercept row has no feature graph behind it.
        if feature_name == "<BIAS>":
            continue
        graph_index = model.inverse_relabel[int(feature_name)]
        # Skip feature graphs without edges.
        if not len(feature_graphs[graph_index].edges()):
            continue
        label_name = list(keys)[int(target)]
        label_word = model.label_vocab.id_to_word[int(target)]
        features[label_name].append((feature_graph_strings[graph_index], label_word))

In [None]:
import json

# Persist the collected rules; the (graph string, label) tuples are written
# as JSON arrays.
with open("features.json", "w+") as f:
    json.dump(features, f)

In [None]:
# Split the one_versus_rest frame with the same test_size and random_state as
# the X/Y split above and keep only the held-out part.
# NOTE(review): presumably `val` is meant to hold the rows behind tst_data;
# that only holds if X/Y rows are in the same order as one_versus_rest -- confirm.
_, val = split(one_versus_rest, test_size=0.3, random_state=1234) 

In [None]:
# `target` here is the leftover loop variable of the feature-collection cell.
model.label_vocab.id_to_word[int(target)]

In [None]:
val_graphs = val.graph.tolist()
# Gold labels from the `one_versus_rest` column; the evaluation loop below
# compares them against 0 and 1.
val_labels = val.one_versus_rest.tolist()

In [None]:
val_df = val.copy()

In [None]:
# Drop the `one_versus_rest` column from the copy before it is pickled.
del val_df["one_versus_rest"]

In [None]:
val_df.to_pickle("validation_dataset")

In [None]:
val_df

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# `features` is built earlier as {label name: [(graph string, label), ...]}.
# Iterating that mapping directly yields its label-name keys, so feat[0] would
# be a single character and the matcher would receive a bare string. Flatten
# it into the individual rules first; a plain sequence of rules is used as-is.
# NOTE(review): assumes extractor.set_matcher takes a list of
# (graph string, label) rules -- confirm against FeatureExtractor.
if isinstance(features, dict):
    feature_rules = [rule for rules in features.values() for rule in rules]
else:
    feature_rules = list(features)

# One entry per rule: [feature, precision, recall, fscore, support, false positives].
measure_features = []

for feat in feature_rules:
    measure = [feat[0]]
    extractor.set_matcher([feat])
    false_pos = []
    val_predicted = []
    for i, g in enumerate(val_graphs):
        # The prediction is the last value the matcher returns for this graph,
        # or 0 when nothing matches. The loop variable no longer shadows `feat`.
        label = 0
        for matched in extractor.matcher.match(g):
            label = matched
        if label == 1 and val_labels[i] == 0:
            false_pos.append(g)
        val_predicted.append(label)
    # average=None returns per-class arrays; index 1 picks the second class
    # (label 1 when the labels are 0/1, as the comparison above assumes).
    for pcf in precision_recall_fscore_support(val_labels, val_predicted, average=None):
        measure.append(pcf[1])
    measure.append(false_pos)

    measure_features.append(measure)

In [None]:
import pandas as pd

# One row per evaluated rule, in the order the values were appended to each
# `measure` list above.
# NOTE: this rebinds `df`, which earlier held the frame returned by
# data.to_dataframe(); re-running the one-vs-rest cell after this one would
# pick up the wrong frame.
df = pd.DataFrame(measure_features, columns = ['Feature', 'Precision', 'Recall', "Fscore", "Support", "False_positives"]) 

In [None]:
df

In [None]:
df.to_pickle("rules_examine")

In [None]:
from graphviz import Source
# NOTE(review): to_dot is defined in a later cell of this notebook, so this
# cell raises NameError on a fresh top-to-bottom run.
Source(to_dot(val_graphs[1537]))

In [None]:
# The validation row belonging to the graph rendered above.
val.iloc[1537]

In [None]:
import networkx as nx
all_features = nx.MultiDiGraph()

In [None]:
# Merge the feature graphs behind the first ten rows of weights_df into the
# single `all_features` graph.
for i, t in enumerate(weights_df.iloc[:10].feature.str.strip("x").tolist()):
    if t == "<BIAS>":
        # The intercept row has no feature graph, and int("<BIAS>") would raise.
        # The enumerate index is kept, so the suffixes of other rows are unchanged.
        continue
    g = feature_graphs[model.inverse_relabel[int(t)]]
    # Suffix every node name with the row index so that equally named nodes
    # coming from different feature graphs stay separate in the merged graph.
    node_to_string = {
        node: attrs["name"] + "_feature" + str(i)
        for node, attrs in g.nodes(data=True)
    }
    all_features.add_nodes_from(node_to_string.values())

    for u, v, edata in g.edges(data=True):
        all_features.add_edge(node_to_string[u], node_to_string[v], color=edata["color"])
        

In [None]:
from graphviz import Digraph
from graphviz import Source
# Build the DOT source for the merged feature graph. to_dot returns a string,
# not a Digraph object; integ=True derives node ids from the node keys.
# NOTE(review): to_dot is defined in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run.
dot = to_dot(all_features, integ=True)
Source(dot)

In [None]:
from graphviz import Digraph
from graphviz import Source
# DOT source for the feature graph behind feature column 42. The result is
# only assigned here, not rendered.
dot = to_dot(feature_graphs[model.inverse_relabel[42]])

In [None]:
# Feature graphs behind the first ten rows of weights_df, rendered together
# with one cluster per graph.
dots = []

for t in weights_df.iloc[:10].feature.str.strip("x").tolist():
    if t == "<BIAS>":
        # The intercept row has no feature graph, and int("<BIAS>") would raise.
        continue
    dots.append(feature_graphs[model.inverse_relabel[int(t)]])

Source(to_dots(dots))

In [None]:
import re

def d_clean(string):
    """Turn a node name into a string that to_dot/to_dots use as a DOT node id.

    Punctuation characters are replaced by '_', '$' and '%' are spelled out,
    '|' and '*' become spaces, and a result that starts with a digit or equals
    one of the words "graph", "node", "strict" or "edge" is prefixed with 'X'.

    Args:
        string: the raw name.

    Returns:
        The cleaned name. The lone name '#' maps to '_number'.
    """
    # This check has to come before the substitutions below: '#' is one of the
    # replaced characters, so testing the cleaned string (as the code used to)
    # could never match and '#' silently became '_'.
    if string == '#':
        return '_number'
    s = string
    for c in '\\=@-,\'".!:;<>/{}[]()#^?':
        s = s.replace(c, '_')
    s = s.replace('$', '_dollars')
    s = s.replace('%', '_percent')
    s = s.replace('|', ' ')
    s = s.replace('*', ' ')
    keywords = ("graph", "node", "strict", "edge")
    if re.match('^[0-9]', s) or s in keywords:
        s = "X" + s
    return s

def to_dots(graphs, marked_nodes=set(), integ=False):
    """Render several graphs into one DOT document, one cluster per graph.

    Args:
        graphs: iterable of graphs exposing ``nodes(data=True)`` and
            ``edges(data=True)`` in the networkx style.
        marked_nodes: cleaned node names to highlight. The default set is
            shared between calls but is only read here, never modified.
        integ: if true, node ids are derived from the node keys; otherwise
            from each node's ``"name"`` attribute.

    Returns:
        The DOT source as one string. Only edges that carry a ``"color"``
        attribute are emitted, with that attribute as the edge label.
    """
    lines = [u'digraph finite_state_machine {', '\tdpi=70;']
    # lines.append('\tordering=out;')
    # sorting everything to make the process deterministic
    for i, graph in enumerate(graphs):
        # Clusters 1..26 keep their letter ids A..Z. Later clusters use their
        # number instead, because the characters following 'Z' ('[', '\\', ...)
        # are not valid in an unquoted DOT id.
        cluster_id = chr(ord('@') + i + 1) if i < 26 else str(i + 1)
        # The header is emitted before the node lines. It used to be sorted
        # together with them, and because node lines start with a tab they
        # sorted first, which put the node declarations outside their cluster.
        lines.append("subgraph cluster_" + cluster_id + " {")

        node_lines = []
        node_to_name = {}
        for node, n_data in graph.nodes(data=True):
            if integ:
                d_node = d_clean(str(node))
            else:
                d_node = d_clean(n_data["name"])
            printname = d_node
            node_to_name[node] = printname
            # The first matching attribute decides the node style.
            if n_data.get('expanded') and printname in marked_nodes:
                node_line = u'\t{0} [shape = circle, label = "{1}", \
                        style=filled, fillcolor=purple];'.format(
                    d_node, printname).replace('-', '_')
            elif n_data.get('expanded'):
                node_line = u'\t{0} [shape = circle, label = "{1}", \
                        style="filled"];'.format(
                    d_node, printname).replace('-', '_')
            elif n_data.get('fourlang'):
                node_line = u'\t{0} [shape = circle, label = "{1}", \
                        style="filled", fillcolor=red];'.format(
                    d_node, printname).replace('-', '_')
            elif n_data.get('substituted'):
                node_line = u'\t{0} [shape = circle, label = "{1}", \
                        style="filled"];'.format(
                    d_node, printname).replace('-', '_')
            elif printname in marked_nodes:
                node_line = u'\t{0} [shape = circle, label = "{1}", style=filled, fillcolor=lightblue];'.format(
                    d_node, printname).replace('-', '_')
            else:
                node_line = u'\t{0} [shape = circle, label = "{1}"];'.format(
                    d_node, printname).replace('-', '_')
            node_lines.append(node_line)
        lines += sorted(node_lines)

        edge_lines = []
        for u, v, edata in graph.edges(data=True):
            if 'color' in edata:
                d_node1 = node_to_name[u]
                d_node2 = node_to_name[v]
                edge_lines.append(
                    u'\t{0} -> {1} [ label = "{2}" ];'.format(d_node1, d_node2, edata['color']))

        lines += sorted(edge_lines)
        lines.append('}')
    lines.append('}')
    return u'\n'.join(lines)

def to_dot(graph, marked_nodes=set(), integ=False):
    """Render a single graph as DOT source text.

    Args:
        graph: graph exposing ``nodes(data=True)`` and ``edges(data=True)``
            in the networkx style.
        marked_nodes: cleaned node names to highlight (only read here).
        integ: if true, node ids come from the node keys; otherwise from each
            node's ``"name"`` attribute.

    Returns:
        The DOT source as one string. Node lines and edge lines are each
        sorted so the output is deterministic; only edges carrying a
        ``"color"`` attribute are emitted, labelled with that attribute.
    """
    names = {}
    node_lines = []
    for node, n_data in graph.nodes(data=True):
        d_node = d_clean(str(node)) if integ else d_clean(n_data["name"])
        names[node] = d_node
        # Pick the line template from the first matching node attribute.
        if n_data.get('expanded') and d_node in marked_nodes:
            template = u'\t{0} [shape = circle, label = "{1}", \
                    style=filled, fillcolor=purple];'
        elif n_data.get('expanded'):
            template = u'\t{0} [shape = circle, label = "{1}", \
                    style="filled"];'
        elif n_data.get('fourlang'):
            template = u'\t{0} [shape = circle, label = "{1}", \
                    style="filled", fillcolor=red];'
        elif n_data.get('substituted'):
            template = u'\t{0} [shape = circle, label = "{1}", \
                    style="filled"];'
        elif d_node in marked_nodes:
            template = u'\t{0} [shape = circle, label = "{1}", style=filled, fillcolor=lightblue];'
        else:
            template = u'\t{0} [shape = circle, label = "{1}"];'
        # The cleaned name serves both as the node id and as its label.
        node_lines.append(template.format(d_node, d_node).replace('-', '_'))

    edge_lines = [
        u'\t{0} -> {1} [ label = "{2}" ];'.format(names[u], names[v], edata['color'])
        for u, v, edata in graph.edges(data=True)
        if 'color' in edata
    ]

    out = [u'digraph finite_state_machine {', '\tdpi=70;']
    out.extend(sorted(node_lines))
    out.extend(sorted(edge_lines))
    out.append('}')
    return u'\n'.join(out)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Select the 10 columns of X that score highest on the chi-squared test
# against Y.
chi2_selector = SelectKBest(chi2, k=10)
X_kbest = chi2_selector.fit_transform(X, Y)

In [None]:
# Column indices of the selected features.
chi2_selector.get_support(indices=True)

In [None]:
# Print the edge count of every feature graph that does not have exactly one edge.
for i, g in enumerate(feature_graphs):
    if len(g.edges()) != 1:
        print(len(g.edges()))

In [None]:
dot = to_dot(feature_graphs[2450])
Source(dot)

In [None]:
# NOTE(review): `graphs` is only assigned in the pickle-loading cell further
# down, so this cell and the next one raise NameError on a fresh
# top-to-bottom run.
dot = to_dot(graphs[0])
Source(dot)

In [None]:
tests = []

# Collect the second element of every item the generator yields for the first
# graph; those elements are rendered with to_dot in the next cell.
for l in model.lexgraphs.gen_lex_subgraphs(graphs[0], 1):
    print(l)
    tests.append(l[1])

In [None]:
dot = to_dot(tests[3])
Source(dot)

In [None]:
from networkx.convert import from_dict_of_dicts as fdd
from networkx.convert import to_dict_of_dicts as tdd

# Dict-of-dicts form of the first graph.
# NOTE(review): `graphs` is assigned in the next cell, so this cell depends on
# out-of-order execution.
H_dict = tdd(graphs[0])
H_dict

In [None]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# open "pickle.dat" files that come from a trusted source.
with open("pickle.dat", "rb") as f:
    graphs = pickle.load(f)

In [None]:
# Print every node of the first graph together with its attribute dict.
for n in graphs[0].nodes(data=True):
    print(n)

In [None]:
import networkx as nx
# NOTE: this empty graph is not used; `G` is rebound to a subgraph two cells below.
G = nx.MultiDiGraph()

# Rebuild a MultiDiGraph from the dict-of-dicts form created above.
C = fdd(H_dict,create_using=nx.MultiDiGraph())

In [None]:
from graphviz import Digraph
from graphviz import Source

# NOTE(review): despite its name, `outdeg` holds the result of degree(); for a
# networkx directed graph that counts incoming and outgoing edges -- confirm
# the graph type.
outdeg = graphs[7].degree()
print(outdeg)

# Keep only the nodes with at least one incident edge and render that subgraph.
to_keep = [n for (n, deg) in outdeg if deg != 0]
G = graphs[7].subgraph(to_keep)
dot = to_dot(G, integ=False)
Source(dot)

In [None]:
import networkx as nx


def gen_subgraphs(M, no_edges):
    """Generate sub-structures of ``M`` that contain at most ``no_edges`` edges.

    M must be dict of dicts, see networkx.convert.to_dict_of_dicts.
    Generates dicts of dicts, use networkx.convert.from_dict_of_dicts to turn
    them back into graphs.

    With ``no_edges == 0`` one single-node dict ``{v: {}}`` is yielded per key
    of ``M``. Each further level yields every subgraph of the previous level
    again, plus one copy per edge of ``M`` that is not yet in it and has its
    source or its target among the subgraph's keys. The same subgraph may be
    yielded more than once.

    Args:
        M: mapping ``node -> {neighbor: edge data}``.
        no_edges: maximum number of edges per generated subgraph.

    Yields:
        dict of dicts; every yielded dict owns its own inner dicts.

    Raises:
        ValueError: if ``no_edges`` is negative (raised on first iteration).
    """
    if no_edges < 0:
        # A negative budget would otherwise recurse without ever hitting the base case.
        raise ValueError("no_edges must be non-negative")
    if no_edges == 0:
        yield from ({v: {}} for v in M)
        return
    for s_graph in gen_subgraphs(M, no_edges - 1):
        yield s_graph
        for node in M:
            for neighbor, edge in M[node].items():
                # Skip edges that are already part of the subgraph.
                if node in s_graph and neighbor in s_graph[node]:
                    continue
                # Skip edges that touch none of the subgraph's keys.
                if node not in s_graph and neighbor not in s_graph:
                    continue

                # Copy the inner dicts as well. A shallow s_graph.copy() shares
                # them, so adding the edge would also modify s_graph (already
                # yielded) and let later candidates accumulate extra edges.
                new_graph = {src: dict(nbrs) for src, nbrs in s_graph.items()}
                new_graph.setdefault(node, {})[neighbor] = edge
                yield new_graph

In [None]:
# DOT source of every generated item whose graph (second element) has at
# least one edge, for the subgraph `G` built above.
sgraphs = []
for l in model.lexgraphs.gen_lex_subgraphs(G, 1):
    if l[1].edges():
        sgraphs.append(to_dot(l[1]))

In [None]:
len(sgraphs)

In [None]:
Source(sgraphs[7])

In [None]:
from collections import Counter

# Scratch check of Counter behaviour.
# NOTE: this rebinds `s`, which earlier held the classification report dict.
s = Counter()

s["banana"] += 1

In [None]:
s.most_common(1)