In [None]:
import re

def d_clean(string):
    """Sanitize a label so it can be used as a node ID in DOT output.

    Punctuation is replaced by ``_``, ``$`` and ``%`` are spelled out, ``|``
    and ``*`` become spaces, and a result that starts with a digit or
    collides with a DOT keyword gets an ``X`` prefix.

    Args:
        string: the raw label text.

    Returns:
        The cleaned label; ``"_number"`` for a bare ``#`` and ``"None"`` when
        nothing is left (empty input).
    """
    # A bare '#' has to be recognised before the punctuation pass: '#' is in
    # the replacement list below, so a check made afterwards can never match.
    if string == '#':
        return '_number'

    s = string
    for c in '\\=@-,\'".!:;<>/{}[]()#^?':
        s = s.replace(c, '_')
    s = s.replace('$', '_dollars')
    s = s.replace('%', '_percent')
    # NOTE(review): these two leave spaces in the ID, which an unquoted DOT ID
    # does not allow -- confirm this is intended.
    s = s.replace('|', ' ')
    s = s.replace('*', ' ')

    # DOT keywords are case-independent, and digraph/subgraph are reserved as
    # well; an unprefixed "graph [...]" line would be read as an attribute
    # statement instead of a node.
    keywords = ("graph", "node", "strict", "edge", "digraph", "subgraph")
    if re.match('^[0-9]', s) or s.lower() in keywords:
        s = "X" + s

    if not s:
        return "None"
    return s

def to_dots(graphs, marked_nodes=None, integ=False):
    """Render several graphs into one DOT digraph, one cluster per graph.

    Args:
        graphs: iterable of graph objects exposing ``nodes(data=True)`` and
            ``edges(data=True)`` (the only calls made on them here).
        marked_nodes: collection of cleaned node names to highlight; ``None``
            (the default) highlights nothing.
        integ: when true, node IDs are built from ``str(node)``; otherwise
            from each node's ``"name"`` attribute.

    Returns:
        The DOT source as one newline-joined string. Only edges that carry a
        ``'color'`` attribute are emitted, labelled with that value.
    """
    marked = marked_nodes if marked_nodes is not None else ()
    lines = [u'digraph finite_state_machine {', '\tdpi=70;']
    # lines.append('\tordering=out;')
    for i, graph in enumerate(graphs):
        # Clusters are lettered A..Z as before; past that chr() would yield
        # punctuation, which is not a valid DOT ID, so fall back to a number.
        cluster_id = chr(ord('A') + i) if i < 26 else str(i + 1)
        # The header is appended directly: sorting it together with the node
        # lines (which all start with a tab) would push it below the nodes and
        # leave them outside their cluster.
        lines.append("subgraph cluster_" + cluster_id + " {")

        node_lines = []
        node_to_name = {}
        for node, n_data in graph.nodes(data=True):
            d_node = d_clean(str(node)) if integ else d_clean(n_data["name"])
            node_to_name[node] = d_node
            # First matching rule wins; the order mirrors the priority of the
            # node attributes.
            if n_data.get('expanded') and d_node in marked:
                style = ', style=filled, fillcolor=purple'
            elif n_data.get('expanded'):
                style = ', style="filled"'
            elif n_data.get('fourlang'):
                style = ', style="filled", fillcolor=red'
            elif n_data.get('substituted'):
                style = ', style="filled"'
            elif d_node in marked:
                style = ', style=filled, fillcolor=lightblue'
            else:
                style = ''
            # The cleaned name serves as both the node ID and its label.
            node_lines.append(
                u'\t{0} [shape = circle, label = "{0}"{1}];'.format(
                    d_node, style).replace('-', '_'))

        edge_lines = [
            u'\t{0} -> {1} [ label = "{2}" ];'.format(
                node_to_name[u], node_to_name[v], edata['color'])
            for u, v, edata in graph.edges(data=True)
            if 'color' in edata
        ]

        # sorting everything to make the process deterministic
        lines += sorted(node_lines)
        lines += sorted(edge_lines)
        lines.append('}')
    lines.append('}')
    return u'\n'.join(lines)

def to_dot(graph, marked_nodes=None, integ=False):
    """Render a single graph as a DOT digraph.

    Args:
        graph: graph object exposing ``nodes(data=True)`` and
            ``edges(data=True)`` (the only calls made on it here).
        marked_nodes: collection of cleaned node names to highlight; ``None``
            (the default) highlights nothing.
        integ: when true, node IDs are built from ``str(node)``; otherwise
            from each node's ``"name"`` attribute.

    Returns:
        The DOT source as one newline-joined string. Only edges that carry a
        ``'color'`` attribute are emitted, labelled with that value.
    """
    marked = marked_nodes if marked_nodes is not None else ()
    lines = [u'digraph finite_state_machine {', '\tdpi=70;']
    # lines.append('\tordering=out;')

    node_lines = []
    node_to_name = {}
    for node, n_data in graph.nodes(data=True):
        d_node = d_clean(str(node)) if integ else d_clean(n_data["name"])
        node_to_name[node] = d_node
        # First matching rule wins; the order mirrors the priority of the
        # node attributes. The attribute text is assembled from one-line
        # pieces so no source indentation ends up inside the emitted DOT.
        if n_data.get('expanded') and d_node in marked:
            style = ', style=filled, fillcolor=purple'
        elif n_data.get('expanded'):
            style = ', style="filled"'
        elif n_data.get('fourlang'):
            style = ', style="filled", fillcolor=red'
        elif n_data.get('substituted'):
            style = ', style="filled"'
        elif d_node in marked:
            style = ', style=filled, fillcolor=lightblue'
        else:
            style = ''
        # The cleaned name serves as both the node ID and its label.
        node_lines.append(
            u'\t{0} [shape = circle, label = "{0}"{1}];'.format(
                d_node, style).replace('-', '_'))

    edge_lines = [
        u'\t{0} -> {1} [ label = "{2}" ];'.format(
            node_to_name[u], node_to_name[v], edata['color'])
        for u, v, edata in graph.edges(data=True)
        if 'color' in edata
    ]

    # sorting everything to make the process deterministic
    lines += sorted(node_lines)
    lines += sorted(edge_lines)
    lines.append('}')
    return u'\n'.join(lines)

In [None]:
from exprel.dataset.hasoc_dataset import HasocDataset
from exprel.models.utils import tree_to_code
from dotenv import load_dotenv 
import pandas as pd
# Read key=value pairs from a .env file into the process environment (python-dotenv).
load_dotenv()

In [None]:
import os

# Directory holding the HASOC CSV files. The default is the original location;
# set HASOC_DATA_DIR to run the notebook from another machine or checkout.
DATA_DIR = os.environ.get(
    "HASOC_DATA_DIR", "/home/kovacs/projects/exp-relation-extraction/data")

# Tab-separated train/test files, each wrapped in the project's dataset class.
df_train = pd.read_csv(os.path.join(DATA_DIR, "hasoc_2021_train_normalized.csv"), delimiter="\t")
df_test = pd.read_csv(os.path.join(DATA_DIR, "hasoc_2021_test_normalized.csv"), delimiter="\t")
train_data = HasocDataset(df_train)
test_data = HasocDataset(df_test)

In [None]:
import pandas as pd
# Show up to 200 characters per column so long text cells are not cut off in DataFrame output.
pd.options.display.max_colwidth = 200

In [None]:
from exprel.feature_extractor.extract import FeatureExtractor
from exprel.models.model import GraphModel

# NOTE(review): in the cells visible here `extractor` is only referenced by a
# commented-out parse_graphs call -- confirm it is still needed.
extractor = FeatureExtractor(lang="en", cache_fn="en_nlp_cache")
# Two separate models: `model` is featurized on the training graphs,
# `test_model` on the test graphs.
model = GraphModel()
test_model = GraphModel()

In [None]:
import os

# Directory holding the pre-parsed AMR graphs. The default is the original
# location; set HASOC_GRAPH_DIR to run the notebook from another machine.
GRAPH_DIR = os.environ.get(
    "HASOC_GRAPH_DIR", "/home/kovacs/projects/exp-relation-extraction/notebooks/graphs")

# NOTE(review): the .pickle extension suggests load_graphs unpickles these
# files -- only point this at files you trust.
train_data.load_graphs(os.path.join(GRAPH_DIR, "hasoc2021_train_amr.pickle"))
test_data.load_graphs(os.path.join(GRAPH_DIR, "hasoc2021_test_amr.pickle"))
#graphs = data.parse_graphs(extractor, format="fourlang")

In [None]:
# Rebind the frame names to the datasets' own DataFrame view (this replaces the
# raw CSV frames). Later cells read the `preprocessed_text`, `task2`,
# `task2_id` and `graph` columns from these.
df_train = train_data.to_dataframe()
df_test = test_data.to_dataframe()

In [None]:
# Display the training frame.
df_train

In [None]:
import pandas as pd

# Parallel lists taken column-wise from the training frame: numeric index
# values, text, label ids and graphs.
# NOTE(review): `sentences` is not used by any later cell visible here.
ids = pd.to_numeric(df_train.index).tolist()
sentences = df_train.preprocessed_text.tolist()
labels = df_train.task2_id.tolist()
postprocessed_graphs = df_train.graph.tolist()

In [None]:
from tqdm import tqdm

# Feed every training graph, with its index and label id, into the training model.
# zip() has no length, so tqdm shows a running count rather than a percentage.
# NOTE(review): the trailing literal 2 is passed to featurize_sen_graph without
# explanation -- confirm what it controls.
for ind, graph, label in tqdm(zip(ids, postprocessed_graphs, labels)):
    model.featurize_sen_graph(ind, graph, label, 2)

In [None]:
import pandas as pd

# Same column extraction as for the training frame, on the test frame.
test_ids = pd.to_numeric(df_test.index).tolist()
test_sentences = df_test.preprocessed_text.tolist()
test_labels = df_test.task2_id.tolist()
test_postprocessed_graphs = df_test.graph.tolist()

# Featurize the test graphs into the separate test model. `tqdm` comes from the
# import in the preceding cell.
for ind, graph, label in tqdm(zip(test_ids, test_postprocessed_graphs, test_labels)):
    test_model.featurize_sen_graph(ind, graph, label, 2)

In [None]:
# Display the vocabulary size reported by the training model.
model.vocab_size

In [None]:
# Feature graphs collected by each model; the training ones are looked up by
# feature id when the most important features are exported further down.
feature_graphs = model.get_feature_graphs()
test_feature_graphs = test_model.get_feature_graphs()

In [None]:
# Keep the 2500 best features in each model.
# NOTE(review): selection runs independently on the train and test models, yet
# the classifier fitted on X is later applied to test_X -- confirm both end up
# with the same feature columns.
model.select_n_best(2500)
test_model.select_n_best(2500)

In [None]:
# Label name -> integer class id for the task2 targets.
label_vocab = {"NONE": 0, "PRFN": 1, "OFFN": 2, "HATE": 3}

In [None]:
# Training features and targets, with task2 labels encoded through label_vocab.
X, Y = model.get_x_y(df_train.task2, label_vocab=label_vocab)

In [None]:
# Only the test features are kept; the second return value is discarded.
# NOTE(review): label_vocab={None: 0} presumably stands in for missing test
# labels -- confirm against get_x_y.
test_X, _ = test_model.get_x_y(df_test.task2, label_vocab = {None: 0})

In [None]:
from sklearn.model_selection import train_test_split as split

# Hold out 20% of the training examples for validation, with a fixed seed.
tr_data,tst_data,tr_labels,tst_labels = split(X,Y, test_size=0.2, random_state=1234)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# Graph-feature classifier: one-vs-rest wrapper around a random forest, fitted
# on the training part of the split. The logistic-regression variant is kept
# commented out.
#clf = LogisticRegression(random_state=0).fit(tr_data, tr_labels)
clf = OneVsRestClassifier(RandomForestClassifier(random_state=0, class_weight="balanced_subsample")).fit(tr_data, tr_labels)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
# Class names in class-id order (same order as the ids in label_vocab).
keys = ["NONE", "PRFN", "OFFN", "HATE"]
# NOTE(review): labels_to_result is not used by any cell visible here.
labels_to_result = {}
# Despite the `lr_` prefix these are predictions of `clf`, evaluated on the
# held-out split.
lr_pred = clf.predict(tst_data)
#prf = precision_recall_fscore_support(tst_labels, lr_pred, average=None)
print(classification_report(tst_labels, lr_pred, target_names=keys, output_dict=False))

In [None]:
# String form of the training model's feature graphs, indexed the same way as
# feature_graphs in the export cells below.
feature_graph_strings = model.get_feature_graph_strings()

In [None]:
import eli5

In [None]:
# eli5 explanation of the fitted classifier as a DataFrame.
# NOTE(review): confirm eli5 can explain the OneVsRestClassifier wrapper
# directly; a later cell explains each wrapped estimator instead.
weights_df = eli5.explain_weights_df(clf)

In [None]:
# Display the explanation table.
weights_df

In [None]:
# Display the class names (keys is already a list; list() makes a copy).
list(keys)

In [None]:
from collections import defaultdict
# Per target in the eli5 table: take its first five rows and map the feature
# names back to feature-graph strings.
# NOTE(review): `features` is rebuilt from scratch by the "#RandomForest" cell
# that follows, so this result is discarded when both cells run.
features = defaultdict(list)

for target in weights_df.target.unique():
    targeted_df = weights_df[weights_df.target == target]
    # Strip "x" characters from both ends of the feature names; what remains is
    # parsed with int() below.
    most_important_weights = targeted_df.iloc[:5].feature.str.strip("x").tolist()
    for i in most_important_weights:
        # The bias row is not a feature and has no graph.
        if i != "<BIAS>":
            # NOTE(review): g_nx is looked up but unused; its only consumer is
            # the commented-out edge check.
            g_nx = feature_graphs[model.inverse_relabel[int(i)]]
            #if len(g_nx.edges()):
            g = feature_graph_strings[model.inverse_relabel[int(i)]]
            # Entry layout: ([graph string], [], label name), grouped by class name.
            features[list(keys)[int(target)]].append(([g], [], {v: k for k, v in label_vocab.items()}[int(target)]))

In [None]:
#RandomForest
from collections import defaultdict
# Per wrapped estimator of the one-vs-rest classifier: explain it with eli5,
# take the first five rows and map the feature names back to graph strings.
features = defaultdict(list)

# j is the estimator's position in clf.estimators_; it is used as the index
# into `keys` and into the model's id_to_word table.
for j, est in enumerate(clf.estimators_):
    weights_df = eli5.explain_weights_df(est)
    # Strip "x" characters from both ends of the feature names; what remains is
    # parsed with int() below.
    most_important_weights = weights_df.iloc[:5].feature.str.strip("x").tolist()
    for i in most_important_weights:
        if i != "<BIAS>":
            # NOTE(review): g_nx is looked up but unused; its only consumer is
            # the commented-out edge check.
            g_nx = feature_graphs[model.inverse_relabel[int(i)]]
            #if len(g_nx.edges()):
            g = feature_graph_strings[model.inverse_relabel[int(i)]]
            # Entry layout: ([graph string], [], label name), grouped by class name.
            features[list(keys)[j]].append(([g], [], model.label_vocab.id_to_word[j]))

In [None]:
# Display the collected features per class.
features

In [None]:
import json

# Write the per-class features to a JSON file in the working directory
# ("w+" creates the file or truncates an existing one).
with open("2021_train_features_task2.json", "w+") as f:
    json.dump(features, f)

In [None]:
# Split the training frame with the same test_size and seed as the feature
# split above.
# NOTE(review): the rows only line up with tr_data/tst_data if X follows
# df_train's row order -- confirm.
train, val = split(df_train, test_size=0.2, random_state=1234) 

In [None]:
# Display the training part of the split.
train

In [None]:
# Rename the text and label columns to `sentence` and `label`; the n-gram
# cells below read `train.sentence` and `val.sentence`.
train = train.rename(columns={'preprocessed_text': 'sentence', 'task2': 'label'})
val = val.rename(columns={'preprocessed_text': 'sentence', 'task2': 'label'})

In [None]:
# Display the training part again, now with the renamed columns.
train

In [None]:
# Persist both parts of the split as pickles in the working directory.
train.to_pickle("train_dataset")
val.to_pickle("val_dataset")

In [None]:
# `df` is not assigned by any cell of this notebook, so the original bare `df`
# raised NameError on a fresh kernel; preview the training frame instead.
df_train.head()

## Simple n-gram model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of 1- to 3-grams: lowercased, English stop words removed, capped at 2500
# features.
vectorizer = CountVectorizer(max_features=2500, stop_words="english", lowercase=True, ngram_range=(1,3))

In [None]:
# Fit the vocabulary on the training sentences only.
# NOTE(review): this rebinds X, which earlier held the graph feature matrix
# from get_x_y, to the fitted vectorizer; re-running the earlier
# split(X, Y, ...) cell after this one would no longer see the feature matrix.
X = vectorizer.fit(train.sentence)

In [None]:
# Count matrices for the training and validation sentences (X is the fitted
# vectorizer here).
X_train = X.transform(train.sentence)
X_val = X.transform(val.sentence)

In [None]:
# N-gram classifier with the same estimator setup as `clf`.
# NOTE(review): the targets are tr_labels from the graph-feature split; this
# relies on `train` containing the same rows in the same order -- confirm.
clf2 = OneVsRestClassifier(RandomForestClassifier(random_state=0, class_weight="balanced_subsample")).fit(X_train, tr_labels)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
# Class names in class-id order (re-declared; same value as in the earlier
# evaluation cell).
keys = ["NONE", "PRFN", "OFFN", "HATE"]
# NOTE(review): labels_to_result is not used by any cell visible here.
labels_to_result = {}
# Validation predictions of the n-gram classifier, scored against the held-out
# labels of the feature split.
lr_pred2 = clf2.predict(X_val)
print(classification_report(tst_labels, lr_pred2, target_names=keys, output_dict=False))

In [None]:
# Class probabilities of the n-gram classifier on the validation sentences.
ngram = clf2.predict_proba(X_val)

In [None]:
# Class probabilities of the graph-feature classifier on the held-out split.
amr = clf.predict_proba(tst_data)

In [None]:
# Soft voting: add the two classifiers' probability arrays.
# NOTE(review): assumes both arrays cover the same examples in the same order
# -- confirm.
soft_voted = ngram + amr

In [None]:
import numpy as np
# Pick, per row, the class index with the highest summed probability.
preds = np.argmax(soft_voted, axis=1)

In [None]:
# Score the soft-voted predictions on the held-out labels.
print(classification_report(tst_labels, preds, target_names=keys, output_dict=False))

In [None]:
# Graph-feature classifier predictions for the test set.
amr_predict = clf.predict(test_X)

In [None]:
# N-gram classifier predictions for the test sentences (X is the fitted
# vectorizer).
ngram_predict = clf2.predict(X.transform(df_test.preprocessed_text))

In [None]:
# Summed class probabilities of both classifiers on the test set. This rebinds
# soft_voted, which previously held the validation-set sum.
soft_voted = clf2.predict_proba(X.transform(df_test.preprocessed_text)) + clf.predict_proba(test_X)

In [None]:
import numpy as np
# Per test row, the class index with the highest summed probability.
test_preds = np.argmax(soft_voted, axis=1)

In [None]:
# Integer class id -> label name (reverse of label_vocab).
inverse_vocab = {v: k for k, v in label_vocab.items()}

In [None]:
# One row per test sentence with the label names predicted by the graph
# classifier, the n-gram classifier and the soft vote.
test_predictions = pd.DataFrame({"sentence": df_test.preprocessed_text, "graph_pred": [inverse_vocab[i] for i in amr_predict], "ngram_pred": [inverse_vocab[i] for i in ngram_predict], "soft_vote": [inverse_vocab[i] for i in test_preds]})

In [None]:
# Write the test predictions as a tab-separated file in the working directory.
test_predictions.to_csv("2021_hasoc_test_taskB.csv", sep='\t')

In [None]:
# Collect one rule label per line of the file, with newline characters
# stripped from the ends of each line.
rule_labels = []
with open("2021_rule_labels") as f:
    rule_labels.extend(raw.strip("\n") for raw in f)

In [None]:
# Class probabilities of the n-gram classifier on the validation sentences.
val_proba = clf2.predict_proba(X_val)

In [None]:
# Combine the n-gram classifier's validation probabilities with the rule
# labels: keep the classifier's top class, except where it picks class 0
# ("NONE" in label_vocab) while the rule label for that row is "HOF" -- then
# take the runner-up class instead.
# NOTE(review): assumes rule_labels has one entry per validation row, in the
# same order as val_proba -- confirm.
rule_argmax = []
for i, proba in enumerate(val_proba):
    ranking = np.argsort(-proba)  # class indices, most probable first
    if ranking[0] == 0 and rule_labels[i] == "HOF":
        # A stray bare `p` used to sit here and raised NameError whenever
        # this branch was taken.
        rule_argmax.append(ranking[1])
    else:
        rule_argmax.append(ranking[0])

In [None]:
# Score the rule-adjusted predictions on the held-out labels.
print(classification_report(tst_labels, rule_argmax, target_names=keys, output_dict=False))