In [280]:
import sys
sys.path.append('../template_model')
from util import load_train_dev, Entry
import re
from collections import Counter, defaultdict
from template_based2 import abstract_triples

RE_FIND_THIAGO_SLOT = re.compile('((?:AGENT-.)|(?:PATIENT-.)|(?:BRIDGE-.))')

def extract_orders(e):
    
    orders = Counter()
    
    for lexe in e.lexes:
        
        slots = RE_FIND_THIAGO_SLOT.findall(lexe['template'])
        positions = {k: i for i, k in enumerate(slots)}
        positions = defaultdict(lambda: 10000, positions)
        sorted_triples = tuple(sorted(e.triples, key=lambda t: positions[e.r_entity_map[t.object]]))
        abstracted_sorted_triples = abstract_triples(sorted_triples)
        
        orders[abstracted_sorted_triples] += 1
        
    return orders

In [281]:
all_td = load_train_dev()

# removendo entradas sem r_entity_map
td = [t for t in td if t.r_entity_map]

# Formulation (order_1, order_2, [-1, 0, 1])

In [44]:
from itertools import permutations, product, combinations


def extract_features(order, e):
    
    return {'order': order, 
            'category': e.category,
            'n_triples': len(e.triples),
            'is_first_order': order == abstract_triples(e.triples)}

def merge_features(l1_features, l2_features):
    
    features_merged = {'{}_1'.format(k): v for k, v in l1_features.items()}
    features_merged.update({'{}_2'.format(k): v for k, v in l2_features.items()})
    
    return features_merged

def make_entries(e):
    
    data = []
    
    all_orders = permutations(e.triples)
    all_abstracted_orders = set([abstract_triples(ts) for ts in all_orders])
    
    features = {order: extract_features(order, e) for order in all_abstracted_orders}
    
    lexicalized_orders = extract_orders(e)
    
    for l1, l2 in product(lexicalized_orders.keys(), 
                          all_abstracted_orders - lexicalized_orders.keys()):
        
        l1_features = features[l1]
        l2_features = features[l2]
        
        l1_l2_features = merge_features(l1_features, l2_features)
        l2_l1_features = merge_features(l2_features, l1_features)
        
        data.append((l1_l2_features, 1))
        data.append((l2_l1_features, -1))
        
    for l1, l2 in combinations(all_abstracted_orders, 2):
        
        l1_features = features[l1]
        l2_features = features[l2]
        
        l1_l2_features = merge_features(l1_features, l2_features)
        
        data.append((l1_l2_features, 0))
        
    return data



In [45]:
data = []

for t in td[:50]:
    x = make_entries(td[4444])
    data.extend(x)

In [46]:
import pandas as pd

df = pd.DataFrame([c[0] for c in data])
df['target'] = [c[1] for c in data]

df.groupby(['is_first_order_1', 'is_first_order_2', 'target']).size().unstack(fill_value=0)

Unnamed: 0_level_0,target,-1,0,1
is_first_order_1,is_first_order_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,False,1100,12650,1100
False,True,1100,100,0
True,False,0,1050,1100


# Formulation (order, [0, 1])

In [51]:
from itertools import permutations, product, combinations


def extract_features(order, e):
    
    return {'order': order, 
            'category': e.category,
            'n_triples': len(e.triples),
            'is_first_order': order == abstract_triples(e.triples)}

def make_entries(e):
    
    data = []
    
    all_orders = permutations(e.triples)
    all_abstracted_orders = set([abstract_triples(ts) for ts in all_orders])
    
    features = {order: extract_features(order, e) for order in all_abstracted_orders}
    
    lexicalized_orders = extract_orders(e)
    
    for l in lexicalized_orders.keys():
        
        l_features = extract_features(l, e)
        
        data.append((l_features, 1))
    
    for l in all_abstracted_orders - lexicalized_orders.keys():
        
        l_features = extract_features(l, e)
        
        data.append((l_features, 0))
        
    return data

In [76]:
data = []

for t in td[:6000]:
    x = make_entries(t)
    data.extend(x)

In [77]:
df = pd.DataFrame([c[0] for c in data])
df['target'] = [c[1] for c in data]

df.groupby(['n_triples', 'is_first_order', 'target']).size().unstack(fill_value=0)

Unnamed: 0_level_0,target,0,1
n_triples,is_first_order,Unnamed: 2_level_1,Unnamed: 3_level_1
1,True,0,1788
2,False,383,796
2,True,248,960
3,False,4363,1958
3,True,732,618
4,False,24830,2544
4,True,1030,264
5,False,37873,827
5,True,347,13


In [78]:
df.head()

Unnamed: 0,category,is_first_order,n_triples,order,target
0,Airport,True,1,"((slot0, cityServed, slot1),)",1
1,Airport,True,1,"((slot0, cityServed, slot1),)",1
2,Airport,True,1,"((slot0, elevationAboveTheSeaLevel_(in_metres)...",1
3,Airport,True,1,"((slot0, location, slot1),)",1
4,Airport,True,1,"((slot0, operatingOrganisation, slot1),)",1


In [97]:
from sklearn.model_selection import train_test_split

X = df[['category', 'is_first_order', 'n_triples']]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=df['target'])

X_train.shape, X_test.shape

((63659, 3), (15915, 3))

In [101]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

data_pipeline = Pipeline([
    ('ohe', OneHotEncoder())
])

pipeline = Pipeline([
    ('data', data_pipeline),
    ('clf', None)
])

params_grid = {
    'clf': [LogisticRegression(solver='lbfgs'), DecisionTreeClassifier(), RandomForestClassifier()]
}

cv = GridSearchCV(pipeline, params_grid, cv=5, scoring='recall')

cv.fit(X_train, y_train)

cv.best_estimator_



Pipeline(memory=None,
         steps=[('data',
                 Pipeline(memory=None,
                          steps=[('ohe',
                                  OneHotEncoder(categorical_features=None,
                                                categories=None, drop=None,
                                                dtype=<class 'numpy.float64'>,
                                                handle_unknown='error',
                                                n_values=None, sparse=True))],
                          verbose=False)),
                ('clf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                      

In [102]:
cv.best_score_

0.38349306269122846

In [103]:
model = cv.best_estimator_

y_pred = cv.predict(X_train)

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train, y_pred)

pd.DataFrame(cm, index=['not_good', 'good'], columns=['not_choosen', 'choosen'])

Unnamed: 0,not_choosen,choosen
not_good,55283,562
good,4853,2961


# Formulation ({order:{0, 1} for order in orders})

In [282]:
from itertools import permutations, product, combinations


def extract_features(order, e):
    
    return {'category': e.category,
            'n_triples': len(e.triples),
            'is_first_order': order == abstract_triples(e.triples)}

def make_entries(e):
    
    all_orders = permutations(e.triples)
    all_abstracted_orders = set([abstract_triples(ts) for ts in all_orders])
    
    lexicalized_orders = extract_orders(e)
    
    features = {order: extract_features(order, e) for order in all_abstracted_orders}
    
    all_orders_features = []
    ok_orders = set()
    
    for i, order in enumerate(all_abstracted_orders):
        
        all_orders_features.append(features[order])
        
        if order in lexicalized_orders:
            
            ok_orders.add(i)
    
    return all_abstracted_orders, all_orders_features, ok_orders

In [283]:
from sklearn.model_selection import train_test_split

orders = []
X = []
y = []

for e in td:
    all_orders, all_orders_features, ok_orders = make_entries(e)
    orders.append(all_orders)
    X.append(all_orders_features)
    y.append(ok_orders)
    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

len(X_train), len(X_test)

(6248, 1562)

In [289]:
X[3000], y[3000]

([{'category': 'Airport', 'n_triples': 3, 'is_first_order': True},
  {'category': 'Airport', 'n_triples': 3, 'is_first_order': False},
  {'category': 'Airport', 'n_triples': 3, 'is_first_order': False},
  {'category': 'Airport', 'n_triples': 3, 'is_first_order': False},
  {'category': 'Airport', 'n_triples': 3, 'is_first_order': False},
  {'category': 'Airport', 'n_triples': 3, 'is_first_order': False}],
 {3, 5})

In [177]:
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np

class MyModel(BaseEstimator, RegressorMixin):
    
    def __init__(self, clf=None):
        self.clf=clf
    
    def fit(self, X, y):
        
        X_ = []
        y_ = []
        
        for x, y__ in zip(X, y):
            
            for i, x_ in enumerate(x):
                
                X_.append(x_.todense())
                y_.append(1 if i in y__ else 0)
                
        X_ = np.vstack(X_)
        self.clf.fit(X_, y_)
        
        return self
    
    def predict(self, X):
        
        results = []
        
        for x in X:
            pred = self.clf.predict(x)
            result = {i for i in range(len(pred)) if pred[i] == 1}
            results.append(result)
            
        return results

In [247]:
from sklearn.base import BaseEstimator, TransformerMixin

class MyTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, data_pipeline):
        self.data_pipeline = data_pipeline
    
    def fit(self, X, y=None):
        
        X_ = []
        for x in X:
            for x_ in x:
                X_.append(x_)
                
        self.data_pipeline.fit(X_)
        
        return self
    
    def transform(self, X, y=None):
        
        results = []
        for x in X:
            result = self.data_pipeline.transform(x)
            results.append(result)
            
        return results

In [248]:
from sklearn.metrics import make_scorer

def f1_score_order_entry_func(y_, y_pred_):
    
    # se há positivos e não retornou nenhum, f1 = 0
    if len(y_pred_) == 0 and len(y_) != 0:
        return 0
    
    precision = len(y_pred_.intersection(y_)) / len(y_pred_)
    recall = len(y_pred_.intersection(y_)) / len(y_)
    
    # se precision e recall == 0, f1 = 0
    if precision == 0 and recall == 0 and len(y_) != 0:
        return 0
    
    return 2 * (precision * recall) / (precision + recall)

def f1_score_order_func(y, y_pred):
    
    all_f1 = []
    for y_, y_pred_ in zip(y, y_pred):
        f1 = f1_score_order_entry_func(y_, y_pred_)
        all_f1.append(f1)
        
    return np.mean(all_f1)

f1_score_order = make_scorer(f1_score_order_func)

In [264]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('data', MyTransformer(DictVectorizer())),
    ('clf', MyModel())
])

params_grid = {
    'clf__clf': [LogisticRegression(solver='lbfgs'), 
                 DecisionTreeClassifier(), 
                 DummyClassifier(strategy='constant', constant=1)]
}

cv = GridSearchCV(pipeline, params_grid, cv=5, scoring=f1_score_order, return_train_score=True)

cv.fit(X_train, y_train)

cv.best_estimator_

Pipeline(memory=None,
         steps=[('data',
                 MyTransformer(data_pipeline=DictVectorizer(dtype=<class 'numpy.float64'>,
                                                            separator='=',
                                                            sort=True,
                                                            sparse=True))),
                ('clf',
                 MyModel(clf=DummyClassifier(constant=1, random_state=None,
                                             strategy='constant')))],
         verbose=False)

In [265]:
cv.best_score_

0.9297718253968253

In [268]:
cv.cv_results_['mean_test_score']

array([0.92916667, 0.92854167, 0.92977183])