In [None]:
import pandas as pd
import numpy as np
from tools.feature_matrices import parse_feature_matrices
from tools import dataset_tools
from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV

## Inspect Original Dataset

In [None]:
# Root folder of the FB13 benchmark files, relative to the notebook's working directory.
dataset_path = './benchmarks/FB13/'
# Corrupted training triples.
# NOTE(review): absolute, machine-specific path; the notebook cannot run elsewhere unless this is adjusted.
corrupted_data_path = '/Users/Alvinho/Documents/benchmarks/FB13/corrupted/train2id_bern_2to1.txt'

In [None]:
# Load the entity and relation lookup tables; each call returns a pair that is
# unpacked into a name->id and an id->name object (presumably dicts; see dataset_tools).
entity2id, id2entity = dataset_tools.read_name2id_file(dataset_path + 'entity2id.txt')
relation2id, id2relation = dataset_tools.read_name2id_file(dataset_path + 'relation2id.txt')

In [None]:
# All triple files share one layout: space-separated, first line skipped,
# columns read as (e1, e2, rel).
triple_read_options = dict(sep=' ', skiprows=1, names=['e1', 'e2', 'rel'])

# Positive triples of each split.
true_train = pd.read_csv(dataset_path + 'train2id.txt', **triple_read_options)
true_valid = pd.read_csv(dataset_path + 'valid2id.txt', **triple_read_options)
true_test = pd.read_csv(dataset_path + 'test2id.txt', **triple_read_options)

# Negative triples shipped with the validation and test splits.
valid_neg = pd.read_csv(dataset_path + 'valid2id_neg.txt', **triple_read_options)
test_neg = pd.read_csv(dataset_path + 'test2id_neg.txt', **triple_read_options)

# Every positive triple, regardless of split.
data = pd.concat([true_train, true_valid, true_test])

In [None]:
# Distinct entity ids that occur as head (e1) or tail (e2) in any split.
ents = set(data.e1.unique()) | set(data.e2.unique())

print('Entities: {}'.format(len(ents)))
print('Relations: {}'.format(len(data.rel.unique())))

print('\nTrain triples: {}'.format(len(true_train)))
print('Valid triples: {}'.format(len(true_valid)))
print('Test triples: {}'.format(len(true_test)))

# `.format` has to be applied to the string inside the call: `print(...).format(...)`
# only works with the Python 2 print statement and raises AttributeError on Python 3,
# where print() returns None.
print('\nAll triples: {}'.format(len(data)))

In [None]:
def apply_id2relation(x):
    """Return the entry stored in the notebook-level `id2relation` lookup for id `x`."""
    return id2relation[x]

def apply_id2entity(x):
    """Return the entry stored in the notebook-level `id2entity` lookup for id `x`."""
    return id2entity[x]

In [None]:
# Add relation and entity names to the test triples first, then to the training triples.
for triples in (true_test, true_train):
    triples['rel_name'] = triples['rel'].apply(apply_id2relation)
    triples['head'] = triples['e1'].apply(apply_id2entity)
    triples['tail'] = triples['e2'].apply(apply_id2entity)

## Parsing the feature matrices (tables)

In [None]:
# Relation whose feature matrices are analysed in the rest of the notebook.
target_relation = 'profession'
# Folder holding the train/validation/test feature matrices of that relation.
data_path = './extract_feat__neg_by_random/' + target_relation
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
split_data_path = '/Users/Alvinho/Documents/1524490825/pra_explain/splits/bern_2to1'

In [None]:
# One feature-matrix file per split, all living directly under data_path.
train_matrix_fpath = "{}/train.tsv".format(data_path)
validation_matrix_fpath = "{}/validation.tsv".format(data_path)
test_matrix_fpath = "{}/test.tsv".format(data_path)

# Only the train and test matrices are parsed; the validation path is merely defined.
train_data, test_data = parse_feature_matrices(train_matrix_fpath, test_matrix_fpath)

In [None]:
# Training triples of the target relation that exist in the original dataset.
is_target_in_train = true_train['rel_name'] == target_relation
rel_true_train = true_train[is_target_in_train].copy()
rel_true_train['true_label'] = np.ones(len(rel_true_train))

# Left join on (head, tail): matched rows receive true_label 1.0; afterwards every
# NaN left anywhere in the frame (including unmatched true_label) is replaced by -1.
train_true_labels = rel_true_train[['head', 'tail', 'true_label']]
train_data = train_data.merge(train_true_labels, how='left', on=['head', 'tail']).fillna(-1)

In [None]:
# Test triples of the target relation that exist in the original dataset.
is_target_in_test = true_test['rel_name'] == target_relation
rel_true_test = true_test[is_target_in_test].copy()
rel_true_test['true_label'] = np.ones(len(rel_true_test))

# Left join on (head, tail): matched rows receive true_label 1.0; afterwards every
# NaN left anywhere in the frame (including unmatched true_label) is replaced by -1.
test_true_labels = rel_true_test[['head', 'tail', 'true_label']]
test_data = test_data.merge(test_true_labels, how='left', on=['head', 'tail']).fillna(-1)

In [None]:
# Show the merged training frame (features, label and true_label).
train_data

In [None]:
# separate x (features) and y (labels)
# Columns are selected instead of popped so that train_data / test_data stay intact
# and this cell can be re-run without raising KeyError on the second execution.
non_feature_columns = ['head', 'tail', 'label', 'true_label']

train_y = train_data['label']            # label that came with the feature matrix
true_train_y = train_data['true_label']  # 1 if the pair is in the original triples, else -1
train_x = train_data.drop(non_feature_columns, axis=1)

test_y = test_data['label']
true_test_y = test_data['true_label']
test_x = test_data.drop(non_feature_columns, axis=1)

## Check data consistency

In [None]:
# Original data
rel_true_train = true_train[true_train['rel_name'] == target_relation]

# Corrupted data, with relation and entity names added
corrupted_data = pd.read_csv(corrupted_data_path, sep=' ', skiprows=1,
                             names=['e1', 'e2', 'rel', 'label'])
for name_column, id_column, lookup in [('rel_name', 'rel', apply_id2relation),
                                       ('e1_name', 'e1', apply_id2entity),
                                       ('e2_name', 'e2', apply_id2entity)]:
    corrupted_data[name_column] = corrupted_data[id_column].apply(lookup)
rel_corrupted_data = corrupted_data[corrupted_data['rel_name'] == target_relation]

# After splitting the data into relations
split_train_fpath = '/'.join([split_data_path, target_relation, 'train.tsv'])
split = pd.read_csv(split_train_fpath, sep='\t', skiprows=0, header=None)

# After applying pra
pra_output = pd.read_csv(train_matrix_fpath, sep='\t', skiprows=0, header=None)

In [None]:
# Shape of the target relation's training data after each processing stage.
stage_frames = [
    ("Original data: ", rel_true_train),
    ("Corrupted: ", rel_corrupted_data),
    ("Split: ", split),
    ("Pra Output: ", pra_output),
    ("After Parsing: ", train_x),
]
for stage_label, stage_frame in stage_frames:
    print(stage_label, stage_frame.shape)

## Training a logistic regression model

Elastic net is a regularization penalty that combines the L1 and L2 penalties; here it is applied to a logistic regression model trained with stochastic gradient descent.

In [None]:
# Hyper-parameter grid for the elastic-net penalty:
#   alpha    - overall regularization strength
#   l1_ratio - share of the penalty assigned to L1 (1 means pure L1)
param_grid = [
  {'l1_ratio': [.1, .5, .7, .9, .95, .99, 1], 'alpha': [0.01, 0.001, 0.0001]}
]

# NOTE(review): scikit-learn >= 1.1 names this loss "log_loss"; "log" is kept here for
# compatibility with the older release this notebook was written against.
model = SGDClassifier(loss="log", penalty="elasticnet",
                      max_iter=100000, tol=1e-3, class_weight="balanced")
clf = GridSearchCV(model, param_grid)
# The search must be fitted before `best_params_` or `score` can be used; without this
# call the cells that read those attributes fail on a fresh kernel.
clf.fit(train_x, train_y)
# Continue with the tuned estimator so that the coefficients inspected later come from
# the selected hyper-parameters rather than from SGDClassifier's defaults.
model = clf.best_estimator_

In [None]:
# Keep the winning hyper-parameters of the grid search for later reuse.
best_params = clf.best_params_
alpha, l1_ratio = best_params['alpha'], best_params['l1_ratio']

In [None]:
# Fit `model` on the training features and the feature-matrix labels.
model.fit(train_x, train_y)

In [None]:
# Score of the grid-searched classifier on the test set, against the feature-matrix labels.
clf.score(test_x, test_y)

In [None]:
# Same score, but against true_test_y (1 when the pair exists in the original test
# triples, -1 otherwise).
# NOTE(review): only comparable if `label` encodes negatives as -1 as well — confirm.
clf.score(test_x, true_test_y)

In [None]:
# Model coefficients as a single column (one row per coefficient).
coefficients = model.coef_.reshape(-1,1) # normalize(abs(model.coef_), norm='l1', axis=1).reshape(-1,1)

In [None]:
# One row per feature (path): its coefficient and the column sum of that feature
# over the training data, ordered from highest to lowest coefficient.
most_relevant_variables = pd.DataFrame(coefficients, columns=['scores'])
most_relevant_variables['path'] = train_x.columns
np_train_x = train_x.apply(pd.to_numeric)
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and later removed
# from pandas; it returns the same ndarray.
occurences = np.sum(np_train_x.values, axis=0)
most_relevant_variables['occurences'] = occurences
most_relevant_variables = most_relevant_variables.sort_values(by="scores", ascending=False)

In [None]:
# First 15 and last 15 rows of the score-sorted table. The tail slice is `-15:`
# because `-15:-1` stops before the final row and would drop the lowest-scoring path.
final_most_relevant = pd.concat([most_relevant_variables.iloc[0:15], most_relevant_variables.iloc[-15:]])

In [None]:
# Show the selected highest- and lowest-scoring paths.
final_most_relevant

In [None]:
# Negative Influencers: the 15 last rows of the score-sorted table. The slice is `-15:`
# because `-15:-1` stops before the final row and would omit the lowest score of all.
most_relevant_variables.iloc[-15:]

In [None]:
# Number of paths with a non-zero coefficient versus the number of paths overall.
has_nonzero_score = most_relevant_variables['scores'] != 0
relevant_relations = len(most_relevant_variables[has_nonzero_score])
total_relations = len(most_relevant_variables)
total_relations

In [None]:
# Weight every training feature value by the coefficient of its column.
train_x = train_x.apply(pd.to_numeric)
n_train_rows = train_x.shape[0]
# Stack the coefficient row once per training row so it lines up with train_x.
repeated_coefficients = np.repeat(coefficients.T, n_train_rows, axis=0)
explanations = train_x.mul(repeated_coefficients, axis=1)

In [None]:
def get_reasons(row):
    """Describe the non-zero entries of `row` as one string.

    Args:
        row: pandas Series indexed by feature name, holding weighted feature values.

    Returns:
        A string of "<name> <value>" pairs joined by " / ", in the order they appear
        in `row`; an empty string when every entry is zero.
    """
    reasons = row[row != 0]
    # `Series.items()` replaces `iteritems()`, which newer pandas releases removed.
    return " / ".join(
        str(reason) + " " + str(relevance) for reason, relevance in reasons.items()
    )

In [None]:
def get_reasons2(row, max_reasons=9):
    """Spread the non-zero entries of `row` over fixed reason/relevance slots.

    Args:
        row: pandas Series indexed by feature name, holding weighted feature values.
        max_reasons: number of (reason, relevance) slots to emit; the default of 9
            matches the original hard-coded limit.

    Returns:
        An object-dtype Series indexed reason1, relevance1, ..., reason<max_reasons>,
        relevance<max_reasons>. Slots are filled with the first non-zero entries in
        row order; unused slots hold the string "n/a".
    """
    reasons = row[row != 0]
    labels = []
    values = []
    # `Series.items()` replaces `iteritems()`, which newer pandas releases removed.
    for slot, (reason, relevance) in enumerate(reasons.items(), start=1):
        if slot > max_reasons:
            break
        labels += ['reason' + str(slot), 'relevance' + str(slot)]
        values += [reason, relevance]
    # Pad the remaining slots so every row yields the same set of labels.
    for slot in range(len(values) // 2 + 1, max_reasons + 1):
        labels += ['reason' + str(slot), 'relevance' + str(slot)]
        values += ["n/a", "n/a"]
    # Built in one go: growing a Series label by label is slow, and an empty
    # `pd.Series()` without a dtype triggers a deprecation warning.
    return pd.Series(values, index=labels, dtype=object)

In [None]:
def explain(coefficients, data_type):
    """Build a per-example table of the features that drove the prediction.

    Reads the notebook-level `train_x`/`train_y`/`train_data`,
    `test_x`/`test_y`/`test_data` and the fitted `model`.

    Args:
        coefficients: model coefficients, one value per column of the feature frame
            (the notebook passes them as a column vector).
        data_type: 'train' selects the training data; any other value selects the
            test data.

    Returns:
        DataFrame with `head`, `tail`, the reason/relevance columns produced by
        `get_reasons2`, `y_hat` (second column of `model.predict_proba`) and `y`.
    """
    if data_type == 'train':
        x, y, data = train_x, train_y, train_data
    else:
        x, y, data = test_x, test_y, test_data

    weighted_x = x.apply(pd.to_numeric)
    # Broadcast the flat coefficient vector over the rows instead of materialising
    # an (n_rows, n_features) copy of it with np.repeat; the product is the same.
    explanations = weighted_x.mul(np.asarray(coefficients).ravel(), axis=1)
    motives = explanations.apply(get_reasons2, axis=1)

    final_reasons = pd.concat([data[['head', 'tail']], motives], axis=1)
    final_reasons['y_hat'] = model.predict_proba(x)[:, 1]
    final_reasons['y'] = y
    return final_reasons

In [None]:
# Explanation table for the test examples.
test_final_reasons = explain(coefficients, 'test')

In [None]:
# Dump the full explanation table as plain text.
# NOTE(review): the file name is hard-coded and does not follow `target_relation`
# (set to 'profession' earlier) — confirm this is the intended output name.
with open('death_causes.txt', 'w+') as f:
    f.write(test_final_reasons.to_string())

In [None]:
# Same table as CSV.
# NOTE(review): hard-coded file name, independent of `target_relation` — confirm.
test_final_reasons.to_csv("death_causes.csv")

In [None]:
# Test examples whose first listed reason is this specific path.
test_final_reasons[(test_final_reasons['reason1']=='-gender-_gender-children-')]

In [None]:
# Class probabilities predicted for every test example.
model.predict_proba(test_x)

In [None]:
# Number of test examples per label value.
test_y.value_counts()

In [None]:
# Share 886 / (886 + 92); the trailing dot forces float division.
# NOTE(review): the counts are hard-coded, presumably copied from the value_counts()
# output above — they go stale if the data or target relation changes; confirm.
886./(886+92)

In [None]:
# Hyper-parameters the current `model` was configured with.
model.get_params()

## Testing the model with different metrics

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Precision of `model` on the test set, against the feature-matrix labels.
precision_score(test_y, model.predict(test_x))

In [None]:
# Recall of `model` on the test set, against the feature-matrix labels.
recall_score(test_y, model.predict(test_x))

In [None]:
# F1 score of `model` on the test set, against the feature-matrix labels.
f1_score(test_y, model.predict(test_x))

### Balanced vs imbalanced weights for classes

Let's see how the model changes when the classes are not balanced.

In [None]:
# Same elastic-net logistic regression with the selected alpha / l1_ratio, but without
# class_weight="balanced" and with max_iter=1000, to compare against `model`.
unbalanced_options = dict(loss="log", penalty="elasticnet", alpha=alpha, l1_ratio=l1_ratio,
                          max_iter=1000, tol=1e-3)
model2 = SGDClassifier(**unbalanced_options)
model2.fit(train_x, train_y)
f1_score(test_y, model2.predict(test_x))

In [None]:
# Mean accuracy of the unbalanced model on the test set.
model2.score(test_x, test_y)

In [None]:
# Precision of the unbalanced model on the test set.
precision_score(test_y, model2.predict(test_x))

In [None]:
# Recall of the unbalanced model on the test set.
recall_score(test_y, model2.predict(test_x))

From these experiments we see that using the option `class_weight="balanced"` favored recall over precision, which is consistent with [this scikit-learn example on unbalanced classes](http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html).

## Debugging the dataframes

Let's see if everything is ok with the feature (x) dataframes

In [None]:
# Number of feature columns in the training frame.
len(train_x.columns)

In [None]:
# Number of feature columns in the test frame.
len(test_x.columns)

In [None]:
# Compare names *and* order: a set comparison ignores column order and duplicate
# names, both of which matter once the frames are turned into feature matrices.
list(train_x.columns) == list(test_x.columns)

Now let's check the labels (y)

In [None]:
# Number of training labels.
len(train_y)

In [None]:
# Number of test labels.
len(test_y)

In [None]:
# Summary statistics of the training labels.
train_y.describe()

Problem: the logistic regression is simply outputting the mean for all cases!!! Holy, man!

## Debugging the model

In [None]:
# Hand-picked L1 / L2 weights for a quick experiment.
w_l1 = 0.5
w_l2 = 0.01
# NOTE: these assignments overwrite any `alpha` / `l1_ratio` defined earlier in the notebook.
alpha = w_l1 + w_l2
l1_ratio = w_l1 / (w_l1 + w_l2)

# model = SGDClassifier(loss="log", penalty="elasticnet", alpha=alpha, l1_ratio=l1_ratio)
# Only l1_ratio is passed below, so the `alpha` computed above is not used by this
# classifier. The assignment also replaces the previously defined `model`.
model = SGDClassifier(loss="log", penalty="elasticnet", l1_ratio=l1_ratio)

In [None]:
# Tiny hand-made training set for sanity-checking the classifier.
# NOTE: this overwrites the train_x / train_y built from the real data earlier on.
d1 = {"x1": 1, "x2": 1, "x3": 1}
d2 = {"x1": 1, "x2": 1, "x3": 1}
d3 = {"x1": 0, "x2": 0, "x3": 0}
d4 = {"x1": 1, "x2": 1, "x3": 0}
train_x = pd.DataFrame([d1, d2, d3, d4])
# Labels are a 1-D Series: scikit-learn expects y of shape (n_samples,) and emits a
# DataConversionWarning when it is handed a single-column DataFrame instead.
train_y = pd.Series([1, 1, 0, 0])

In [None]:
# Fit the current `model` on the current train_x / train_y.
model.fit(train_x, train_y)

In [None]:
# Two hand-made test rows for the sanity check.
# NOTE: this overwrites the test_x / test_y built from the real data earlier on.
d4 = {"x1": 1, "x2": 1, "x3": 0}
d5 = {"x1": 1, "x2": 0, "x3": 0}
test_x = pd.DataFrame([d4, d5])
# 1-D Series of labels, matching the (n_samples,) shape scikit-learn expects for y.
test_y = pd.Series([1, 0])

In [None]:
# Predicted classes for the current test_x.
model.predict(test_x)

In [None]:
# Show the current test features.
test_x

In [None]:
# Load and display a CSV for the `institution` relation.
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
pd.read_csv('/Users/Alvinho/pra/extract_feat__neg_by_random/institution/institution.csv')