In [58]:
import pandas as pd
import numpy as np
import os
from tools.feature_matrices import parse_feature_matrices
from tools import dataset_tools
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import normalize

## Inspect Original Dataset

In [20]:
dataset_path = './benchmarks/NELL186/'
corrupted_data_path = '/Users/Alvinho/Documents/benchmarks/NELL186/corrupted/train2id_bern_5to1.txt'

In [21]:
entity2id, id2entity = dataset_tools.read_name2id_file(dataset_path + 'entity2id.txt')
relation2id, id2relation = dataset_tools.read_name2id_file(dataset_path + 'relation2id.txt')

In [22]:
true_train = pd.read_csv(dataset_path + 'train2id.txt', sep=' ', skiprows=1, names=['e1', 'e2', 'rel'])
true_valid = pd.read_csv(dataset_path + 'valid2id.txt', sep=' ', skiprows=1, names=['e1', 'e2', 'rel'])
true_test = pd.read_csv(dataset_path + 'test2id.txt', sep=' ', skiprows=1, names=['e1', 'e2', 'rel'])

valid_neg = pd.read_csv(dataset_path + 'valid2id_neg.txt', sep=' ', skiprows=1, names=['e1', 'e2', 'rel'])
test_neg = pd.read_csv(dataset_path + 'test2id_neg.txt', sep=' ', skiprows=1, names=['e1', 'e2', 'rel'])

data = pd.concat([true_train, true_valid, true_test])

In [23]:
ents = set()
ents.update(data.e1.unique())
ents.update(data.e2.unique())

print('Entities: {}'.format(len(ents)))
print('Relations: {}'.format(len(data.rel.unique())))

print('\nTrain triples: {}'.format(len(true_train)))
print('Valid triples: {}'.format(len(true_valid)))
print('Test triples: {}'.format(len(true_test)))

print('\nAll triples: {}').format(len(data))

Entities: 14463
Relations: 186

Train triples: 31134
Valid triples: 5000
Test triples: 5000

All triples: 41134


In [24]:
def apply_id2relation(x):
    return id2relation[x]

def apply_id2entity(x):
    return id2entity[x]

In [40]:
# Add relations and entities names to dataset
true_test['rel_name'] = true_test['rel'].apply(apply_id2relation)
true_test['e1_name'] = true_test['e1'].apply(apply_id2entity)
true_test['e2_name'] = true_test['e2'].apply(apply_id2entity)
# Training data
true_train['rel_name'] = true_train['rel'].apply(apply_id2relation)
true_train['e1_name'] = true_train['e1'].apply(apply_id2entity)
true_train['e2_name'] = true_train['e2'].apply(apply_id2entity)

## Parsing the feature matrixes (tables)

In [29]:
target_relation = 'airportincity'
data_path = '/Users/Alvinho/Documents/1524632595/pra_explain/results/extract_feat__neg_by_random/concept:' + target_relation
split_data_path = '/Users/Alvinho/Documents/1524632595/pra_explain/splits/bern_5to1'

In [60]:
data_path = '/Users/Alvinho/Documents/1524632595/pra_explain/results/extract_feat__neg_by_random'
target_relations = os.listdir(data_path)

In [61]:
target_relations

['concept:specializationof',
 'concept:athletessuchasathletes',
 'concept:drughassideeffect',
 'concept:agriculturalproductcamefromcountry',
 'concept:ismultipleof',
 'concept:visualartistartmovement',
 'concept:attractionofcity',
 'concept:plantrepresentemotion',
 'concept:atlocation',
 'concept:sportsgamesport',
 'concept:personleadsgeopoliticalorganization',
 'concept:subpartoforganization',
 'concept:automobilemakerdealersincountry',
 'concept:personleadsorganization',
 'concept:agriculturalproductincludingagriculturalproduct',
 'concept:malemovedtostateorprovince',
 'concept:beveragecontainsprotein',
 'concept:weaponmadeincountry',
 'concept:parentofperson',
 'concept:organizationheadquarteredinstateorprovince',
 'concept:bankbankincountry',
 'concept:leaguestadiums',
 'concept:animalsuchasinsect',
 'concept:istallerthan',
 'concept:itemexistsatlocation',
 'concept:fishservedwithfood',
 'concept:musicartistgenre',
 'concept:professionusestool',
 'concept:citylocatedincountry',
 'c

In [56]:
train_matrix_fpath = data_path + "/train.tsv"
validation_matrix_fpath = data_path + "/validation.tsv"
test_matrix_fpath = data_path + "/test.tsv"
train_data, test_data = parse_feature_matrices(train_matrix_fpath, test_matrix_fpath)

In [57]:
test_data.head()

Unnamed: 0,-_concept:acquired-concept:hasofficeincity-_concept:automobilemakerdealersincity-concept:automobilemakerdealersincity-,-_concept:agriculturalproductcontainchemical-concept:agriculturalproductcamefromcountry-_concept:automobilemakerdealersincountry-concept:automobilemakerdealersincity-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:agriculturalproductcookedwithagriculturalproduct-concept:agriculturalproductgrowinginstateorprovince-_concept:atlocation-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:vegetableproductioninstateorprovince-_concept:automobilemakercardealersinstateorprovince-concept:automobilemakerdealersincity-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:vegetableproductioninstateorprovince-_concept:automobilemakercardealersinstateorprovince-concept:hasofficeincity-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:vegetableproductioninstateorprovince-concept:istallerthan-_concept:istallerthan-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:vegetableproductioninstateorprovince-concept:statelocatedingeopoliticallocation-_concept:locationlocatedwithinlocation-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:vegetableproductioninstateorprovince-concept:statelocatedingeopoliticallocation-_concept:subpartof-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:vegetableproductioninstateorprovince-concept:subpartof-_concept:locationlocatedwithinlocation-,-_concept:agriculturalproductcookedwithagriculturalproduct-concept:vegetableproductioninstateorprovince-concept:subpartof-_concept:subpartof-,...,-concept:worksfor-concept:headquarteredin-concept:cityliesonriver-concept:attractionofcity-,-concept:worksfor-concept:headquarteredin-concept:locationlocatedwithinlocation-_concept:citylocatedincountry-,-concept:worksfor-concept:headquarteredin-concept:subpartof-_concept:citylocatedincountry-,-concept:worksfor-concept:headquarteredin-concept:subpartof-_concept:locationlocatedwithinlocation-,-concept:worksfor-concept:organizationheadquarteredincity-_concept:hasofficeincity-concept:hasofficeincity-,-concept:worksfor-concept:subpartoforganization-_concept:subpartof-concept:organizationheadquarteredincity-,-concept:worksfor-concept:teamplayssport-concept:sportusesstadium-concept:atlocation-,head,label,tail
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,concept:airport:albuquerque_international_airport,1,concept:city:albuquerque
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,concept:airport:bangor_international,1,concept:city:bangor
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,concept:airport:blue_grass_airport,1,concept:city:lexington
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,concept:airport:boryspil,1,concept:city:kiev
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,concept:airport:calgary_international,1,concept:city:banff


In [38]:
# separate x (features) and y (labels)
train_x = train_data.drop(['head', 'tail', 'label'], axis=1)
train_y = train_data['label']
rel_true_train = true_train[true_train['rel_name']==target_relation]
test_x = test_data.drop(['head', 'tail', 'label'], axis=1)
test_y = test_data['label']

## Check data consistency

In [41]:
# Original data
rel_true_train = true_train[true_train['rel_name']=='concept:'+target_relation]
# Corrupted data 
corrupted_data = pd.read_csv(corrupted_data_path, sep=' ', skiprows=1, names=['e1', 'e2', 'rel', 'label'])
# Add relations and entities names to dataset
corrupted_data['rel_name'] = corrupted_data['rel'].apply(apply_id2relation)
corrupted_data['e1_name'] = corrupted_data['e1'].apply(apply_id2entity)
corrupted_data['e2_name'] = corrupted_data['e2'].apply(apply_id2entity)
rel_corrupted_data = corrupted_data[corrupted_data['rel_name']=='concept:'+target_relation]
# After spliting the data into relations
split = pd.read_csv(split_data_path + '/concept:' + target_relation + '/' + 'train.tsv', sep='\t', skiprows=0, header=None)
# After applying pra
pra_output = pd.read_csv(train_matrix_fpath, sep='\t', skiprows=0, header=None)

In [42]:
print("Original data: ", rel_true_train.shape)
print("Corrupted: ", rel_corrupted_data.shape)
print("Split: ", split.shape)
print("Pra Output: ", pra_output.shape)
print("After Parsing: ", train_x.shape)

('Original data: ', (128, 6))
('Corrupted: ', (768, 7))
('Split: ', (768, 3))
('Pra Output: ', (260, 3))
('After Parsing: ', (260, 1287))


In [160]:
21536.0/137793.0

0.15629240962893615

label    10
dtype: int64

In [53]:
np_train_x = train_x.apply(pd.to_numeric)
occurences = np.sum(np_train_x.as_matrix(), axis=1)

In [55]:
np_train_x.shape

(260, 1287)

In [126]:
rel_true_train.shape

(15566, 6)

In [127]:
train_x.shape

(31063, 1087)

In [130]:
rel_corrupted_data.shape

(46698, 7)

## Training a logistic regression model

Elastic net is a logistic regression model which combines L1 and L2 regularizations.

In [68]:
w_l1 = 0.5
w_l2 = 0.05
l1_ratio = w_l1 / (w_l1 + w_l2)
# alpha = w_l1 + w_l2
alpha = 0.0001

model = SGDClassifier(loss="log", penalty="elasticnet", alpha=alpha, l1_ratio=l1_ratio,
                      max_iter=100000, tol=1e-3, class_weight="balanced")

In [69]:
model.fit(train_x, train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.9090909090909091, learning_rate='optimal', loss='log',
       max_iter=100000, n_iter=None, n_jobs=1, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001, verbose=0,
       warm_start=False)

In [70]:
coefficients = model.coef_.reshape(-1,1) # normalize(abs(model.coef_), norm='l1', axis=1).reshape(-1,1)

In [126]:
most_relevant_variables = pd.DataFrame(coefficients, columns=['scores'])
most_relevant_variables['path'] = train_x.columns
np_train_x = train_x.apply(pd.to_numeric)
occurences = np.sum(np_train_x.as_matrix(), axis=0)
most_relevant_variables['occurences'] = occurences
most_relevant_variables = most_relevant_variables.sort_values(by="scores", ascending=False)

In [127]:
occurences.shape

(1032,)

In [128]:
# Positive influencers
most_relevant_variables.iloc[0:15]

Unnamed: 0,scores,path,occurences
639,5.976796,-gender-_gender-children-,1441.0
937,5.67024,-profession-_profession-children-,80.0
951,3.473223,-religion-_religion-_children-profession-,9.0
955,3.473223,-religion-_religion-parents-profession-,9.0
63,3.120817,-_children-gender-_gender-children-,66.0
515,2.747733,-_spouse-profession-,384.0
1019,2.70399,-spouse-profession-,382.0
813,2.674564,-parents-profession-,284.0
990,2.565879,-spouse-gender-_gender-children-,37.0
608,2.257576,-children-profession-,272.0


In [129]:
# Negative Influencers
most_relevant_variables.iloc[-15:-1]

Unnamed: 0,scores,path,occurences
206,-0.96575,-_location-place_of_death-_place_of_death-prof...,80.0
5,-0.966309,-_cause_of_death-gender-_gender-profession-,27.0
205,-0.966427,-_location-place_of_death-_place_of_birth-prof...,60.0
203,-0.966431,-_location-place_of_death-_location-profession-,54.0
549,-1.11501,-children-_parents-_children-profession-,29.0
553,-1.11501,-children-_parents-parents-profession-,29.0
326,-1.122993,-_place_of_birth-location-_place_of_death-prof...,30.0
116,-1.206169,-_ethnicity-gender-_gender-profession-,28.0
147,-1.32624,-_institution-gender-_gender-profession-,118.0
361,-1.429479,-_place_of_death-gender-_gender-profession-,647.0


In [75]:
relevant_relations = most_relevant_variables[most_relevant_variables['scores'] != 0].shape[0]
total_relations = most_relevant_variables.shape[0]
total_relations

1032

In [14]:
repeated_coefficients = np.repeat(coefficients.T, train_x.shape[0], axis=0)
train_x = train_x.apply(pd.to_numeric)
explanations = train_x.mul(repeated_coefficients, axis=1)

In [76]:
def get_reasons(row):    
    reasons = row[row != 0]
    string = ''
    for reason, relevance in reasons.iteritems():
        string += str(reason) + " " + str(relevance) + " / "
    string = string[:-3]
    return string

In [77]:
def get_reasons2(row):    
    reasons = row[row != 0]
    output = pd.Series()
    counter = 1
    for reason, relevance in reasons.iteritems():
        output['reason' + str(counter)] = reason
        output['relevance' + str(counter)] = relevance
        counter = counter + 1
        if counter == 10:
            break
    for i in range(counter, 10):
        output['reason' + str(i)] = "n/a"
        output['relevance' + str(i)] = "n/a"
    return output

In [78]:
def explain(coefficients, data_type):
    if data_type == 'train':
        x = train_x
        y = train_y
        data = train_data
    else:
        x = test_x
        y = test_y
        data = test_data
    final_reasons = pd.DataFrame()
    final_reasons['head'] = data['head']
    final_reasons['tail'] = data['tail']
    repeated_coefficients = np.repeat(coefficients.T, x.shape[0], axis=0)
    weighted_x = x.apply(pd.to_numeric)
    explanations = weighted_x.mul(repeated_coefficients, axis=1)
    motives = explanations.apply(get_reasons2, axis=1)
    final_reasons = pd.concat([final_reasons, motives], axis=1)
    answers = model.predict_proba(x)[:, 1]
    final_reasons['y_hat'] = answers
    final_reasons['y'] = y
    return final_reasons

In [79]:
test_final_reasons = explain(coefficients, 'test')

In [21]:
with open('death_causes.txt', 'w+') as f:
    f.write(test_final_reasons.to_string())

In [23]:
test_final_reasons.to_csv("death_causes.csv")

In [145]:
test_final_reasons[(test_final_reasons['reason1']=='-gender-_gender-children-')]

Unnamed: 0,head,tail,reason1,relevance1,reason2,relevance2,reason3,relevance3,reason4,relevance4,...,reason6,relevance6,reason7,relevance7,reason8,relevance8,reason9,relevance9,y_hat,y
2,vatroslav_lisinski,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
12,karel_krautgartner,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,1
16,giovanni_maria_nanino,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
17,ramon_rivero,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
25,disco_d,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
57,karl_friedrich_abel,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
59,john_serry_sr,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
67,william_augustine_ogden,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
107,benedetto_marcello,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1
112,samuel_coleridge-taylor,composer,-gender-_gender-children-,5.9768,,,,,,,...,,,,,,,,,0.992337,-1


In [81]:
model.score(test_x, test_y)

0.7224888035828535

In [195]:
model.predict_proba(test_x)

array([[0.15951309, 0.84048691],
       [0.85069454, 0.14930546],
       [0.18043772, 0.81956228],
       ...,
       [0.7358523 , 0.2641477 ],
       [0.7358523 , 0.2641477 ],
       [0.7358523 , 0.2641477 ]])

In [196]:
test_y.value_counts()

-1    1102
 1     624
Name: label, dtype: int64

In [39]:
886./(886+92)

0.9059304703476483

In [40]:
model.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': 'balanced',
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.9803921568627451,
 'learning_rate': 'optimal',
 'loss': 'log',
 'max_iter': 1000,
 'n_iter': None,
 'n_jobs': 1,
 'penalty': 'elasticnet',
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'verbose': 0,
 'warm_start': False}

## Testing the model with different metrics

In [41]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [133]:
precision_score(test_y, model.predict(test_x))

0.5362537764350453

In [134]:
recall_score(test_y, model.predict(test_x))

0.5689102564102564

In [135]:
f1_score(test_y, model.predict(test_x))

0.5520995334370139

### Balanced vs imbalanced weights for classes

Let's see the difference between models when one don't balance the classes

In [129]:
model2 = SGDClassifier(loss="log", penalty="elasticnet", alpha=alpha, l1_ratio=l1_ratio,
                      max_iter=1000, tol=1e-3)
model2.fit(train_x, train_y)
f1_score(test_y, model2.predict(test_x))

0.5754026354319179

In [130]:
model2.score(test_x, test_y)

0.6639629200463499

In [131]:
precision_score(test_y, model2.predict(test_x))

0.5296495956873315

In [132]:
recall_score(test_y, model2.predict(test_x))

0.6298076923076923

From these experiments we see that using the option `class_weight="balanced"` favored recall over precision (which is consistent to [here](http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html)).

## Debugging the dataframes

Let's see if everything is ok with the feature (x) dataframes

In [41]:
len(train_x.columns)

13262

In [42]:
len(test_x.columns)

13262

In [45]:
set(train_x.columns) == set(test_x.columns)

True

Now let's check the labels (y)

In [49]:
len(train_y)

3873

In [48]:
len(test_y)

978

In [146]:
train_y.describe()

count    12886.000000
mean         0.155673
std          0.987847
min         -1.000000
25%         -1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: label, dtype: float64

Problem, the logistic regression is simply outputting the mean for all cases!!! Holly, man!

## Debugging the model

In [24]:
w_l1 = 0.5
w_l2 = 0.01
alpha = w_l1 + w_l2
l1_ratio = w_l1 / (w_l1 + w_l2)

# model = SGDClassifier(loss="log", penalty="elasticnet", alpha=alpha, l1_ratio=l1_ratio)
model = SGDClassifier(loss="log", penalty="elasticnet", l1_ratio=l1_ratio)

In [53]:
d1 = {
    "x1": 1,
    "x2": 1,
    "x3": 1,
}
d2 = {
    "x1": 1,
    "x2": 1,
    "x3": 1,
}
d3 = {
    "x1": 0,
    "x2": 0,
    "x3": 0,
}
d4 = {
    "x1": 1,
    "x2": 1,
    "x3": 0,
}
train_x = pd.DataFrame([d1, d2, d3, d4])
train_y = pd.DataFrame([1, 1, 0, 0])

In [54]:
model.fit(train_x, train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.980392156863,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [55]:
d4 = {
    "x1": 1,
    "x2": 1,
    "x3": 0,
}
d5 = {
    "x1": 1,
    "x2": 0,
    "x3": 0,
}
test_x = pd.DataFrame([d4, d5])
test_y = pd.DataFrame([1, 0])

In [56]:
model.predict(test_x)

array([0, 0])

In [29]:
test_x

Unnamed: 0,x1,x2,x3
0,1,1,1
1,1,0,0


In [32]:
pd.read_csv('/Users/Alvinho/pra/extract_feat__neg_by_random/institution/institution.csv')

Unnamed: 0.1,Unnamed: 0,head,tail,reason1,relevance1,reason2,relevance2,reason3,relevance3,reason4,...,reason6,relevance6,reason7,relevance7,reason8,relevance8,reason9,relevance9,y_hat,y
0,0,george_whelan_anderson_jr,united_states_naval_academy,-gender-_gender-place_of_birth-,1.943923,,,,,,...,,,,,,,,,0.710797,-1
1,1,frank_g_slaughter,johns_hopkins_university,-gender-_gender-institution-,0.089957,-location-_location-institution-,1.771595,-location-_place_of_death-institution-,0.198750,-nationality-_location-institution-,...,-place_of_death-_location-institution-,0.400842,-place_of_death-_place_of_birth-institution-,0.592028,-profession-_profession-institution-,0.855479,,,0.984797,-1
2,2,daniel_patrick_moynihan,london_school_of_economics,-gender-_gender-institution-,0.089957,-location-_location-institution-,1.771595,-location-_place_of_death-institution-,0.198750,-nationality-_location-institution-,...,-place_of_death-_location-institution-,0.400842,-place_of_death-_place_of_death-institution-,0.642994,-profession-_profession-institution-,0.855479,-religion-_religion-institution-,0.877574,0.993937,-1
3,3,kermit_roosevelt_jr,harvard_university,-_children-institution-,2.114499,-gender-_gender-location-,1.972592,-parents-institution-,2.133994,,...,,,,,,,,,0.994384,-1
4,4,john_f_potter,phillips_exeter_academy,-gender-_gender-institution-,0.089957,-nationality-_nationality-institution-,0.935160,-profession-_profession-institution-,0.855479,,...,,,,,,,,,0.697608,-1
5,5,joseph_rotblat,university_of_liverpool,-gender-_gender-institution-,0.089957,-nationality-_nationality-institution-,0.935160,-place_of_death-_place_of_death-institution-,0.642994,,...,,,,,,,,,0.651002,-1
6,6,wilgott_theophil_odhner,royal_institute_of_technology,-gender-_gender-institution-,0.089957,-nationality-_nationality-institution-,0.935160,-profession-_profession-institution-,0.855479,,...,,,,,,,,,0.697608,-1
7,7,william_backhouse_astor_jr,columbia_university,-_children-institution-,2.114499,-parents-institution-,2.133994,,,,...,,,,,,,,,0.960978,-1
8,8,william_b_hornblower,columbia_law_school,-gender-_gender-institution-,0.089957,-institution-_institution-institution-,2.721412,-profession-_profession-institution-,0.855479,,...,,,,,,,,,0.932276,-1
9,9,charles_le_gendre,university_of_paris,-gender-_gender-institution-,0.089957,-nationality-_location-_spouse-institution-,-0.408303,-nationality-_location-spouse-institution-,-0.408303,-nationality-_nationality-institution-,...,,,,,,,,,0.443699,-1
