# Creating the hybrid classifier
(the script is written for hybridizing with BERT. Just replace 'BERT' with 'XLNet' to get hybridization with XLNet.)

## Producing the training (or testing) set

In [1]:
# Import libraries
from collections import defaultdict
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import metrics 
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
import numpy as np

In [13]:
# import the trained model
import pickle
filename = 'mlp_model_trained_on_bert.sav'
trained_model = pickle.load(open(filename, 'rb'))
mlp = trained_model



In [14]:
bert_path_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/bert/results/untransform/results-untransform-rho'
gkr_path = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/gkr4nli/data/untransformed/sick_test_untransform-results.csv' 
merged_file_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid/sick_input/untransform-rho'
output_path_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid/untransform/results-hy-nli-untransform-rho'

for rho in range(0,120,20):
    r = str(rho)
    for experiment in range(5):
        e = str(experiment)
        end = r + '-' + e + '.csv'
        bert_path = bert_path_base + end
        merged_file = merged_file_base + end
        output_path = output_path_base + end

        
        # first
        dlFile = open (bert_path, 'r')
        symbFile = open (gkr_path, 'r')
        mergedFile = open (merged_file, 'w')
        mergedFile.write("ID\tComplexCtxs\tContraFlag\tVeridical\tAntiveridical\tAveridical\tEquals\tSuperclass\tSubclass\tDisjoint\tTargetLabel\n")

        # second
        # Read the files and initialize parameters
        dlLines = dlFile.readlines()
        symbLines = symbFile.readlines()
        # Create a dictionary holding the predicted label of the DL model. Will need it for evaluation as well.
        dict_of_dl_labels = defaultdict()

        # third
        for line in dlLines:
            line = line.replace("\n", "")
            elements = line.split("\t")
            dict_of_dl_labels[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")


        # fourth
        for line in symbLines:
            if line.startswith("pair_ID"):
                continue
            line = line.replace("\n", "")
            elements = line.split("\t")
            elements = elements[0:-1]
            id_ = elements[0]
            gold_label = elements[1]
            features = elements[2:-1]
            symb_label = elements[-1]
            dl_label = dict_of_dl_labels[id_]
            target_label = ""
            if dl_label == gold_label and symb_label == gold_label:
                target_label = "B"
            elif dl_label == gold_label:
                target_label = "DL"
            elif symb_label == gold_label:
                target_label = "S"
            else:
                target_label = "N"
            mergedFile.write(id_+"\t"+"\t".join([str(f) for f in features])+"\t" + target_label+"\n")


        # Close all opened files.
        dlFile.close()
        symbFile.close()
        mergedFile.close()



        # evaluate
        dlFile_TEST = open(bert_path, "r")
        symbFile_TEST = open(gkr_path, "r")

        # Read the files
        dlLines_TEST = dlFile_TEST.readlines()
        symbLines_TEST = symbFile_TEST.readlines()

        # Create a dictionary holding the predicted label of the DL model.
        dict_of_dl_labels_TEST = defaultdict()
        # Create a dictionary holding the predicted label of the GKR4NLI.
        dict_of_symb_labels_TEST = defaultdict()
        # Create a dictionary holding the gold label.
        dict_of_gold_labels_TEST = defaultdict()


        # Go through the dl file and store the predicted label of each pair in a dictionary.
        for line in dlLines_TEST:
            line = line.replace("\n", "")
            elements = line.split("\t")
            dict_of_dl_labels_TEST[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")

            
        # Go through the symbolic file and store the predicted label of GKR4NLI and the gold labels of each pair in a dictionary.
        for line in symbLines_TEST:
            if line.startswith("pair_ID"):
                continue
            line = line.replace("\n", "")
            elements = line.split("\t")
            elements = elements[0:-1]
            id_ = elements[0]
            gold_label = elements[1]
            dict_of_gold_labels_TEST[id_] = gold_label
            symb_label = elements[-1]
            dict_of_symb_labels_TEST[id_] = symb_label


        # six
        # Read test set to evaluate on it.
        test_data = pd.read_csv(merged_file, sep='\t', header=0)

        # print ("Dataset Lenght:: ", len(test_data))
        # print ("Dataset Shape:: ", test_data.shape)

        # print ("Dataset:: ")
        # print (test_data.head())


        X_test = test_data.values[:, 1:-1]
        Y_test = test_data.values[:, -1]


        # seven
        # Predict labels for test set. The predicted labels are one of S, DL or B, expressing the component that the hybrid 
        # classifier predicted to get the inference relation right.
        predicted = mlp.predict(X_test)
        # Write the final results into a file for better error-analysis.
        outputFile = open(output_path, 'w')
        outputFile.write("pair_ID\tHybridLabel\tMappedLabel\tGoldLabel\n")

        i = 0
        correct = 0
        for pred in predicted:
            test_id = np.array2string(test_data.values[i:i+1,0])
            test_id = test_id.replace("'","").replace("[","").replace("]","")
            #print (test_id)
            i += 1
            dl_pred = dict_of_dl_labels_TEST[test_id]
            symb_pred = dict_of_symb_labels_TEST[test_id]
            # print(symb_pred)
            gold = dict_of_gold_labels_TEST[test_id]
            hybrid_pred = ""   
            # map hybrid prediction to a proper inference label
            # if you are using our trained classifier, you have to use the following code: (because we used slighlty different
            # abbreviations for each label -- B for BERT, R for rule-based and BR for bert/rule-based) 
            if pred == "B":
                hybrid_pred = dl_pred
            elif pred == "R":
                hybrid_pred = symb_pred
            elif pred == "BR":
                hybrid_pred = dl_pred
            # If you have trained your own model, please use following code (and abbreviations):
            # if pred == "DL":
            #    hybrid_pred = dl_pred
            # if pred == "DL":
            #    hybrid_pred = dl_pred
            # elif pred == "S":
            #    hybrid_pred = symb_pred
            # elif pred == "B":
            #    hybrid_pred = dl_pred        
            # Check how many hybrid labels are indeed the correct labels.
            #print (hybrid_pred+ " "+gold)
            outputFile.write(test_id+"\t"+pred+"\t"+hybrid_pred+"\t"+gold+"\n")
            # !!!!!!! if you are evaluating on HANS, you need the following line to merge C and N to N !!!!!!!!
            #hybrid_pred = hybrid_pred.replace("C", "N")
            if hybrid_pred == gold:
                correct += 1
            
        # print ("No of correct classifications: "+str(correct))
        # print ("Percentage of correct classifications: "+str(correct/(len(test_data))))
        print(r, e, correct/len(test_data))

        # dlFile_TEST.close()
        # symbLines_TEST.close()
        outputFile.close()

C
N
C
E
E
C
C
N
N
N
C
E
E
C
N
N
C
N
E
N
N
N
C
N
E
N
N
N
N
N
E
E
N
N
N
N
N
E
C
E
N
N
C
C
N
C
N
N
N
E
N
N
C
E
E
E
E
E
N
N
E
N
N
C
N
N
N
C
N
N
E
N
N
N
N
C
N
E
E
C
E
C
E
C
E
E
E
N
N
E
E
C
E
C
E
E
E
E
N
N
E
E
N
N
N
N
N
N
N
N
N
N
E
N
N
N
N
N
C
E
N
N
N
E
N
E
C
E
N
N
N
N
N
E
N
N
N
N
N
E
N
N
N
N
N
N
N
N
E
E
N
N
E
N
N
N
C
C
E
N
N
N
N
N
N
N
C
N
N
N
N
N
N
C
N
N
C
N
N
N
N
N
N
N
C
E
C
E
C
C
N
E
N
N
E
E
N
C
N
N
N
N
N
C
E
N
N
N
N
E
N
N
N
N
N
N
N
C
N
N
N
C
E
E
N
N
N
E
C
N
N
E
C
N
N
N
N
N
C
N
N
E
N
N
C
N
N
C
E
E
C
N
N
N
C
N
E
N
N
E
C
E
N
N
N
C
E
E
C
N
N
C
E
E
N
N
N
N
N
E
N
N
E
N
E
E
N
N
N
N
E
N
C
C
N
C
N
E
N
N
N
E
N
E
C
N
N
E
E
E
C
N
N
N
E
C
N
N
N
N
N
N
N
E
N
N
E
C
N
N
E
E
E
N
N
E
E
N
E
C
E
E
C
E
N
N
E
N
E
E
N
N
C
N
N
N
N
N
N
N
N
C
N
C
N
N
N
E
C
N
N
N
N
C
N
N
N
N
N
E
C
N
N
N
N
N
N
E
C
E
N
N
N
E
C
N
N
C
N
E
N
C
N
N
N
N
C
N
E
C
N
E
N
C
N
C
N
N
N
C
N
N
N
N
N
N
N
N
N
N
N
N
N
N
C
E
N
N
E
N
N
E
N
N
N
N
N
E
N
N
C
N
E
N
N
N
N
E
N
N
N
C
N
N
N
E
N
N
N
E
E
E
E
N
N
N
E
E
N
N
E
C
N
N
N
C
N
N
C
E
E
N
N
N
E
E
N
N
N
N


KeyboardInterrupt: 

In [None]:
# transformed

In [25]:
bert_path_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/bert/results/transform/results-transform-rho'
gkr_path = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/gkr4nli/data/transformed/sick_test_transform-results.csv' 
merged_file_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid/sick_input/transform-rho'
output_path_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid/transform/results-hy-nli-transform-rho'

for rho in range(0,120,20):
    r = str(rho)
    for experiment in range(5):
        e = str(experiment)
        end = r + '-' + e + '.csv'
        bert_path = bert_path_base + end
        merged_file = merged_file_base + end
        output_path = output_path_base + end

        
        # first
        dlFile = open (bert_path, 'r')
        symbFile = open (gkr_path, 'r')
        mergedFile = open (merged_file, 'w')
        mergedFile.write("ID\tComplexCtxs\tContraFlag\tVeridical\tAntiveridical\tAveridical\tEquals\tSuperclass\tSubclass\tDisjoint\tTargetLabel\n")

        # second
        # Read the files and initialize parameters
        dlLines = dlFile.readlines()
        symbLines = symbFile.readlines()
        # Create a dictionary holding the predicted label of the DL model. Will need it for evaluation as well.
        dict_of_dl_labels = defaultdict()

        # third
        for line in dlLines:
            line = line.replace("\n", "")
            elements = line.split("\t")
            dict_of_dl_labels[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")


        # fourth
        for line in symbLines:
            if line.startswith("pair_ID"):
                continue
            line = line.replace("\n", "")
            elements = line.split("\t")
            elements = elements[0:-1]
            id_ = elements[0]
            gold_label = elements[1]
            features = elements[2:-1]
            symb_label = elements[-1]
            if id_ not in dict_of_dl_labels.keys():
                continue
            dl_label = dict_of_dl_labels[id_]
            target_label = ""
            if dl_label == gold_label and symb_label == gold_label:
                target_label = "B"
            elif dl_label == gold_label:
                target_label = "DL"
            elif symb_label == gold_label:
                target_label = "S"
            else:
                target_label = "N"
            mergedFile.write(id_+"\t"+"\t".join([str(f) for f in features])+"\t" + target_label+"\n")


        # Close all opened files.
        dlFile.close()
        symbFile.close()
        mergedFile.close()



        # evaluate
        dlFile_TEST = open(bert_path, "r")
        symbFile_TEST = open(gkr_path, "r")

        # Read the files
        dlLines_TEST = dlFile_TEST.readlines()
        symbLines_TEST = symbFile_TEST.readlines()

        # Create a dictionary holding the predicted label of the DL model.
        dict_of_dl_labels_TEST = defaultdict()
        # Create a dictionary holding the predicted label of the GKR4NLI.
        dict_of_symb_labels_TEST = defaultdict()
        # Create a dictionary holding the gold label.
        dict_of_gold_labels_TEST = defaultdict()


        # Go through the dl file and store the predicted label of each pair in a dictionary.
        for line in dlLines_TEST:
            line = line.replace("\n", "")
            elements = line.split("\t")
            dict_of_dl_labels_TEST[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")

            
        # Go through the symbolic file and store the predicted label of GKR4NLI and the gold labels of each pair in a dictionary.
        for line in symbLines_TEST:
            if line.startswith("pair_ID"):
                continue
            line = line.replace("\n", "")
            elements = line.split("\t")
            id_ = elements[0]
            gold_label = elements[1]
            dict_of_gold_labels_TEST[id_] = gold_label
            symb_label = elements[-1]
            dict_of_symb_labels_TEST[id_] = symb_label


        # six
        # Read test set to evaluate on it.
        test_data = pd.read_csv(merged_file, sep='\t', header=0)

        # print ("Dataset Lenght:: ", len(test_data))
        # print ("Dataset Shape:: ", test_data.shape)

        # print ("Dataset:: ")
        # print (test_data.head())


        X_test = test_data.values[:, 1:-1]
        Y_test = test_data.values[:, -1]


        # seven
        # Predict labels for test set. The predicted labels are one of S, DL or B, expressing the component that the hybrid 
        # classifier predicted to get the inference relation right.
        predicted = mlp.predict(X_test)
        # Write the final results into a file for better error-analysis.
        outputFile = open(output_path, 'w')
        outputFile.write("pair_ID\tHybridLabel\tMappedLabel\tGoldLabel\n")

        i = 0
        correct = 0
        for pred in predicted:
            test_id = np.array2string(test_data.values[i:i+1,0])
            test_id = test_id.replace("'","").replace("[","").replace("]","")
            #print (test_id)
            i += 1
            dl_pred = dict_of_dl_labels_TEST[test_id]
            symb_pred = dict_of_symb_labels_TEST[test_id]
            gold = dict_of_gold_labels_TEST[test_id]
            hybrid_pred = ""   
            # map hybrid prediction to a proper inference label
            # if you are using our trained classifier, you have to use the following code: (because we used slighlty different
            # abbreviations for each label -- B for BERT, R for rule-based and BR for bert/rule-based) 
            if pred == "B":
                hybrid_pred = dl_pred
            elif pred == "R":
                hybrid_pred = symb_pred
            elif pred == "BR":
                hybrid_pred = dl_pred
            # If you have trained your own model, please use following code (and abbreviations):
            #if pred == "DL":
            #    hybrid_pred = dl_pred
            #elif pred == "S":
            #    hybrid_pred = symb_pred
            #elif pred == "B":
            #    hybrid_pred = dl_pred        
            # Check how many hybrid labels are indeed the correct labels.
            #print (hybrid_pred+ " "+gold)
            outputFile.write(test_id+"\t"+pred+"\t"+hybrid_pred+"\t"+gold+"\n")
            # !!!!!!! if you are evaluating on HANS, you need the following line to merge C and N to N !!!!!!!!
            #hybrid_pred = hybrid_pred.replace("C", "N")
            if hybrid_pred == gold:
                correct += 1
            
        # print ("No of correct classifications: "+str(correct))
        # print ("Percentage of correct classifications: "+str(correct/(len(test_data))))
        print(r, e, correct/len(test_data))

        # dlFile_TEST.close()
        # symbLines_TEST.close()
        outputFile.close()

0 0 0.6638359569179785
0 1 0.6628003314001657
0 2 0.6269676884838442
0 3 0.6648715824357913
0 4 0.6783347141673571
20 0 0.6806130903065452
20 1 0.6853769676884839
20 2 0.6901408450704225
20 3 0.6849627174813587
20 4 0.6673570836785419
40 0 0.6810273405136703
40 1 0.6812344656172328
40 2 0.6795774647887324
40 3 0.6895194697597349
40 4 0.679784589892295
60 0 0.68019884009942
60 1 0.6868268434134217
60 2 0.682062966031483
60 3 0.6864125932062966
60 4 0.6870339685169843
80 0 0.6804059652029826
80 1 0.6870339685169843
80 2 0.6845484672742337
80 3 0.6915907207953604
80 4 0.6924192212096106
100 0 0.6866197183098591
100 1 0.6816487158243579
100 2 0.6783347141673571
100 3 0.6878624689312345
100 4 0.6833057166528583


In [None]:
# retrain

In [31]:
# Read training set that was created in the previous section.
training_data = pd.read_csv('/home/adam/anu/comp4450/hy-nli/Hy-NLI/data/SICK/trial_and_train/SICK_trial_and_train_BERT_GKR4NLI_input_for_hybrid_classifier.csv', sep='\t',header=0)
    
print ("Dataset Lenght:: ", len(training_data))
print ("Dataset Shape:: ", training_data.shape)

print ("Dataset:: ")
print (training_data.head())

Dataset Lenght::  6213
Dataset Shape::  (6213, 11)
Dataset:: 
     ID  ComplexCtxs  ContraFlag  Veridical  Antiveridical  Averidical  \
0    4a            1           0          1              1           0   
1    4b            0           0          1              0           0   
2   24a            0           0          1              0           0   
3  105a            0           0          1              0           0   
4  116a            1           0          1              0           0   

   Equals  Superclass  Subclass  Disjoint TargetLabel  
0       1           0         0         0           B  
1       1           0         0         0           B  
2       1           1         0         0           B  
3       1           1         0         0           B  
4       0           1         0         0           B  


In [32]:
# Check how the labels are distributed in order to resample properly.
dl = training_data[training_data.TargetLabel=="DL"]
symbolic = training_data[training_data.TargetLabel=="S"]
both = training_data[training_data.TargetLabel=="B"]
none = training_data[training_data.TargetLabel=="N"]
training_data['TargetLabel'].value_counts()

B     4706
DL    1234
N      195
S       78
Name: TargetLabel, dtype: int64

In [33]:
# the pairs with DL and B target labels have to be downsampled to match the much fewer instances of S labels
# (SICK is a relatively easy corpus where most pairs can be solved equally well by the two approaches and the
# rest of them can be mostly solved by the DL model.)
dl_downsampled = resample(dl, 
                                replace=False,     # sample with replacement
                               n_samples=55,    # to match minority class 60
                             random_state=123) # reproducible results

both_downsampled = resample(both, 
                                replace=True,     # sample with replacement
                               n_samples=55,    # to match minority class 78
                             random_state=123) # reproducible results


resampled_set = pd.concat([symbolic,dl_downsampled, both_downsampled])
resampled_set['TargetLabel'].value_counts()

S     78
DL    55
B     55
Name: TargetLabel, dtype: int64

In [34]:
# Define training set
X_train = resampled_set.values[:, 1:-1]
Y_train = resampled_set.values[:,-1]

In [35]:
# # train a random forest for XplaiNLI
# from sklearn.ensemble import RandomForestClassifier

# clf_forest = RandomForestClassifier(n_estimators=30, max_depth=25, random_state=0)
# clf_forest.fit(X_train, Y_train)
# import pickle
# filename = 'hybrid_model_for_XplaiNLI.sav'
# pickle.dump(clf_forest, open(filename, 'wb'))

In [36]:
mlp = MLPClassifier(hidden_layer_sizes=(8,),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.01)
mlp.fit(X_train, Y_train)

MLPClassifier(alpha=0.01, hidden_layer_sizes=(8,), learning_rate='adaptive',
              learning_rate_init=0.01, max_iter=1000)

In [37]:
bert_path_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/bert/results/untransform/results-untransform-rho'
gkr_path = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/gkr4nli/data/untransformed/sick_test_untransform-results.csv' 
merged_file_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid-retrained/sick_input/untransform-rho'
output_path_base = '/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid-retrained/untransform/results-hy-nli-untransform-rho'

for rho in range(0,120,20):
    r = str(rho)
    for experiment in range(5):
        e = str(experiment)
        end = r + '-' + e + '.csv'
        bert_path = bert_path_base + end
        merged_file = merged_file_base + end
        output_path = output_path_base + end

        
        # first
        dlFile = open (bert_path, 'r')
        symbFile = open (gkr_path, 'r')
        mergedFile = open (merged_file, 'w')
        mergedFile.write("ID\tComplexCtxs\tContraFlag\tVeridical\tAntiveridical\tAveridical\tEquals\tSuperclass\tSubclass\tDisjoint\tTargetLabel\n")

        # second
        # Read the files and initialize parameters
        dlLines = dlFile.readlines()
        symbLines = symbFile.readlines()
        # Create a dictionary holding the predicted label of the DL model. Will need it for evaluation as well.
        dict_of_dl_labels = defaultdict()

        # third
        for line in dlLines:
            line = line.replace("\n", "")
            elements = line.split("\t")
            dict_of_dl_labels[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")


        # fourth
        for line in symbLines:
            if line.startswith("pair_ID"):
                continue
            line = line.replace("\n", "")
            elements = line.split("\t")
            elements = elements[0:-1]
            id_ = elements[0]
            gold_label = elements[1]
            features = elements[2:-1]
            symb_label = elements[-1]
            dl_label = dict_of_dl_labels[id_]
            target_label = ""
            if dl_label == gold_label and symb_label == gold_label:
                target_label = "B"
            elif dl_label == gold_label:
                target_label = "DL"
            elif symb_label == gold_label:
                target_label = "S"
            else:
                target_label = "N"
            mergedFile.write(id_+"\t"+"\t".join([str(f) for f in features])+"\t" + target_label+"\n")


        # Close all opened files.
        dlFile.close()
        symbFile.close()
        mergedFile.close()



        # evaluate
        dlFile_TEST = open(bert_path, "r")
        symbFile_TEST = open(gkr_path, "r")

        # Read the files
        dlLines_TEST = dlFile_TEST.readlines()
        symbLines_TEST = symbFile_TEST.readlines()

        # Create a dictionary holding the predicted label of the DL model.
        dict_of_dl_labels_TEST = defaultdict()
        # Create a dictionary holding the predicted label of the GKR4NLI.
        dict_of_symb_labels_TEST = defaultdict()
        # Create a dictionary holding the gold label.
        dict_of_gold_labels_TEST = defaultdict()


        # Go through the dl file and store the predicted label of each pair in a dictionary.
        for line in dlLines_TEST:
            line = line.replace("\n", "")
            elements = line.split("\t")
            dict_of_dl_labels_TEST[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")

            
        # Go through the symbolic file and store the predicted label of GKR4NLI and the gold labels of each pair in a dictionary.
        for line in symbLines_TEST:
            if line.startswith("pair_ID"):
                continue
            line = line.replace("\n", "")
            elements = line.split("\t")
            id_ = elements[0]
            gold_label = elements[1]
            dict_of_gold_labels_TEST[id_] = gold_label
            symb_label = elements[-1]
            dict_of_symb_labels_TEST[id_] = symb_label


        # six
        # Read test set to evaluate on it.
        test_data = pd.read_csv(merged_file, sep='\t', header=0)

        # print ("Dataset Lenght:: ", len(test_data))
        # print ("Dataset Shape:: ", test_data.shape)

        # print ("Dataset:: ")
        # print (test_data.head())


        X_test = test_data.values[:, 1:-1]
        Y_test = test_data.values[:, -1]


        # seven
        # Predict labels for test set. The predicted labels are one of S, DL or B, expressing the component that the hybrid 
        # classifier predicted to get the inference relation right.
        predicted = mlp.predict(X_test)
        # Write the final results into a file for better error-analysis.
        outputFile = open(output_path, 'w')
        outputFile.write("pair_ID\tHybridLabel\tMappedLabel\tGoldLabel\n")

        i = 0
        correct = 0
        for pred in predicted:
            test_id = np.array2string(test_data.values[i:i+1,0])
            test_id = test_id.replace("'","").replace("[","").replace("]","")
            #print (test_id)
            i += 1
            dl_pred = dict_of_dl_labels_TEST[test_id]
            symb_pred = dict_of_symb_labels_TEST[test_id]
            gold = dict_of_gold_labels_TEST[test_id]
            hybrid_pred = ""   
            # map hybrid prediction to a proper inference label
            # if you are using our trained classifier, you have to use the following code: (because we used slighlty different
            # abbreviations for each label -- B for BERT, R for rule-based and BR for bert/rule-based) 
            if pred == "B":
                hybrid_pred = dl_pred
            elif pred == "R":
                hybrid_pred = symb_pred
            elif pred == "BR":
                hybrid_pred = dl_pred
            # If you have trained your own model, please use following code (and abbreviations):
            #if pred == "DL":
            #    hybrid_pred = dl_pred
            #elif pred == "S":
            #    hybrid_pred = symb_pred
            #elif pred == "B":
            #    hybrid_pred = dl_pred        
            # Check how many hybrid labels are indeed the correct labels.
            #print (hybrid_pred+ " "+gold)
            outputFile.write(test_id+"\t"+pred+"\t"+hybrid_pred+"\t"+gold+"\n")
            # !!!!!!! if you are evaluating on HANS, you need the following line to merge C and N to N !!!!!!!!
            #hybrid_pred = hybrid_pred.replace("C", "N")
            if hybrid_pred == gold:
                correct += 1
            
        # print ("No of correct classifications: "+str(correct))
        # print ("Percentage of correct classifications: "+str(correct/(len(test_data))))
        print(r, e, correct/len(test_data))

        # dlFile_TEST.close()
        # symbLines_TEST.close()
        outputFile.close()

0 0 0.20156959933911606
0 1 0.19991738950846757
0 2 0.19578686493184635
0 3 0.19991738950846757
0 4 0.2040479140850888
20 0 0.20012391573729862
20 1 0.20342833539859562
20 2 0.201363073110285
20 3 0.20177612556794713
20 4 0.201363073110285
40 0 0.20156959933911606
40 1 0.20177612556794713
40 2 0.19950433705080545
40 3 0.20177612556794713
40 4 0.20074349442379183
60 0 0.20115654688145396
60 1 0.20260223048327136
60 2 0.20095002065262288
60 3 0.1992978108219744
60 4 0.19971086327963652
80 0 0.2023957042544403
80 1 0.20177612556794713
80 2 0.20322180916976457
80 3 0.20198265179677818
80 4 0.20012391573729862
100 0 0.20074349442379183
100 1 0.19950433705080545
100 2 0.2003304419661297
100 3 0.20053696819496075
100 4 0.20177612556794713


In [4]:
# Open input and output files. The input files are the output files of the GKR4NLI system and the DL model.
dlFile = open ('/home/adam/anu/comp4450/hy-nli/Hy-NLI/data/SICK/test/SICK_test_BERT_results.csv', 'r')
symbFile = open ('/home/adam/anu/comp4450/hy-nli/Hy-NLI/data/SICK/test/SICK_test_GKR4NLI_results.csv', 'r')
mergedFile = open ('/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid/untransform/SICK_test_BERT_GKR4NLI_input_for_hybrid_classifier.csv', 'w')
mergedFile.write("ID\tComplexCtxs\tContraFlag\tVeridical\tAntiveridical\tAveridical\tEquals\tSuperclass\tSubclass\tDisjoint\tTargetLabel\n")


109

In [5]:
# Read the files and initialize parameters
dlLines = dlFile.readlines()
symbLines = symbFile.readlines()
# Create a dictionary holding the predicted label of the DL model. Will need it for evaluation as well.
dict_of_dl_labels = defaultdict()

In [6]:
# Go through the dl file and store the predicted label of each pair in a dictionary.
for line in dlLines:
    line = line.replace("\n", "")
    elements = line.split("\t")
    dict_of_dl_labels[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")
    

In [7]:
# Go through the symbolic file and compare the symbolic and the dl labels to gold and produce the final
# target label that will be learned by the classifier. Produce a merged file with the features and the target label.
for line in symbLines:
    if line.startswith("pair_ID"):
        continue
    line = line.replace("\n", "")
    elements = line.split("\t")
    id_ = elements[0]
    gold_label = elements[1]
    features = elements[2:-1]
    symb_label = elements[-1]
    dl_label = dict_of_dl_labels[id_]
    target_label = ""
    if dl_label == gold_label and symb_label == gold_label:
        target_label = "B"
    elif dl_label == gold_label:
        target_label = "DL"
    elif symb_label == gold_label:
        target_label = "S"
    else:
         target_label = "N"
    mergedFile.write(id_+"\t"+"\t".join([str(f) for f in features])+"\t" + target_label+"\n")



In [8]:
# Close all opened files.
dlFile.close()
symbFile.close()
mergedFile.close()

## Evaluating the classifier

In [9]:
# First, read the test set to store the symbolic, DL and gold labels of the pairs

# Open the files
dlFile_TEST = open('/home/adam/anu/comp4450/hy-nli/Hy-NLI/data/SICK/test/SICK_test_BERT_results.csv', "r")
symbFile_TEST = open('/home/adam/anu/comp4450/hy-nli/Hy-NLI/data/SICK/test/SICK_test_GKR4NLI_results.csv', "r")

# Read the files
dlLines_TEST = dlFile_TEST.readlines()
symbLines_TEST = symbFile_TEST.readlines()

# Create a dictionary holding the predicted label of the DL model.
dict_of_dl_labels_TEST = defaultdict()
# Create a dictionary holding the predicted label of the GKR4NLI.
dict_of_symb_labels_TEST = defaultdict()
# Create a dictionary holding the gold label.
dict_of_gold_labels_TEST = defaultdict()


# Go through the dl file and store the predicted label of each pair in a dictionary.
for line in dlLines_TEST:
    line = line.replace("\n", "")
    elements = line.split("\t")
    dict_of_dl_labels_TEST[elements[0]] = elements[1].replace("0", "E").replace("1", "C").replace("2", "N")

    
# Go through the symbolic file and store the predicted label of GKR4NLI and the gold labels of each pair in a dictionary.
for line in symbLines_TEST:
    if line.startswith("pair_ID"):
        continue
    line = line.replace("\n", "")
    elements = line.split("\t")
    id_ = elements[0]
    gold_label = elements[1]
    dict_of_gold_labels_TEST[id_] = gold_label
    symb_label = elements[-1]
    dict_of_symb_labels_TEST[id_] = symb_label


In [10]:
# Read test set to evaluate on it.
test_data = pd.read_csv('/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid/untransform/SICK_test_BERT_GKR4NLI_input_for_hybrid_classifier.csv', sep= '\t', header= 0)

print ("Dataset Lenght:: ", len(test_data))
print ("Dataset Shape:: ", test_data.shape)

print ("Dataset:: ")
print (test_data.head())


X_test = test_data.values[:, 1:-1]
Y_test = test_data.values[:, -1]


Dataset Lenght::  4674
Dataset Shape::  (4674, 11)
Dataset:: 
    ID  ComplexCtxs  ContraFlag  Veridical  Antiveridical  Averidical  Equals  \
0   6a            0           0          1              0           0       0   
1   7a            0           0          1              0           0       1   
2   8a            0           0          1              0           0       1   
3  10a            0           0          1              0           0       1   
4  11a            0           0          1              0           0       1   

   Superclass  Subclass  Disjoint TargetLabel  
0           0         1         0           B  
1           1         0         0           B  
2           1         0         0           B  
3           0         0         0           B  
4           1         0         0           B  


In [12]:
# Predict labels for test set. The predicted labels are one of S, DL or B, expressing the component that the hybrid 
# classifier predicted to get the inference relation right.
predicted = mlp.predict(X_test)
# Write the final results into a file for better error-analysis.
outputFile = open('/home/adam/anu/comp4450/hy-nli/Hy-NLI/output/hybrid/untransform/SICK_test_hybrid_results_with_BERT.csv', 'w')
outputFile.write("pair_ID\tHybridLabel\tMappedLabel\tGoldLabel\n")

i = 0
correct = 0
for pred in predicted:
    test_id = np.array2string(test_data.values[i:i+1,0])
    test_id = test_id.replace("'","").replace("[","").replace("]","")
    #print (test_id)
    i += 1
    dl_pred = dict_of_dl_labels_TEST[test_id]
    symb_pred = dict_of_symb_labels_TEST[test_id]
    gold = dict_of_gold_labels_TEST[test_id]
    hybrid_pred = ""   
    # map hybrid prediction to a proper inference label
    # if you are using our trained classifier, you have to use the following code: (because we used slighlty different
    # abbreviations for each label -- B for BERT, R for rule-based and BR for bert/rule-based) 
    if pred == "B":
        hybrid_pred = dl_pred
    elif pred == "R":
        hybrid_pred = symb_pred
    elif pred == "BR":
        hybrid_pred = dl_pred
    # If you have trained your own model, please use following code (and abbreviations):
    #if pred == "DL":
    #    hybrid_pred = dl_pred
    #elif pred == "S":
    #    hybrid_pred = symb_pred
    #elif pred == "B":
    #    hybrid_pred = dl_pred        
    # Check how many hybrid labels are indeed the correct labels.
    #print (hybrid_pred+ " "+gold)
    outputFile.write(test_id+"\t"+pred+"\t"+hybrid_pred+"\t"+gold+"\n")
    # !!!!!!! if you are evaluating on HANS, you need the following line to merge C and N to N !!!!!!!!
    #hybrid_pred = hybrid_pred.replace("C", "N")
    if hybrid_pred == gold:
        correct += 1

print ("No of correct classifications: "+str(correct))
print ("Percentage of correct classifications: "+str(correct/(len(test_data))))

outputFile.close()


No of correct classifications: 3965
Percentage of correct classifications: 0.8483097988874626
