In [1]:
from nltk.parse import DependencyGraph, DependencyEvaluator
from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
import tempfile, os
from os import remove
import re
import pickle
try:
    from numpy import array
    from scipy import sparse
    from sklearn.datasets import load_svmlight_file
    from sklearn import svm
    from sklearn import linear_model
    from sklearn import neural_network
except ImportError:
    pass


  from collections import Sequence
  from collections import Sequence
  from collections import Sequence


In [2]:
class TransitionParserCustom(TransitionParser):
    """TransitionParser variant whose ``train`` step can fit one of several scikit-learn classifiers."""

    def train(self, depgraphs, modelfile, modelType='logistic', njobs=1, verbose=True):
        """
        Fit a transition classifier on ``depgraphs`` and pickle it to ``modelfile``.

        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        :param modelType : which classifier to fit: 'logistic' (LogisticRegression),
            'MLP' (MLPClassifier with default settings) or 'SVM' (polynomial-kernel SVC)
        :type modelType : str
        :param njobs : forwarded as ``n_jobs`` to LogisticRegression; unused by the other model types
        :type njobs : int
        :param verbose : forwarded as ``verbose`` to LogisticRegression and SVC; unused for 'MLP'
        :raises ValueError : if ``modelType`` is not one of the supported names
        """
        # Build the estimator first so an unsupported modelType fails fast with a
        # clear message, before any training examples are generated.
        if modelType == 'logistic':
            model = linear_model.LogisticRegression(
                C=0.5,
                verbose=verbose,
                n_jobs=njobs,
                solver='lbfgs'
            )
        elif modelType == 'MLP':
            model = neural_network.MLPClassifier()
        elif modelType == 'SVM':
            model = svm.SVC(
                kernel='poly',
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=verbose,
                probability=True)
        else:
            raise ValueError(
                "Unsupported modelType %r; expected 'logistic', 'MLP' or 'SVM'" % (modelType,)
            )

        # Created outside the try block so the cleanup below can never hit an
        # unbound name and mask the real error.
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)
        try:
            try:
                if self._algorithm == self.ARC_STANDARD:
                    self._create_training_examples_arc_std(depgraphs, input_file)
                else:
                    self._create_training_examples_arc_eager(depgraphs, input_file)
            finally:
                # Release the handle even on failure, before the file is re-read or removed.
                input_file.close()

            # The temporary file holds the training examples in svmlight format.
            x_train, y_train = load_svmlight_file(input_file.name)
            model.fit(x_train, y_train)

            # Save the model to file name (as pickle); the context manager
            # guarantees the output is flushed and closed.
            with open(modelfile, 'wb') as model_out:
                pickle.dump(model, model_out)
        finally:
            os.remove(input_file.name)

In [3]:

# Train/test corpus paths handed to loadDataIntoDependencyGraph below
# (presumably the Universal Dependencies Hindi treebank in CoNLL-U format; confirm).
trainFile = "./UD_Hindi/hi-ud-train.conllu"
testFile = "./UD_Hindi/hi-ud-test.conllu"
def _readConlluSentences(path, haveMorphoFeatures):
    """Read ``path`` and return one tab-separated text block per blank-line-delimited sentence."""
    sentences = []
    rows = []
    # CoNLL-U files are UTF-8 by specification; do not rely on the platform default.
    with open(path, encoding='utf-8') as fd:
        for line in fd:
            if line.strip() == "":
                # A blank line ends the current sentence; runs of blank lines
                # must not produce empty sentences.
                if rows:
                    sentences.append("".join(rows))
                    rows = []
                continue

            # Strip only the line terminator so a final line without a newline
            # keeps the last character of its last column.
            cols = line.rstrip("\n").split("\t")
            if haveMorphoFeatures:
                # Fold column 10 into column 6 (MISC into FEATS in the CoNLL-U layout).
                cols[5] = cols[5] + '|' + cols[9]
            else:
                cols[5] = '_'
            rows.append("\t".join(cols) + "\n")

    # Flush the last sentence when the file does not end with a blank line.
    if rows:
        sentences.append("".join(rows))
    return sentences


def loadDataIntoDependencyGraph(trainFile, testFile, haveMorphoFeatures = True):
    """
    Load the train and test corpora as lists of DependencyGraph objects.

    Each file is split into sentences on blank lines. Column 6 of every token
    row is rewritten before the graph is built: with ``haveMorphoFeatures`` it
    becomes ``<column 6>|<column 10>``, otherwise it is replaced by ``'_'``.
    Empty sentences are skipped and a final sentence with no trailing blank
    line is still returned.

    :param trainFile: path of the training corpus
    :param testFile: path of the test corpus
    :param haveMorphoFeatures: keep (and extend) the feature column when True,
        blank it out when False
    :return: tuple ``(trainDataGraph, testDataGraph)`` of DependencyGraph lists
    """
    trainDataGraph = [
        DependencyGraph(sentence)
        for sentence in _readConlluSentences(trainFile, haveMorphoFeatures)
    ]
    testDataGraph = [
        DependencyGraph(sentence)
        for sentence in _readConlluSentences(testFile, haveMorphoFeatures)
    ]
    return trainDataGraph, testDataGraph

In [4]:
# Build both corpus variants once so every experiment below shares them.
# Variant 1: feature column kept (morphological features enabled).
trainDataGraphMorpho, testDataGraphMorpho = loadDataIntoDependencyGraph(
    trainFile,
    testFile,
    haveMorphoFeatures=True,
)
# Variant 2: feature column blanked out (no morphological features).
trainDataGraph, testDataGraph = loadDataIntoDependencyGraph(
    trainFile,
    testFile,
    haveMorphoFeatures=False,
)

  "The graph doesn't contain a node "
  "The graph doesn't contain a node "
  "The graph doesn't contain a node "


In [5]:
# SVM classifier, arc-standard transition system.
parser_std = TransitionParserCustom('arc-standard')

# Corpus without morphological features: train, then parse the test set.
parser_std.train(
    trainDataGraph,
    modelfile='temp.arcstd.model',
    modelType='SVM',
    verbose=False,
)
result_std = parser_std.parse(testDataGraph, 'temp.arcstd.model')

# Corpus with morphological features.
parser_std.train(
    trainDataGraphMorpho,
    modelfile='temp.arcstd.morpho.model',
    modelType='SVM',
    verbose=False,
)
result_std_morpho = parser_std.parse(testDataGraphMorpho, 'temp.arcstd.morpho.model')

 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476


In [6]:
# SVM classifier, arc-eager transition system.
parser_eager = TransitionParserCustom('arc-eager')

# Corpus without morphological features: train, then parse the test set.
parser_eager.train(
    trainDataGraph,
    modelfile='temp.arceager.model',
    modelType='SVM',
    verbose=False,
)
result_eager = parser_eager.parse(testDataGraph, 'temp.arceager.model')

# Corpus with morphological features.
parser_eager.train(
    trainDataGraphMorpho,
    modelfile='temp.arceager.morpho.model',
    modelType='SVM',
    verbose=False,
)
result_eager_morpho = parser_eager.parse(testDataGraphMorpho, 'temp.arceager.morpho.model')

 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476


In [7]:
# One evaluator per SVM run, pairing each parsed output with its gold test graphs.
de1, de2, de3, de4 = (
    DependencyEvaluator(parsed, gold)
    for parsed, gold in (
        (result_std, testDataGraph),
        (result_std_morpho, testDataGraphMorpho),
        (result_eager, testDataGraph),
        (result_eager_morpho, testDataGraphMorpho),
    )
)

In [8]:
# SVM scores, one eval() tuple per line in the order de1..de4.
print(*(scorer.eval() for scorer in (de1, de2, de3, de4)), sep="\n")

(0.8473167044595616, 0.764928193499622)
(0.9123204837490552, 0.8337112622826909)
(0.8662131519274376, 0.7687074829931972)
(0.9123204837490552, 0.8276643990929705)
(0.8473167044595616, 0.764928193499622)
(0.9123204837490552, 0.8337112622826909)
(0.8662131519274376, 0.7687074829931972)
(0.9123204837490552, 0.8276643990929705)
(0.8473167044595616, 0.764928193499622)
(0.9123204837490552, 0.8337112622826909)
(0.8662131519274376, 0.7687074829931972)
(0.9123204837490552, 0.8276643990929705)


In [9]:
# Logistic regression classifier, arc-standard transition system.
parser_std = TransitionParserCustom('arc-standard')

# Corpus without morphological features: train, then parse the test set.
parser_std.train(
    trainDataGraph,
    modelfile='temp.arcstd.model',
    modelType='logistic',
    njobs=48,
    verbose=False,
)
result_std_logistic = parser_std.parse(testDataGraph, 'temp.arcstd.model')

# Corpus with morphological features.
parser_std.train(
    trainDataGraphMorpho,
    modelfile='temp.arcstd.morpho.model',
    modelType='logistic',
    njobs=48,
    verbose=False,
)
result_std_morpho_logistic = parser_std.parse(testDataGraphMorpho, 'temp.arcstd.morpho.model')

 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476


In [10]:
# Logistic regression classifier, arc-eager transition system.
parser_eager = TransitionParserCustom('arc-eager')

# Corpus without morphological features: train, then parse the test set.
parser_eager.train(
    trainDataGraph,
    modelfile='temp.arceager.model',
    modelType='logistic',
    njobs=48,
    verbose=False,
)
result_eager_logistic = parser_eager.parse(testDataGraph, 'temp.arceager.model')

# Corpus with morphological features.
parser_eager.train(
    trainDataGraphMorpho,
    modelfile='temp.arceager.morpho.model',
    modelType='logistic',
    njobs=48,
    verbose=False,
)
result_eager_morpho_logistic = parser_eager.parse(testDataGraphMorpho, 'temp.arceager.morpho.model')

 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476


In [11]:
# One evaluator per logistic-regression run, pairing parsed output with its gold test graphs.
de5, de6, de7, de8 = (
    DependencyEvaluator(parsed, gold)
    for parsed, gold in (
        (result_std_logistic, testDataGraph),
        (result_std_morpho_logistic, testDataGraphMorpho),
        (result_eager_logistic, testDataGraph),
        (result_eager_morpho_logistic, testDataGraphMorpho),
    )
)

In [12]:
# Logistic-regression scores, one eval() tuple per line in the order de5..de8.
print(*(scorer.eval() for scorer in (de5, de6, de7, de8)), sep="\n")

(0.7928949357520786, 0.6817838246409675)
(0.8669690098261527, 0.7671957671957672)
(0.8435374149659864, 0.7278911564625851)
(0.9024943310657596, 0.8027210884353742)
(0.7928949357520786, 0.6817838246409675)
(0.8669690098261527, 0.7671957671957672)
(0.8435374149659864, 0.7278911564625851)
(0.9024943310657596, 0.8027210884353742)
(0.7928949357520786, 0.6817838246409675)
(0.8669690098261527, 0.7671957671957672)
(0.8435374149659864, 0.7278911564625851)
(0.9024943310657596, 0.8027210884353742)


In [13]:
# Neural-network (MLP) classifier, arc-standard transition system.
parser_std = TransitionParserCustom('arc-standard')

# Corpus without morphological features: train, then parse the test set.
parser_std.train(
    trainDataGraph,
    modelfile='temp.arcstd.model',
    modelType='MLP',
    verbose=False,
)
result_std_mlp = parser_std.parse(testDataGraph, 'temp.arcstd.model')

# Corpus with morphological features.
parser_std.train(
    trainDataGraphMorpho,
    modelfile='temp.arcstd.morpho.model',
    modelType='MLP',
    verbose=False,
)
result_std_morpho_mlp = parser_std.parse(testDataGraphMorpho, 'temp.arcstd.morpho.model')

 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476


In [14]:
# Neural-network (MLP) classifier, arc-eager transition system.
parser_eager = TransitionParserCustom('arc-eager')

# Corpus without morphological features: train, then parse the test set.
parser_eager.train(
    trainDataGraph,
    modelfile='temp.arceager.model',
    modelType='MLP',
    verbose=False,
)
result_eager_mlp = parser_eager.parse(testDataGraph, 'temp.arceager.model')

# Corpus with morphological features.
parser_eager.train(
    trainDataGraphMorpho,
    modelfile='temp.arceager.morpho.model',
    modelType='MLP',
    verbose=False,
)
result_eager_morpho_mlp = parser_eager.parse(testDataGraphMorpho, 'temp.arceager.morpho.model')

 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476
 Number of training examples : 500
 Number of valid (projective) examples : 476


In [15]:
# One evaluator per MLP run, pairing parsed output with its gold test graphs.
de9, de10, de11, de12 = (
    DependencyEvaluator(parsed, gold)
    for parsed, gold in (
        (result_std_mlp, testDataGraph),
        (result_std_morpho_mlp, testDataGraphMorpho),
        (result_eager_mlp, testDataGraph),
        (result_eager_morpho_mlp, testDataGraphMorpho),
    )
)

In [16]:
# MLP scores, one eval() tuple per line in the order de9..de12.
print(*(scorer.eval() for scorer in (de9, de10, de11, de12)), sep="\n")

(0.8012093726379441, 0.690854119425548)
(0.8677248677248677, 0.7671957671957672)
(0.8450491307634165, 0.7218442932728647)
(0.8767951625094482, 0.7762660619803476)
(0.8012093726379441, 0.690854119425548)
(0.8677248677248677, 0.7671957671957672)
(0.8450491307634165, 0.7218442932728647)
(0.8767951625094482, 0.7762660619803476)
(0.8012093726379441, 0.690854119425548)
(0.8677248677248677, 0.7671957671957672)
(0.8450491307634165, 0.7218442932728647)
(0.8767951625094482, 0.7762660619803476)


In [17]:
# Labelled summary of every run: for each classifier, a heading followed by a
# (label, eval() tuple) pair of lines for each of its four evaluators.
print("\n".join(
    text
    for heading, scorers in (
        ("SVM", (de1, de2, de3, de4)),
        ("Logistic", (de5, de6, de7, de8)),
        ("MLP", (de9, de10, de11, de12)),
    )
    for text in (
        heading,
        "No Morpho STD", str(scorers[0].eval()),
        "Morpho STD", str(scorers[1].eval()),
        "No Morpho EAGER", str(scorers[2].eval()),
        "Morpho EAGER", str(scorers[3].eval()),
    )
))

SVM
No Morpho STD
(0.8473167044595616, 0.764928193499622)
Morpho STD
(0.9123204837490552, 0.8337112622826909)
No Morpho EAGER
(0.8662131519274376, 0.7687074829931972)
Morpho EAGER
(0.9123204837490552, 0.8276643990929705)
Logistic
No Morpho STD
(0.7928949357520786, 0.6817838246409675)
Morpho STD
(0.8669690098261527, 0.7671957671957672)
No Morpho EAGER
(0.8435374149659864, 0.7278911564625851)
Morpho EAGER
(0.9024943310657596, 0.8027210884353742)
MLP
No Morpho STD
(0.8012093726379441, 0.690854119425548)
Morpho STD
(0.8677248677248677, 0.7671957671957672)
No Morpho EAGER
(0.8450491307634165, 0.7218442932728647)
Morpho EAGER
(0.8767951625094482, 0.7762660619803476)
SVM
No Morpho STD
(0.8473167044595616, 0.764928193499622)
Morpho STD
(0.9123204837490552, 0.8337112622826909)
No Morpho EAGER
(0.8662131519274376, 0.7687074829931972)
Morpho EAGER
(0.9123204837490552, 0.8276643990929705)
Logistic
No Morpho STD
(0.7928949357520786, 0.6817838246409675)
Morpho STD
(0.8669690098261527, 0.767195767195