In [1]:
from nltk.parse import DependencyGraph, DependencyEvaluator
from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
import tempfile, os
import re
import pickle
try:
    from numpy import array
    from scipy import sparse
    from sklearn.datasets import load_svmlight_file
    from sklearn import svm
except ImportError:
    pass

In [2]:
class TransitionParserCustom(TransitionParser):
    def train(self, depgraphs, modelfile, verbose=True):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """

        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='transition_parse.train',
                dir=tempfile.gettempdir(),
                delete=False)

            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameter is set according to the paper:
            # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
            # Todo : because of probability = True => very slow due to
            # cross-validation. Need to improve the speed here
            model = svm.SVC(
                kernel='poly',
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=verbose,
                probability=True)

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))
        finally:
            os.remove(input_file.name)

In [3]:
haveMorphoFeatures = False

file = "./UD_Hindi/hi-ud-train.conllu"
fd = open(file)
trainData = []
graph = ""
for line in fd:
    if line.strip() == "":
        trainData.append(graph)
        graph = ""
        continue
        
    cols = re.split("\t",line)
    if haveMorphoFeatures:
        cols[5] = cols[5] + '|' + cols[9][:-1]
    else:
        cols[5] = '_'
    #print(cols)
    finalLine = ""
    for i in cols:
        finalLine += i + "\t"
    finalLine = finalLine[:-1]
    finalLine += "\n"
    graph = graph + finalLine
    
        
filet = "./UD_Hindi/hi-ud-test.conllu"
fd = open(filet)
testData = []
graph = ""
for line in fd:
    if line.strip() == "":
        testData.append(graph)
        graph = ""
        continue
        
    cols = re.split("\t",line)
    if haveMorphoFeatures:
        cols[5] = cols[5] + '|' + cols[9][:-1]
    else:
        cols[5] = '_'
    #print(cols)
    finalLine = ""
    for i in cols:
        finalLine += i + "\t"
    finalLine = finalLine[:-1]
    finalLine += "\n"
    graph = graph + finalLine
#print(trainData)

In [4]:
trainDataGraph = []
for t in trainData:
    d = DependencyGraph(t)
    trainDataGraph.append(d)
    
testDataGraph = []
for t in testData:
    d = DependencyGraph(t)
    testDataGraph.append(d)    

  "The graph doesn't contain a node "


In [5]:
input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)

In [6]:
parser_std = TransitionParserCustom('arc-standard')

In [7]:
parser_std._create_training_examples_arc_std(trainDataGraph, input_file)

 Number of training examples : 500
 Number of valid (projective) examples : 476


['SHIFT',
 'LEFTARC:det',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'SHIFT',
 'LEFTARC:advmod',
 'SHIFT',
 'LEFTARC:amod',
 'LEFTARC:nmod',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'LEFTARC:nmod',
 'SHIFT',
 'RIGHTARC:cop',
 'SHIFT',
 'RIGHTARC:punct',
 'RIGHTARC:root',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compound',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'LEFTARC:nsubj',
 'LEFTARC:obj',
 'SHIFT',
 'RIGHTARC:aux',
 'SHIFT',
 'RIGHTARC:punct',
 'RIGHTARC:root',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compound',
 'LEFTARC:nmod',
 'SHIFT',
 'SHIFT',
 'LEFTARC:nummod',
 'LEFTARC:nsubj',
 'SHIFT',
 'RIGHTARC:cop',
 'SHIFT',
 'RIGHTARC:punct',
 'RIGHTARC:root',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:nummod',
 'LEFTARC:nmod',
 'SHIFT',
 'RIGHTARC:cop',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compound',
 'LEFTARC:amod',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'SHIFT',
 'RIGHTARC:mark',
 'SHIFT',
 'RIGHTARC:mark',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compo

In [8]:
parser_std.train(trainDataGraph,'temp.arcstd.model', verbose=False)

 Number of training examples : 500
 Number of valid (projective) examples : 476


In [9]:
result = parser_std.parse(testDataGraph, 'temp.arcstd.model')

In [10]:
de = DependencyEvaluator(result, testDataGraph)

In [11]:
de.eval()

(0.8488284202569917, 0.7656840513983371)

In [12]:
parser_std = TransitionParserCustom('arc-eager')

In [13]:
parser_std._create_training_examples_arc_std(trainDataGraph, input_file)

 Number of training examples : 500
 Number of valid (projective) examples : 476


['SHIFT',
 'LEFTARC:det',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'SHIFT',
 'LEFTARC:advmod',
 'SHIFT',
 'LEFTARC:amod',
 'LEFTARC:nmod',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'LEFTARC:nmod',
 'SHIFT',
 'RIGHTARC:cop',
 'SHIFT',
 'RIGHTARC:punct',
 'RIGHTARC:root',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compound',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'LEFTARC:nsubj',
 'LEFTARC:obj',
 'SHIFT',
 'RIGHTARC:aux',
 'SHIFT',
 'RIGHTARC:punct',
 'RIGHTARC:root',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compound',
 'LEFTARC:nmod',
 'SHIFT',
 'SHIFT',
 'LEFTARC:nummod',
 'LEFTARC:nsubj',
 'SHIFT',
 'RIGHTARC:cop',
 'SHIFT',
 'RIGHTARC:punct',
 'RIGHTARC:root',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:nummod',
 'LEFTARC:nmod',
 'SHIFT',
 'RIGHTARC:cop',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compound',
 'LEFTARC:amod',
 'SHIFT',
 'RIGHTARC:case',
 'SHIFT',
 'SHIFT',
 'RIGHTARC:mark',
 'SHIFT',
 'RIGHTARC:mark',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'LEFTARC:compo

In [14]:
parser_std.train(trainDataGraph,'temp.arceager.model', verbose=False)

 Number of training examples : 500
 Number of valid (projective) examples : 476


In [15]:
os.remove(input_file.name)

In [16]:
result = parser_std.parse(testDataGraph, 'temp.arceager.model')

In [17]:
de = DependencyEvaluator(result, testDataGraph)

In [18]:
de.eval()

(0.8662131519274376, 0.7702191987906274)