In [1]:
import nltk
from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
from nltk.parse import DependencyGraph
from nltk.parse import DependencyEvaluator

In [2]:
BASE_TRAIN_FILE = "UD_Hindi/hi-ud-train"
BASE_TEST_FILE = "UD_Hindi/hi-ud-test"
EXTENTION = ".conllu"
EXTRA_SUFFIX = "-extra"
REMOVE_SUFFIX = "-rem"

In [3]:
#Create data for origignal data + column 10 features
#Train Data
with open(BASE_TRAIN_FILE+EXTENTION,'r') as f, open(BASE_TRAIN_FILE+EXTRA_SUFFIX+EXTENTION,'w') as f2 :
    for line in f.read().split("\n\n"):
        for element in line.split("\n"):
            feats = element.split("\t")
            if len(feats) > 9:
                if feats[5] != "_":
                    feats[5] += "|" + feats[9]
                else:
                    feats[5] = feats[9]
            f2.write('\t'.join(feats))
            f2.write("\n")
            
        f2.write("\n")

In [4]:
#Create data for origignal data + column 10 features
#Test Data
with open(BASE_TEST_FILE+EXTENTION,'r') as f, open(BASE_TEST_FILE+EXTRA_SUFFIX+EXTENTION,'w') as f2 :
    for line in f.read().split("\n\n"):
        for element in line.split("\n"):
            feats = element.split("\t")
            if len(feats) > 9:
                if feats[5] != "_":
                    feats[5] += "|" + feats[9]
                else:
                    feats[5] = feats[9]
            f2.write('\t'.join(feats))
            f2.write("\n")
            
        f2.write("\n")

In [5]:
#Create data without morphological features
#Train Data
with open(BASE_TRAIN_FILE+EXTENTION,'r') as f, open(BASE_TRAIN_FILE+REMOVE_SUFFIX+EXTENTION,'w') as f2 :
    for line in f.read().split("\n\n"):
        for element in line.split("\n"):
            feats = element.split("\t")
            if len(feats) > 5:
                feats[5] = "_"
            f2.write('\t'.join(feats))
            f2.write("\n")
            
        f2.write("\n")

In [6]:
#Create data without morphological features
#Test Data
with open(BASE_TEST_FILE+EXTENTION,'r') as f, open(BASE_TEST_FILE+REMOVE_SUFFIX+EXTENTION,'w') as f2 :
    for line in f.read().split("\n\n"):
        for element in line.split("\n"):
            feats = element.split("\t")
            if len(feats) > 5:
                feats[5] = "_"
            f2.write('\t'.join(feats))
            f2.write("\n")
            
        f2.write("\n")

In [7]:
from sklearn.datasets import load_svmlight_file
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import tempfile
from os import remove
import pickle

class CustomTransitionParser(TransitionParser):
    def __init__(self, algorithm, training_classifier="svm"):
        TransitionParser.__init__(self, algorithm)
        self.training_classifier = training_classifier
    
    def train(self, depgraphs, modelfile, verbose=True):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """

        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='transition_parse.train',
                dir=tempfile.gettempdir(),
                delete=False)

            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            if self.training_classifier == "svm" :
                # The parameter is set according to the paper:
                # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
                # Todo : because of probability = True => very slow due to
                # cross-validation. Need to improve the speed here
                model = svm.SVC(
                    kernel='poly',
                    degree=2,
                    coef0=0,
                    gamma=0.2,
                    C=0.5,
                    verbose=verbose,
                    probability=True)
            elif self.training_classifier == "log":
                model = LogisticRegression(penalty='l2',
                    dual=False, tol=0.0001, C=1.0,
                    fit_intercept=True, intercept_scaling=1,
                    class_weight=None, random_state=42, solver='liblinear',
                    max_iter=100, multi_class='ovr', verbose=0, warm_start=False,
                    n_jobs=1)
            elif self.training_classifier == "mlp":
                model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(100,50,25,), random_state=1)

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))

        finally:
            remove(input_file.name)

# Training With original data (without column 10 features)

In [8]:
train_data = DependencyGraph.load(BASE_TRAIN_FILE+EXTENTION)
test_data = DependencyGraph.load(BASE_TEST_FILE+EXTENTION)

  "The graph doesn't contain a node "


## Arc-Standard

In [9]:
parser_std_svm = CustomTransitionParser('arc-standard',"svm")
parser_std_svm.train(train_data,'temp.arcstd.model',verbose=True)
result_std_svm = parser_std_svm.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_svm, test_data)
arc_std_svm_1 = de.eval()

 Number of training examples : 501
 Number of valid (projective) examples : 477
[LibSVM]

In [10]:
print("DATASET - Original (Without column 10)")
print("ARC-STANDARD, SVM : ",arc_std_svm_1)

DATASET - Original (Without column 10)
ARC-STANDARD, SVM :  (0.8624338624338624, 0.7709750566893424)


In [11]:
parser_std_log = CustomTransitionParser('arc-standard',"log")
parser_std_log.train(train_data,'temp.arcstd.model',verbose=True)
result_std_log = parser_std_log.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_log, test_data)
arc_std_log_1 = de.eval()

 Number of training examples : 501
 Number of valid (projective) examples : 477


In [12]:
print("DATASET - Original (Without column 10)")
print("ARC-STANDARD, LOGISTIC REGRESSION : ",arc_std_log_1)

DATASET - Original (Without column 10)
ARC-STANDARD, LOGISTIC REGRESSION :  (0.8034769463340892, 0.6893424036281179)


In [13]:
parser_std_mlp = CustomTransitionParser('arc-standard',"mlp")
parser_std_mlp.train(train_data,'temp.arcstd.model',verbose=True)
result_std_mlp = parser_std_mlp.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_mlp, test_data)
arc_std_mlp_1 = de.eval()

 Number of training examples : 501
 Number of valid (projective) examples : 477


In [14]:
print("DATASET - Original (Without column 10)")
print("ARC-STANDARD, MLP : ",arc_std_mlp_1)

DATASET - Original (Without column 10)
ARC-STANDARD, MLP :  (0.8261526832955405, 0.7150415721844293)


## Arc-Eager

In [15]:
parser_eag_svm = CustomTransitionParser('arc-eager',"svm")
parser_eag_svm.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_svm = parser_eag_svm.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_svm, test_data)
arc_eag_svm_1 = de.eval()

 Number of training examples : 501
 Number of valid (projective) examples : 477
[LibSVM]

In [16]:
print("DATASET - Original (Without column 10)")
print("ARC-EAGER, SVM : ",arc_eag_svm_1)

DATASET - Original (Without column 10)
ARC-EAGER, SVM :  (0.8828420256991686, 0.7928949357520786)


In [17]:
parser_eag_log = CustomTransitionParser('arc-standard',"log")
parser_eag_log.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_log = parser_std_log.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_log, test_data)
arc_eag_log_1 = de.eval()

 Number of training examples : 501
 Number of valid (projective) examples : 477


In [18]:
print("DATASET - Original (Without column 10)")
print("ARC-EAGER, LOGISTIC REGRESSION : ",arc_eag_log_1)

DATASET - Original (Without column 10)
ARC-EAGER, LOGISTIC REGRESSION :  (0.8034769463340892, 0.6893424036281179)


In [19]:
parser_eag_mlp = CustomTransitionParser('arc-standard',"mlp")
parser_eag_mlp.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_mlp = parser_std_mlp.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_mlp, test_data)
arc_eag_mlp_1 = de.eval()

 Number of training examples : 501
 Number of valid (projective) examples : 477


In [20]:
print("DATASET - Original (Without column 10)")
print("ARC-EAGER, MLP : ",arc_eag_mlp_1)

DATASET - Original (Without column 10)
ARC-EAGER, MLP :  (0.8261526832955405, 0.7150415721844293)


# Training With original data + column 10 features

In [21]:
train_data = DependencyGraph.load(BASE_TRAIN_FILE+EXTRA_SUFFIX+EXTENTION)
test_data = DependencyGraph.load(BASE_TEST_FILE+EXTRA_SUFFIX+EXTENTION)

  "The graph doesn't contain a node "


## Arc-Standard

In [22]:
parser_std_svm = CustomTransitionParser('arc-standard',"svm")
parser_std_svm.train(train_data,'temp.arcstd.model',verbose=True)
result_std_svm = parser_std_svm.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_svm, test_data)
arc_std_svm_2 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478
[LibSVM]

In [23]:
print("DATASET - Original + column 10")
print("ARC-STANDARD, SVM : ",arc_std_svm_2)

DATASET - Original + column 10
ARC-STANDARD, SVM :  (0.9168556311413454, 0.8329554043839759)


In [24]:
parser_std_log = CustomTransitionParser('arc-standard',"log")
parser_std_log.train(train_data,'temp.arcstd.model',verbose=True)
result_std_log = parser_std_log.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_log, test_data)
arc_std_log_2 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [25]:
print("DATASET - Original + column 10")
print("ARC-STANDARD, LOGISTIC REGRESSION : ",arc_std_log_2)

DATASET - Original + column 10
ARC-STANDARD, LOGISTIC REGRESSION :  (0.873015873015873, 0.7755102040816326)


In [26]:
parser_std_mlp = CustomTransitionParser('arc-standard',"mlp")
parser_std_mlp.train(train_data,'temp.arcstd.model',verbose=True)
result_std_mlp = parser_std_mlp.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_mlp, test_data)
arc_std_mlp_2 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [27]:
print("DATASET - Original + column 10")
print("ARC-STANDARD, MLP : ",arc_std_mlp_2)

DATASET - Original + column 10
ARC-STANDARD, MLP :  (0.8820861678004536, 0.780045351473923)


## Arc-Eager

In [28]:
parser_eag_svm = CustomTransitionParser('arc-eager',"svm")
parser_eag_svm.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_svm = parser_eag_svm.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_svm, test_data)
arc_eag_svm_2 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478
[LibSVM]

In [29]:
print("DATASET - Original + column 10")
print("ARC-EAGER, SVM : ",arc_eag_svm_2)

DATASET - Original + column 10
ARC-EAGER, SVM :  (0.9115646258503401, 0.8261526832955405)


In [30]:
parser_eag_log = CustomTransitionParser('arc-standard',"log")
parser_eag_log.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_log = parser_std_log.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_log, test_data)
arc_eag_log_2 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [31]:
print("DATASET - Original + column 10")
print("ARC-EAGER, LOGISTIC REGRESSION : ",arc_eag_log_2)

DATASET - Original + column 10
ARC-EAGER, LOGISTIC REGRESSION :  (0.873015873015873, 0.7755102040816326)


In [32]:
parser_eag_mlp = CustomTransitionParser('arc-standard',"mlp")
parser_eag_mlp.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_mlp = parser_std_mlp.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_mlp, test_data)
arc_eag_mlp_2 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [33]:
print("DATASET - Original + column 10")
print("ARC-EAGER, MLP : ",arc_eag_mlp_2)

DATASET - Original + column 10
ARC-EAGER, MLP :  (0.8820861678004536, 0.780045351473923)


# Training Without MORPHOLOGICAL features

In [35]:
train_data = DependencyGraph.load(BASE_TRAIN_FILE+REMOVE_SUFFIX+EXTENTION)
test_data = DependencyGraph.load(BASE_TEST_FILE+REMOVE_SUFFIX+EXTENTION)

  "The graph doesn't contain a node "


## Arc-Standard

In [36]:
parser_std_svm = CustomTransitionParser('arc-standard',"svm")
parser_std_svm.train(train_data,'temp.arcstd.model',verbose=True)
result_std_svm = parser_std_svm.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_svm, test_data)
arc_std_svm_3 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478
[LibSVM]

In [37]:
print("DATASET - WITHOUT Morphological Features")
print("ARC-STANDARD, SVM : ",arc_std_svm_3)

DATASET - WITHOUT Morphological Features
ARC-STANDARD, SVM :  (0.8495842781557067, 0.7664399092970522)


In [38]:
parser_std_log = CustomTransitionParser('arc-standard',"log")
parser_std_log.train(train_data,'temp.arcstd.model',verbose=True)
result_std_log = parser_std_log.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_log, test_data)
arc_std_log_3 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [39]:
print("DATASET - WITHOUT Morphological Features")
print("ARC-STANDARD, LOGISTIC REGRESSION : ",arc_std_log_3)

DATASET - WITHOUT Morphological Features
ARC-STANDARD, LOGISTIC REGRESSION :  (0.800453514739229, 0.6923658352229781)


In [40]:
parser_std_mlp = CustomTransitionParser('arc-standard',"mlp")
parser_std_mlp.train(train_data,'temp.arcstd.model',verbose=True)
result_std_mlp = parser_std_mlp.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_std_mlp, test_data)
arc_std_mlp_3 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [41]:
print("DATASET - WITHOUT Morphological Features")
print("ARC-STANDARD, MLP : ",arc_std_mlp_3)

DATASET - WITHOUT Morphological Features
ARC-STANDARD, MLP :  (0.8065003779289494, 0.6969009826152683)


## Arc-Eager

In [42]:
parser_eag_svm = CustomTransitionParser('arc-eager',"svm")
parser_eag_svm.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_svm = parser_eag_svm.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_svm, test_data)
arc_eag_svm_3 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478
[LibSVM]

In [43]:
print("DATASET - WITHOUT Morphological Features")
print("ARC-EAGER, SVM : ",arc_eag_svm_3)

DATASET - WITHOUT Morphological Features
ARC-EAGER, SVM :  (0.8692365835222978, 0.7724867724867724)


In [44]:
parser_eag_log = CustomTransitionParser('arc-standard',"log")
parser_eag_log.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_log = parser_std_log.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_log, test_data)
arc_eag_log_3 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [45]:
print("DATASET - WITHOUT Morphological Features")
print("ARC-EAGER, LOGISTIC REGRESSION : ",arc_eag_log_3)

DATASET - WITHOUT Morphological Features
ARC-EAGER, LOGISTIC REGRESSION :  (0.800453514739229, 0.6923658352229781)


In [46]:
parser_eag_mlp = CustomTransitionParser('arc-standard',"mlp")
parser_eag_mlp.train(train_data,'temp.arcstd.model',verbose=True)
result_eag_mlp = parser_std_mlp.parse(test_data, 'temp.arcstd.model')
de = DependencyEvaluator(result_eag_mlp, test_data)
arc_eag_mlp_3 = de.eval()

 Number of training examples : 502
 Number of valid (projective) examples : 478


In [47]:
print("DATASET - WITHOUT Morphological Features")
print("ARC-EAGER, MLP : ",arc_eag_mlp_3)

DATASET - WITHOUT Morphological Features
ARC-EAGER, MLP :  (0.8065003779289494, 0.6969009826152683)
