In [1]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics




In [2]:
# Load the Penn Treebank corpus
nltk.download('treebank')
corpus = nltk.corpus.treebank.tagged_sents()
print(corpus)

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


[nltk_data] Downloading package treebank to
[nltk_data]     /Users/anishka/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [3]:
from datasets import load_dataset

corpus = load_dataset("universal_dependencies", "te_mtg")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset universal_dependencies (/Users/anishka/.cache/huggingface/datasets/universal_dependencies/te_mtg/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7)
100%|██████████| 3/3 [00:00<00:00, 199.55it/s]


In [6]:
from conllu import parse_incr

In [17]:
data_file = open("ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu", "r", encoding="utf-8")
    
ud_files = []
for tokenlist in parse_incr(data_file):
    ud_files.append(tokenlist)

ud_treebank=[]
for sentence in ud_files:
    tokens = []
    tags = []
    for token in sentence:
        tokens.append(token["misc"]["Translit"])
        tags.append(token["upostag"])
    ud_treebank.append([tokens, tags])

print (ud_treebank)

[[['cūserā', 'aṁḍī', '?'], ['VERB', 'PART', 'PUNCT']], [['èkkaḍiki', 'vèḷtunnāraṁḍī', '?'], ['NOUN', 'VERB', 'PUNCT']], [['èppuḍoy', 'amèrikā', 'niṁci', 'rāvaṭaṁ', '?'], ['PRON', 'PROPN', 'ADP', 'VERB', 'PUNCT']], [['èṁdukayyā', 'ī', 'bādha', '?'], ['PRON', 'DET', 'NOUN', 'PUNCT']], [['iṁṭiki', 'porā', '!'], ['NOUN', 'VERB', 'PUNCT']], [['poyi', 'nī', 'tātato', 'cèpparā', '!'], ['VERB', 'PRON', 'NOUN', 'VERB', 'PUNCT']], [['baḷḷu', 'siddhaṁ', 'ceyaṁḍirā', '!'], ['NOUN', 'VERB', 'VERB', 'PUNCT']], [['poyi', 'mī', 'panulu', 'cesukoṁḍirā', '!'], ['VERB', 'PRON', 'NOUN', 'VERB', 'PUNCT']], [['abaddaṁ', 'cèbutānuṭarā', '?'], ['NOUN', 'VERB', 'PUNCT']], [['ī', 'rèṁḍu', 'akṣarālu', 'diddukove', '!'], ['DET', 'NUM', 'NOUN', 'VERB', 'PUNCT']], [['vāṇṇi', 'kūḍā', 'pilavave', '!'], ['PRON', 'ADP', 'VERB', 'PUNCT']], [['èkkaḍa', 'unnāvammā', 'ippaṭidākā', '?'], ['ADV', 'VERB', 'ADV', 'PUNCT']], [['rāmu', 'èlluṁḍi', 'madrāsu', 'vèḷtāḍu', '.'], ['PROPN', 'ADV', 'PROPN', 'VERB', 'PUNCT']], [['rāmu', 

In [28]:
# Define a function to extract features for each word in a sentence
def word_features(sentence, i):
	word = sentence[i]
	features = {
		'word': word,
		'is_first': i == 0, #if the word is a first word
		'is_last': i == len(sentence) - 1, #if the word is a last word
		'is_capitalized': word[0].upper() == word[0],
		'is_all_caps': word.upper() == word,	 #word is in uppercase
		'is_all_lower': word.lower() == word,	 #word is in lowercase
		#prefix of the word
		'prefix-1': word[0], 
		'prefix-2': word[:2],
		'prefix-3': word[:3],
		#suffix of the word
		'suffix-1': word[-1],
		'suffix-2': word[-2:],
		'suffix-3': word[-3:],
		#extracting previous word
		'prev_word': '' if i == 0 else sentence[i-1],
		#extracting next word
		'next_word': '' if i == len(sentence)-1 else sentence[i+1],
		'has_hyphen': '-' in word, #if word has hypen
		'is_numeric': word.isdigit(), #if word is in numeric
		'capitals_inside': word[1:].lower() != word[1:]
	}
	return features


In [29]:
# Extract features for each sentence in the corpus
def transform_to_dataset(treebank):
	X = []
	y = []
	for sentence, tags in treebank:
		X_sentence = []
		y_sentence = []
		for i in range(len(sentence)):
			X_sentence.append(word_features(sentence, i))
			y_sentence.append(tags[i])
		X.append(X_sentence)
		y.append(y_sentence)
	return X, y

# Split the data into training and testing sets
split = int(0.8 * len(ud_treebank))
ud_training = ud_treebank[:split]
ud_testing = ud_treebank[:split]
X_ud_train, y_ud_train = transform_to_dataset(ud_training)
X_ud_test, y_ud_test = transform_to_dataset(ud_testing)


print (X_ud_train)


[[{'word': 'cūserā', 'is_first': True, 'is_last': False, 'is_capitalized': False, 'is_all_caps': False, 'is_all_lower': True, 'prefix-1': 'c', 'prefix-2': 'cū', 'prefix-3': 'cūs', 'suffix-1': 'ā', 'suffix-2': 'rā', 'suffix-3': 'erā', 'prev_word': '', 'next_word': 'aṁḍī', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': 'aṁḍī', 'is_first': False, 'is_last': False, 'is_capitalized': False, 'is_all_caps': False, 'is_all_lower': True, 'prefix-1': 'a', 'prefix-2': 'aṁ', 'prefix-3': 'aṁḍ', 'suffix-1': 'ī', 'suffix-2': 'ḍī', 'suffix-3': 'ṁḍī', 'prev_word': 'cūserā', 'next_word': '?', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}, {'word': '?', 'is_first': False, 'is_last': True, 'is_capitalized': True, 'is_all_caps': True, 'is_all_lower': True, 'prefix-1': '?', 'prefix-2': '?', 'prefix-3': '?', 'suffix-1': '?', 'suffix-2': '?', 'suffix-3': '?', 'prev_word': 'aṁḍī', 'next_word': '', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': 

In [30]:
# Train a CRF model on the training data
crf = sklearn_crfsuite.CRF(
	algorithm='lbfgs',
	c1=0.1,
	c2=0.1,
	max_iterations=100,
	all_possible_transitions=True
)
crf.fit(X_ud_train, y_ud_train)

# Make predictions on the test data and evaluate the performance
y_pred = crf.predict(X_ud_test)

print(metrics.flat_accuracy_score(y_ud_test, y_pred))


0.9975621647976597


In [37]:
import pycrfsuite

# Train a CRF model suing pysrfsuite
trainer = pycrfsuite.Trainer(verbose=False)
for x, y in zip(X_ud_train, y_ud_train):
	trainer.append(x, y)
trainer.set_params({
	'c1': 1.0,
	'c2': 1e-3,
	'max_iterations': 50,
	'feature.possible_transitions': True
})
trainer.train('pos.crfsuite')

# Tag a new sentence
tagger = pycrfsuite.Tagger()
tagger.open('pos.crfsuite')
sentence = 'maḷḷī ninnu cūḍagalano.'.split()
sentence1 = 'cūserā èkkaḍiki vèḷtunnāraṁḍī amerika'.split()
features = [word_features(sentence, i) for i in range(len(sentence))]
tags = tagger.tag(features)
print(list(zip(sentence, tags)))

features = [word_features(sentence1, i) for i in range(len(sentence1))]
tags = tagger.tag(features)
print(list(zip(sentence1, tags)))

[('maḷḷī', 'PRON'), ('ninnu', 'PRON'), ('cūḍagalano.', 'VERB')]
[('cūserā', 'VERB'), ('èkkaḍiki', 'NOUN'), ('vèḷtunnāraṁḍī', 'VERB'), ('amerika', 'NOUN')]
