In [1]:
# NAME ENTITY RECOGNITION (TASK-1)

In [2]:
!mkdir -p CONLL2003
!wget -nc -O CONLL2003/train.txt https://github.com/davidsbatista/NER-datasets/raw/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt
!wget -nc -O CONLL2003/valid.txt https://github.com/davidsbatista/NER-datasets/raw/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/valid.txt

--2023-11-10 09:40:44--  https://github.com/davidsbatista/NER-datasets/raw/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt [following]
--2023-11-10 09:40:45--  https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3283418 (3.1M) [text/plain]
Saving to: ‘CONLL2003/train.txt’


2023-11-10 09:40:46 (199 MB/s) - ‘CONLL2003/train.txt’ s

In [3]:
# this is how you read a file of this kind
# one item per line, empty lines between sequences

from collections import namedtuple

#Same as tuple but the fields are named for convenience
#this says we have four fields
OneWord=namedtuple("OneWord",["word","pos_label","chunk_label","entity_label"])

def read_conll2003(f_name):
    """Yield complete sentences"""
    current_sentence=[] #This will be a list of (word,label), which we accumulate for each sentence
    with open(f_name) as f:
        for line in f:
            line=line.strip() #drop whitespace
            if line.startswith("-DOCSTART-"): #let's not worry about these for the time being
                continue
            if not line: #sentence break
                if current_sentence: #if we gathered a sentence, we should yield it, because a new one starts
                    yield current_sentence #much like return, but continues past this line once the element has been consumed
                    current_sentence=[] #...and start a new one
                continue
            #if we made it here, we are on a normal line
            columns=line.split() #an actual word line
            assert len(columns)==4 #we should have four columns, looking at the data
            current_sentence.append(OneWord(*columns)) #* expands columns as arguments to OneWord constructor
        else: #for ... else -> the else part is executed once, when "for" runs out of elements
            if current_sentence: #yield also the last one!
                yield current_sentence

#Now just read the data in
sentences_train=list(read_conll2003("CONLL2003/train.txt"))
sentences_dev=list(read_conll2003("CONLL2003/valid.txt"))

print("First three sentences")
for sent in sentences_dev[:3]:
    print(sent)
    print()

First three sentences
[OneWord(word='CRICKET', pos_label='NNP', chunk_label='B-NP', entity_label='O'), OneWord(word='-', pos_label=':', chunk_label='O', entity_label='O'), OneWord(word='LEICESTERSHIRE', pos_label='NNP', chunk_label='B-NP', entity_label='B-ORG'), OneWord(word='TAKE', pos_label='NNP', chunk_label='I-NP', entity_label='O'), OneWord(word='OVER', pos_label='IN', chunk_label='B-PP', entity_label='O'), OneWord(word='AT', pos_label='NNP', chunk_label='B-NP', entity_label='O'), OneWord(word='TOP', pos_label='NNP', chunk_label='I-NP', entity_label='O'), OneWord(word='AFTER', pos_label='NNP', chunk_label='I-NP', entity_label='O'), OneWord(word='INNINGS', pos_label='NNP', chunk_label='I-NP', entity_label='O'), OneWord(word='VICTORY', pos_label='NN', chunk_label='I-NP', entity_label='O'), OneWord(word='.', pos_label='.', chunk_label='O', entity_label='O')]

[OneWord(word='LONDON', pos_label='NNP', chunk_label='B-NP', entity_label='B-LOC'), OneWord(word='1996-08-30', pos_label='CD',

In [4]:
def generate_sentence_features(sent):
    #Given a sentence as a list of (word, label) pairs
    #generate the features for every word
    #The result should be a list of same length as the sentence
    #Each item is a dictionary of {"feature name"->feature value} mappings, holding all features of the word at that position

    sent_features=[] #this will be the result
    for one_word in sent:
        #We must do nothing with label
        #it just happens to be around
        word_features={}
        word_features["word_"+one_word.word]=1 #the word itself is a feature
        sent_features.append(word_features)
    return sent_features

print(generate_sentence_features(sentences_dev[0])  )

[{'word_CRICKET': 1}, {'word_-': 1}, {'word_LEICESTERSHIRE': 1}, {'word_TAKE': 1}, {'word_OVER': 1}, {'word_AT': 1}, {'word_TOP': 1}, {'word_AFTER': 1}, {'word_INNINGS': 1}, {'word_VICTORY': 1}, {'word_.': 1}]


In [5]:
#...now we can generate the training examples
def prep_data(sentences):
    all_labels=[] #here we gather labels for all words in all sentences
    all_features=[] #here we gather features for all words in all sentences
    for sentence in sentences:
        sent_features=generate_sentence_features(sentence)
        assert len(sent_features)==len(sentence)
        #Now we can get, for every position its label and its features
        for one_word,features in zip(sentence,sent_features):
            all_labels.append(one_word.pos_label) #label
            all_features.append(features)         #and features to go with it
    return all_labels, all_features

train_labels,train_features=prep_data(sentences_train)
dev_labels,dev_features=prep_data(sentences_dev)

In [6]:
from sklearn.feature_extraction import DictVectorizer
vectorizer=DictVectorizer()
vectorizer.fit(train_features)
print("Vectorizer vocab size:",len(vectorizer.vocabulary_))

feature_vectors_train=vectorizer.transform(train_features)
feature_vectors_dev=vectorizer.transform(dev_features)

print("Train shape",feature_vectors_train.shape)
print("Dev shape",feature_vectors_dev.shape)

Vectorizer vocab size: 23623
Train shape (203621, 23623)
Dev shape (51362, 23623)


In [7]:
import sklearn.svm

classifier=sklearn.svm.LinearSVC(C=0.05,verbose=1)
classifier.fit(feature_vectors_train, train_labels)

[LibLinear]

In [8]:
classifier.score(feature_vectors_dev,dev_labels)

0.8655426190568903

In [9]:
def generate_sentence_features(sent):
    #Given a sentence as a list of (word, label) pairs
    #generate the features for every word
    #The result should be a list of same length as the sentence
    #Each item is a dictionary of {"feature name"->feature value} mappings, holding all features of the word at that position

    sent_features=[] #this will be the result
    for word_idx, one_word in enumerate(sent):
        #We do nothing with label
        #it just happens to be around
        word_features={}
        word_features["word_"+one_word.word]=1 #the word itself is a feature
        if word_idx!=0:
            word_features["left_word_"+sent[word_idx-1].word]=1
        if word_idx!=len(sent)-1:
            word_features["right_word_"+sent[word_idx+1].word]=1
        sent_features.append(word_features)
    return sent_features

train_labels,train_features=prep_data(sentences_train)
dev_labels,dev_features=prep_data(sentences_dev)
vectorizer=DictVectorizer()
vectorizer.fit(train_features)
feature_vectors_train=vectorizer.transform(train_features)
feature_vectors_dev=vectorizer.transform(dev_features)

print("Train shape",feature_vectors_train.shape)
print("Dev shape",feature_vectors_dev.shape)

classifier=sklearn.svm.LinearSVC(C=1,verbose=1)
classifier.fit(feature_vectors_train, train_labels)
classifier.score(feature_vectors_dev,dev_labels)

Train shape (203621, 68467)
Dev shape (51362, 68467)
[LibLinear]

0.9292862427475566

In [10]:
# Let us try to look at some predictions
sentence="I can house arrest you in my house .".split()

sentence_data=[OneWord(w,"XXX","XXX","XXX") for w in sentence] #we need to fake this a bit, to get data in the correct format
_,sentence_features=prep_data([sentence_data])
sentence_vectors=vectorizer.transform(sentence_features)
predictions=classifier.predict(sentence_vectors)
for word,label in zip(sentence,predictions):
    print(word,label)


I PRP
can MD
house VB
arrest NN
you PRP
in IN
my PRP$
house NN
. .


In [11]:
print("Learned coefficients:",classifier.coef_.shape)
print("Classes in the data:",classifier.classes_)


Learned coefficients: (45, 68467)
Classes in the data: ['"' '$' "''" '(' ')' ',' '.' ':' 'CC' 'CD' 'DT' 'EX' 'FW' 'IN' 'JJ' 'JJR'
 'JJS' 'LS' 'MD' 'NN' 'NNP' 'NNPS' 'NNS' 'NN|SYM' 'PDT' 'POS' 'PRP' 'PRP$'
 'RB' 'RBR' 'RBS' 'RP' 'SYM' 'TO' 'UH' 'VB' 'VBD' 'VBG' 'VBN' 'VBP' 'VBZ'
 'WDT' 'WP' 'WP$' 'WRB']


In [12]:
import numpy

#Reverse the dictionary
index2feature={}
for feature,idx in vectorizer.vocabulary_.items():
    assert idx not in index2feature #This really should hold
    index2feature[idx]=feature
#Now we can query index2feature to get the feature names as we need

i=list(classifier.classes_).index("NN") #which of the coefficients corresponds to nouns?
indices=numpy.argsort(classifier.coef_[i])
print("Negative features")
for idx in indices[:30]:
    print(index2feature[idx])
print("-------------------------------")
print("Positive features")
for idx in indices[::-1][:30]: #you can also do it the other way round, reverse, then pick
    print(index2feature[idx])

Negative features
left_word_will
left_word_Sale
word_,
left_word_going
left_word_could
left_word_would
left_word_goals
right_word_A-rated
left_word_We
word_and
left_word_At
word_in
left_word_still
right_word_announcement
left_word_mixer
left_word_can
left_word_I
left_word_should
left_word_kms
left_word_might
left_word_8:00
left_word_prices
left_word_Mike
right_word_SCOREBOARD
left_word_must
left_word_n't
left_word_overs
right_word_effect
left_word_Services
word_two
-------------------------------
Positive features
word_world
word_power
word_consumer
word_peace
word_number
word_hospital
word_vouch
word_cricket
word_procure
word_soccer
word_victory
word_championship
word_staff
word_motor
word_value
word_cabinet
word_lunch
word_rain
word_injury
word_league
word_anyone
word_UNION
word_weekend
word_edge
word_parliament
word_shutdown
word_division
word_cash
word_tournament
word_race


In [14]:
#NER using spacy

!pip install -U spacy

Collecting spacy
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
Collecting weasel<0.4.0,>=0.1.0 (from spacy)
  Downloading weasel-0.3.4-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting cloudpathlib<0.17.0,>=0.7.0 (from weasel<0.4.0,>=0.1.0->spacy)
  Downloading cloudpathlib-0.16.0-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cloudpathlib, weasel, spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.6.1
    Uninstalling spacy-3.6.1:
      Successfully uninstalled spacy-3.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages t

In [15]:

from __future__ import unicode_literals, print_function

import random
from pathlib import Path
import spacy
from tqdm import tqdm

In [16]:
import spacy
from spacy import displacy

NER = spacy.load("en_core_web_sm")



In [17]:
raw_text="The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."

In [18]:
text1= NER(raw_text)


In [19]:
for word in text1.ents:
    print(word.text,word.label_)

The Indian Space Research Organisation ORG
India GPE
Bengaluru GPE
Department of Space ORG
India GPE
ISRO ORG
DOS ORG


In [20]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [21]:
spacy.explain("GPE")

'Countries, cities, states'

In [22]:
displacy.render(text1,style="ent",jupyter=True)

In [23]:
raw_text2="India is famous for the Taj Mahal.India really is multilingual.India invented yoga.India is the birthplace of Ayurveda."

In [24]:
text2= NER(raw_text2)

In [None]:
for word in text2.ents:
    print(word.text,word.label_)

In [26]:
displacy.render(text2,style="ent",jupyter=True)

In [None]:
# PARTS OF SPEECH TAGGING (TASK--2)



#This cell loads the Penn Treebank corpus from nltk into a list variable named penn_treebank.

#No need to install nltk in google colab since it is preloaded in the environments.
#!pip install nltk
import nltk

#Ensure that the treebank corpus is downloaded
nltk.download('treebank')

#Load the treebank corpus class
from nltk.corpus import treebank

#Now we iterate over all samples from the corpus (the fileids - that are equivalent to sentences)
#and retrieve the word and the pre-labeled PoS tag. This will be added as a list of tuples with
#a list of words and a list of their respective PoS tags (in the same order).
penn_treebank = []
for fileid in treebank.fileids():
  tokens = []
  tags = []
  for word, tag in treebank.tagged_words(fileid):
    tokens.append(word)
    tags.append(tag)
  penn_treebank.append((tokens, tags))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [None]:
#This cell loads the Universal Dependecies Treekbank corpus. It'll download all the packages, but we'll only use the GUM
#english package. We'll also install the conllu package, that was developed to parse data in the conLLu format, a
#format common of linguistic annotated files. We'll also have a list variable, but now named ud_treebank.

#Install conllu package, download the UD Treebanks corpus and unpack it.
!pip install conllu
!wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz
!tar zxf ud-treebanks-v2.5.tgz

#The imports needed to open and parse (interpret) the conllu file. At the end we'll have a list of dicts.
from io import open
from conllu import parse_incr

#Open the file and load the sentences to a list.
data_file = open("ud-treebanks-v2.5/UD_Telugu-MTG/te_mtg-ud-train.conllu", "r", encoding="utf-8")
ud_files = []
for tokenlist in parse_incr(data_file):
    ud_files.append(tokenlist)

#Now we iterate over all samples from the corpus and retrieve the word and the pre-labeled PoS tag (upostag). This will
#be added as a list of tuples with a list of words and a list of their respective PoS tags (in the same order).
ud_treebank = []
for sentence in ud_files:
  tokens = []
  tags = []
  for token in sentence:
    tokens.append(token['form'])
    tags.append(token['upostag'])
  ud_treebank.append((tokens, tags))

--2023-11-10 08:46:41--  https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz
Resolving lindat.mff.cuni.cz (lindat.mff.cuni.cz)... 195.113.20.140
Connecting to lindat.mff.cuni.cz (lindat.mff.cuni.cz)|195.113.20.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 355216681 (339M) [application/x-gzip]
Saving to: ‘ud-treebanks-v2.5.tgz.1’


2023-11-10 08:47:09 (12.5 MB/s) - ‘ud-treebanks-v2.5.tgz.1’ saved [355216681/355216681]



In [None]:
#Regex module for checking alphanumeric values.
import re
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
  }

In [None]:
#Ater defining the extract_features, we define a simple function to transform our data in a more 'datasetish' format.
#This function returns the data as two lists, one of Dicts of features and the other with the labels.
def transform_to_dataset(tagged_sentences):
  X, y = [], []
  for sentence, tags in tagged_sentences:
    sent_word_features, sent_tags = [],[]
    for index in range(len(sentence)):
        sent_word_features.append(extract_features(sentence, index)),
        sent_tags.append(tags[index])
    X.append(sent_word_features)
    y.append(sent_tags)
  return X, y

#We divide the set BEFORE encoding. Why? To have full sentences in training/testing sets. When we encode, we do not encode
#a sentence, but its words instead.

#First, for the Penn treebank.
penn_train_size = int(0.8*len(penn_treebank))
penn_training = penn_treebank[:penn_train_size]
penn_testing = penn_treebank[penn_train_size:]
X_penn_train, y_penn_train = transform_to_dataset(penn_training)
X_penn_test, y_penn_test = transform_to_dataset(penn_testing)

#Then, for UD Treebank.
ud_train_size = int(0.8*len(ud_treebank))
ud_training = ud_treebank[:ud_train_size]
ud_testing = ud_treebank[ud_train_size:]
X_ud_train, y_ud_train = transform_to_dataset(ud_training)
X_ud_test, y_ud_test = transform_to_dataset(ud_testing)

#Third step, vectorize datasets. For that we use sklearn DictVectorizer
#WARNING

In [None]:
#Ignoring some warnings for the sake of readability.
import warnings
warnings.filterwarnings('ignore')

#First, install sklearn_crfsuite, as it is not preloaded into Colab.
!pip install sklearn_crfsuite
from sklearn_crfsuite import CRF

#This loads the model. Specifics are:
#algorithm: methodology used to check if results are improving. Default is lbfgs (gradient descent).
#c1 and c2:  coefficients used for regularization.
#max_iterations: max number of iterations (DUH!)
#all_possible_transitions: since crf creates a "network", of probability transition states,
#this option allows it to map even "connections" not present in the data.
penn_crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
#The fit method is the default name used by Machine Learning algorithms to start training.
print("Started training on Penn Treebank corpus!")
penn_crf.fit(X_penn_train, y_penn_train)
print("Finished training on Penn Treebank corpus!")

#Same for UD
ud_crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
print("Started training on UD corpus!")
ud_crf.fit(X_ud_train, y_ud_train)
print("Finished training on UD corpus!")

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn_crfsuite-0.3.6
Started training on Penn Treebank corpus!
Finished training on Penn Treebank corpus!
Started training on UD corpus!
Finished training on UD corpus!


In [None]:
#We'll use the sklearn_crfsuit own metrics to compute f1 score.
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

print("## Penn ##")

#First calculate a prediction from test data, then we print the metrics for f-1 using the .flat_f1_score method.
y_penn_pred=penn_crf.predict(X_penn_test)
print("F1 score on Test Data")
print(metrics.flat_f1_score(y_penn_test, y_penn_pred,average='weighted',labels=penn_crf.classes_))
#For the sake of clarification, we do the same for train data.
y_penn_pred_train=penn_crf.predict(X_penn_train)
print("F1 score on Training Data ")
print(metrics.flat_f1_score(y_penn_train, y_penn_pred_train,average='weighted',labels=penn_crf.classes_))



# This presents class wise score. Helps see which classes (tags) are the ones with most problems.
print("Class wise score:")





## Penn ##
F1 score on Test Data
0.9668646324625245
F1 score on Training Data 
0.9936643188628935
Class wise score:


In [None]:
print(metrics.flat_classification_report(y_penn_test, y_penn_pred, labels=penn_crf.classes_,digits=3))



#Same for UD
print("## UD ##")

y_ud_pred=ud_crf.predict(X_ud_test)
print("F1 score on Test Data ")
print(metrics.flat_f1_score(y_ud_test, y_ud_pred,average='weighted',labels=ud_crf.classes_))
y_ud_pred_train=ud_crf.predict(X_ud_train)
print("F1 score on Training Data ")
print(metrics.flat_f1_score(y_ud_train, y_ud_pred_train,average='weighted',labels=ud_crf.classes_))

### Look at class wise score
print("Class wise score:")
print(metrics.flat_classification_report(
    y_ud_test, y_ud_pred, labels=ud_crf.classes_, digits=3
))


In [None]:
#First, we pass the sentence and "quickly tokenize it" - we've already done it in our code, so I'll just mock here with a split:
sent = "తెలుగు అనేది ద్రావిడ భాషల కుటుంబానికి చెందిన భాష. దీనిని మాట్లాడే ప్రజలు ప్రధానంగా ఆంధ్ర, తెలంగాణాలో ఉన్నారు. ఇది ఆ రాష్ట్రాలలో అధికార భాష"
features = [extract_features(sent.split(), idx) for idx in range(len(sent.split()))]

#Then we tell the algorithm to make a prediction on a single input (sentence). I'll do once for Penn Treebank and once for UD.
penn_results = penn_crf.predict_single(features)
ud_results = ud_crf.predict_single(features)

#These line magics are just there to make it a neaty print, making a (word, POS) style print;
penn_tups = [(sent.split()[idx], penn_results[idx]) for idx in range(len(sent.split()))]
ud_tups = [(sent.split()[idx], ud_results[idx]) for idx in range(len(sent.split()))]

#The results come out here! Notice the difference in tags.
print(penn_tups)
print(ud_tups)

[('తెలుగు', 'CD'), ('అనేది', 'CD'), ('ద్రావిడ', 'CD'), ('భాషల', 'CD'), ('కుటుంబానికి', 'CD'), ('చెందిన', 'CD'), ('భాష.', 'CD'), ('దీనిని', 'CD'), ('మాట్లాడే', 'CD'), ('ప్రజలు', 'CD'), ('ప్రధానంగా', 'CD'), ('ఆంధ్ర,', 'CD'), ('తెలంగాణాలో', 'CD'), ('ఉన్నారు.', 'CD'), ('ఇది', 'CD'), ('ఆ', 'CD'), ('రాష్ట్రాలలో', 'CD'), ('అధికార', 'CD'), ('భాష', 'CD')]
[('తెలుగు', 'PROPN'), ('అనేది', 'VERB'), ('ద్రావిడ', 'ADP'), ('భాషల', 'NOUN'), ('కుటుంబానికి', 'NOUN'), ('చెందిన', 'VERB'), ('భాష.', 'NOUN'), ('దీనిని', 'NOUN'), ('మాట్లాడే', 'VERB'), ('ప్రజలు', 'NOUN'), ('ప్రధానంగా', 'ADV'), ('ఆంధ్ర,', 'NOUN'), ('తెలంగాణాలో', 'NOUN'), ('ఉన్నారు.', 'VERB'), ('ఇది', 'PRON'), ('ఆ', 'DET'), ('రాష్ట్రాలలో', 'NOUN'), ('అధికార', 'VERB'), ('భాష', 'PUNCT')]


In [None]:
#import the pickle module
import pickle

#Simply dump! Use 'wb' in open to write bytes.

penn_filename = 'penn_treebank_crf_postagger.sav'
pickle.dump(penn_crf, open(penn_filename, 'wb'))

ud_filename = 'ud_crf_postagger.sav'
pickle.dump(ud_crf, open(ud_filename,'wb'))