<a href="https://colab.research.google.com/github/a-Imantha/simple-nlp/blob/main/Pos_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# POS Tagging



## Download Dataset

In [None]:
!pip install gdown



In [None]:
import gdown
gdown.download('https://drive.google.com/uc?id=1ALo3_vX1ci2lZpq_bDVHvK6MiAHphjVT','zip_files.zip', quiet=False)
!unzip '/content/zip_files.zip'

Downloading...
From: https://drive.google.com/uc?id=DATASET_LOCATION_GDRIVE
To: /content/zip_files.zip
100%|██████████| 4.49k/4.49k [00:00<00:00, 4.55MB/s]

Archive:  /content/zip_files.zip
  inflating: FILE_NAME.txt   





## Import and Download Required Packages

In [None]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
# Resources needed below: sentence/word tokenizer models, the universal tagset
# mapping, and the Brown corpus used as training data.
# 'punkt_tab' is what sent_tokenize/word_tokenize load on NLTK >= 3.9; 'punkt'
# is kept so the notebook also runs on older NLTK releases.
nltk.download(['punkt','punkt_tab','averaged_perceptron_tagger','universal_tagset','brown'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

## Processing the Evaluation Data

In [None]:
# Read the evaluation document; the context manager guarantees the file handle
# is closed, and the encoding is stated explicitly instead of relying on the locale.
with open("/content/feedback_cs2012_1.txt", "r", encoding="utf-8") as feedback_file:
  raw_text = feedback_file.read()

# `text` is a list of sentences, each sentence being a list of word tokens —
# the input format expected by the taggers' tag_sents() calls below.
text = [word_tokenize(sentence) for sentence in sent_tokenize(raw_text)]

## Preparing the Training Data Corpus

In [None]:
from nltk.corpus import brown
# Training data: Brown corpus sentences from the selected categories, with tags
# mapped to the universal tagset.
TRAINING_CATEGORIES = ['news', 'reviews']
brown_tagged_sents = brown.tagged_sents(
    categories=TRAINING_CATEGORIES,
    tagset='universal',
)

## POS Tagging

### POS TAGGER 1: Unigram Tagger

In [None]:
# Train a unigram tagger on the Brown sentences, then tag every evaluation sentence.
unigram_tagger = nltk.UnigramTagger(train=brown_tagged_sents)
unigram_tagger_out = unigram_tagger.tag_sents(text)

### POS TAGGER 2: Bigram Tagger

In [None]:
# Train a bigram tagger with no backoff and tag the evaluation sentences.
# Without a backoff tagger, tokens whose context was not seen in training are tagged None.
bigram_tagger = nltk.BigramTagger(train=brown_tagged_sents)
bigram_tagger_out = bigram_tagger.tag_sents(text)

### POS TAGGER 3: Bigram Tagger with Unigram Tagger as Backoff

In [None]:
# Bigram tagger that defers to the unigram tagger trained above whenever it
# cannot assign a tag itself.
bigram_tagger_comp = nltk.BigramTagger(
    train=brown_tagged_sents,
    backoff=unigram_tagger,
)
bigram_tagger_comp_out = bigram_tagger_comp.tag_sents(text)

### POS TAGGER 4: Regexp Tagger

In [None]:
# Ordered (regex, tag) rules for the regular-expression tagger. The first
# pattern that matches a token decides its tag, so the specific suffix rules
# come before the catch-all at the end.
patterns = [
  # gerunds
  (r'.*ing$', 'VERB'),
  # simple past
  (r'.*ed$', 'VERB'),
  # 3rd singular present
  (r'.*es$', 'VERB'),
  # modals
  (r'.*ould$', 'VERB'),
  # possessive nouns
  (r'.*\'s$', 'NOUN'),
  # plural nouns
  (r'.*s$', 'NOUN'),
  # cardinal numbers
  (r'^-?[0-9]+(\.[0-9]+)?$', 'NUM'),
  # nouns (default): anything not matched above
  (r'.*', 'NOUN'),
]

regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger_out = regexp_tagger.tag_sents(text)

## Writing Output to Files

In [None]:
def write_to_file(tagger_output, tagger_name):
  """Write tagged sentences to ``<tagger_name>_output.txt`` and ``<tagger_name>_output.csv``.

  The text file gets one line per sentence, each token rendered as
  ``word/tag`` followed by a space. The CSV file gets one ``word,tag`` row
  per token across all sentences.

  Args:
    tagger_output: iterable of sentences; each sentence is an iterable of
      (word, tag) pairs. Both elements are converted with ``str()``, so a
      missing tag (``None``) is written as the text ``None``.
    tagger_name: prefix used to build the two output file names.
  """
  import csv  # local import keeps this cell runnable on its own

  # Context managers close both files even if a write fails part-way through.
  with open(tagger_name + "_output.txt", "w", encoding="utf-8") as taggedfile, \
       open(tagger_name + "_output.csv", "w", encoding="utf-8", newline="") as taggedfile_csv:
    # csv.writer quotes fields that contain commas or quotes (e.g. the ","
    # punctuation token), which hand-built "word,tag" lines would corrupt.
    csv_writer = csv.writer(taggedfile_csv, lineterminator="\n")
    for line in tagger_output:
      pairs = [(str(sent[0]), str(sent[1])) for sent in line]
      # Build the sentence line in one join instead of repeated concatenation.
      taggedfile.write("".join(word + "/" + tag + " " for word, tag in pairs) + "\n")
      csv_writer.writerows(pairs)

In [None]:
# Map each output-file prefix to the tagged sentences produced by that tagger,
# then write them out in the same order as they were computed.
tagger_outputs = {
  "unigram_tagger": unigram_tagger_out,
  "bigram_tagger": bigram_tagger_out,
  "bigram_tagger_comp": bigram_tagger_comp_out,
  "regexp_tagger": regexp_tagger_out,
}
for output_prefix, tagged_sentences in tagger_outputs.items():
  write_to_file(tagged_sentences, output_prefix)