**Importing necessary modules and studying the dataset.**



In [2]:
!pip install nltk
!pip install conllu

Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3


In [3]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize


In [4]:
nltk.download('conll2000')
nltk.download('universal_tagset')

[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [5]:
conlldata = list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))

In [6]:
print(conlldata[5:10])

[[('``', '.'), ('If', 'ADP'), ('there', 'DET'), ('is', 'VERB'), ('another', 'DET'), ('bad', 'ADJ'), ('trade', 'NOUN'), ('number', 'NOUN'), (',', '.'), ('there', 'DET'), ('could', 'VERB'), ('be', 'VERB'), ('an', 'DET'), ('awful', 'ADJ'), ('lot', 'NOUN'), ('of', 'ADP'), ('pressure', 'NOUN'), (',', '.'), ("''", '.'), ('noted', 'VERB'), ('Simon', 'NOUN'), ('Briscoe', 'NOUN'), (',', '.'), ('U.K.', 'NOUN'), ('economist', 'NOUN'), ('for', 'ADP'), ('Midland', 'NOUN'), ('Montagu', 'NOUN'), (',', '.'), ('a', 'DET'), ('unit', 'NOUN'), ('of', 'ADP'), ('Midland', 'NOUN'), ('Bank', 'NOUN'), ('PLC', 'NOUN'), ('.', '.')], [('Forecasts', 'NOUN'), ('for', 'ADP'), ('the', 'DET'), ('trade', 'NOUN'), ('figures', 'NOUN'), ('range', 'VERB'), ('widely', 'ADV'), (',', '.'), ('but', 'CONJ'), ('few', 'ADJ'), ('economists', 'NOUN'), ('expect', 'VERB'), ('the', 'DET'), ('data', 'NOUN'), ('to', 'PRT'), ('show', 'VERB'), ('a', 'DET'), ('very', 'ADV'), ('marked', 'VERB'), ('improvement', 'NOUN'), ('from', 'ADP'), ('t

**Feature Function: Defining features for a sentence for extraction.**

In [7]:
def feature_function(sentence, i):
  word = sentence[i][0]
  pos = sentence[i][1]
  features = {
      'word': word,
      'first_word': i == 0,
      'last_word': i == len(sentence) - 1,
      'prev_word': sentence[i-1][0],
      'prev_pos': sentence[i-1][1],
      'capitalized': word[1:].lower() != word[1:],
      'prefix_1': word[:1],
      'prefix_2': word[:2],
      'prefix_3': word[:3],
      'suffix_1': word[-1:],
      'suffix_2': word[-2:],
      'suffix_3': word[-3:]
  }
  return features

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
data_train, data_test = train_test_split(conlldata, train_size=0.75,test_size=0.25)
data_train, dev_set = train_test_split(conlldata, train_size=0.70, test_size=0.30)

In [10]:
print(data_train[0][0:5])
feature_function(data_train[0],0)

[('A', 'DET'), ('Japan', 'NOUN'), ('Air', 'NOUN'), ('Lines', 'NOUN'), ('spokesman', 'NOUN')]


{'word': 'A',
 'first_word': True,
 'last_word': False,
 'prev_word': '.',
 'prev_pos': '.',
 'capitalized': False,
 'prefix_1': 'A',
 'prefix_2': 'A',
 'prefix_3': 'A',
 'suffix_1': 'A',
 'suffix_2': 'A',
 'suffix_3': 'A'}

In [11]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn-crfsuite-0.3.6


In [12]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

**Implementation of CRF**

In [13]:
class LinearChainCRF:
  # Assigning feature weights here
  def __init__(self):
    self.weights = None

  # Defining feature function here
  def feature_function(self, x, y_prev, y):
    return features

  # Computing transition score from y_prev to y given the token x
  def transition(self, x, y_prev, y):
    features = self.feature_function(x, y_prev, y)
    return np.exp(np.dot(features, self.weights))

  # Calculate marginal probabilites
  # Also be used in prediction and parameter estimation
  def forward_backward(self, x):
    return forward_probs, backward_probs

  # Finding the most likely sequence of labels
  def viterbi(self, x):
    return predicted_sequence

  # Objective functin that will be used by optimizer
  def gradient_optimizer(self, weights, *args):
    return log_likelihood, grad

  #Train the model
  def fit(self, X_train, y_train):
    global crf
    crf = sklearn_crfsuite.CRF(
        algorithm = 'lbfgs',
        c1 = 0.1,
        c2 = 0.1,
        max_iterations = 100,
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train)

  def predict(self, X_test):
    return crf.predict(X_test)

**Usage**

In [14]:
linCRF = LinearChainCRF()

X = []
y = []
for sentence in conlldata:
  X_sentence = []
  y_sentence = []
  for i in range(len(sentence)):
    X_sentence.append(feature_function(sentence, i))
    y_sentence.append(sentence[i][1])
  X.append(X_sentence)
  y.append(y_sentence)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

linCRF.fit(X_train, y_train)


In [15]:
y_pred = linCRF.predict(X_test)

In [16]:
print(metrics.flat_accuracy_score(y_test, y_pred))


0.9781790704129785
