## **Importing Library**

In [3]:
#importing all the needed libraries
import pandas as pd       
import nltk
import sklearn
import sklearn_crfsuite
import scipy.stats
import math, string, re
import csv
import json

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.special import softmax
import numpy as np

from sklearn.metrics import classification_report

## **Data Import**

**File paths:**

In [4]:
DATA_PATH = '/content/drive/MyDrive/MscThesis/public_data_PP/gimpel_pos'

train_path = DATA_PATH + '/input_data/train_data.tsv'
dev_path = DATA_PATH + '/input_data/dev_data.tsv'

# Path of dev reference file
dev_ref_path ='/content/drive/MyDrive/MscThesis/dev_reference_labels/Gimpel-POS_answers.jsonlines'

In [5]:
def read_csv_data(path):
  '''
  User define function to read the .csv file from the given path
  '''
  data = []
  with open(path, 'r') as f:
    tsvreader = csv.reader(f, delimiter="\t")
    for line in tsvreader:
      data.append(line)
    # to remove the header
  return data[1:]

def load_dictionary(filepath):
    with open(filepath, 'r') as f:
        dictionary = json.load(f)
    return dictionary

def get_dev(path):
  # submissions are in a dictionary format
    hard_lable=[]
    soft_lable=[]
    reference_dictionary = load_dictionary(path)
    for document, doc_contents in reference_dictionary.items():      
        for item_id, contents in doc_contents.items():
            hard_lable.append(contents['gold'])
            soft_lable.append(contents['soft'])

    return hard_lable, soft_lable

In [6]:
# the train and dev data from csv
train_data = read_csv_data(train_path)
train_df = pd.read_csv(train_path, sep='\t')

dev_df = pd.read_csv(dev_path, sep='\t')
h, s = get_dev(dev_ref_path)
dev_df['Hard']=h
dev_df['Soft']=s

In [7]:
train_df.head(5)

Unnamed: 0,Twitt_ID,Token_Id_in_Dataset,Token,Annotations
0,0,0,I,",,,,,,,PRON,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,..."
1,0,1,predict,"VERB,VERB,VERB,VERB,,VERB,VERB,VERB,VERB,NOUN,..."
2,0,2,I,"PRON,PRON,PRON,PRON,,PRON,PRON,PRON,PRON,,PRON..."
3,0,3,won't,",,,,,,,,,,,,,,,,,VERB,,,,,,,,,,,,,,,,,,,,,,,,,..."
4,0,4,win,",,,,,,,,,,,,,,,,,,VERB,,,,,,,,,,,,,,,,,,,,,,,,..."


In [8]:
dev_df.head(5)

Unnamed: 0,Twitt_ID,Token_Id_in_Dataset,Token,Hard,Soft
0,0,0,If,1,"[0.037740984641406065, 0.2788702527442295, 0.0..."
1,0,1,you,7,"[0.0062730078597622975, 0.0062730078597622975,..."
2,0,2,can,10,"[0.006273007859762292, 0.006273007859762292, 0..."
3,0,3,see,10,"[0.014855213984839731, 0.014855213984839731, 0..."
4,0,4,only,0,"[0.11545587797864408, 0.04247384387074182, 0.3..."


### **Data Processing:**


#### **Generating Hard label and Soft Distribution for training data**


In [9]:
def get_soft(string_format):
    annotator_dict = {ann_idx:labels_dict[annotation] for ann_idx, annotation in enumerate(string_format.split(',')) if annotation != ""}
    ann_labs = list(annotator_dict.values())
    distr = [ann_labs.count(l) for l in range(len(labels_dict))]
    return distr, softmax(distr).tolist()

labels_dict = {'ADJ': 0, 'ADP': 1, 'ADV': 2, 'CCONJ': 3, 'DET': 4,'NOUN': 5, 'NUM': 6, 'PART': 8,'PRON': 7,'PUNCT': 9,'VERB': 10, 'X': 11}

train_softs = []
train_distr = []
for line in train_data:
    if line:
        distr, soft = get_soft(line[-1])
        train_softs.append(soft)
        train_distr.append(distr)

#appending soft distribution to the training df
train_df['Distr']= train_distr
train_df['Soft']= train_softs
train_df['Hard_label']=''

In [11]:
#converting numerical label to text label
for i in range(len(train_df)):
  train_df['Hard_label'].iloc[i] = list(labels_dict.keys())[np.argmax(train_df['Soft'].iloc[i])]
  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### **Training Dataframe after processing:**

In [12]:
train_df.head(5)

Unnamed: 0,Twitt_ID,Token_Id_in_Dataset,Token,Annotations,Distr,Soft,Hard_label
0,0,0,I,",,,,,,,PRON,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...","[0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0]","[0.0062730078597622975, 0.0062730078597622975,...",PART
1,0,1,predict,"VERB,VERB,VERB,VERB,,VERB,VERB,VERB,VERB,NOUN,...","[2, 1, 0, 0, 0, 4, 0, 0, 0, 0, 134, 0]","[4.7111658015535965e-58, 1.733141042341547e-58...",VERB
2,0,2,I,"PRON,PRON,PRON,PRON,,PRON,PRON,PRON,PRON,,PRON...","[0, 0, 0, 0, 1, 9, 0, 118, 1, 1, 0, 0]","[5.665668176358939e-52, 5.665668176358939e-52,...",PART
3,0,3,won't,",,,,,,,,,,,,,,,,,VERB,,,,,,,,,,,,,,,,,,,,,,,,,...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0]","[0.014855213984839731, 0.014855213984839731, 0...",VERB
4,0,4,win,",,,,,,,,,,,,,,,,,,VERB,,,,,,,,,,,,,,,,,,,,,,,,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0]","[0.006273007859762292, 0.006273007859762292, 0...",VERB


#### **Dev Data Processing**

In [13]:
dev_df['Hard_label']=''
#converting numerical lable to text label
for i in range(len(dev_df)):
  dev_df['Hard_label'].iloc[i] = list(labels_dict.keys())[dev_df['Hard'].iloc[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


####**Dev Dataframe after processing**

In [107]:
dev_df.head(5)

Unnamed: 0,Twitt_ID,Token_Id_in_Dataset,Token,Hard,Soft,Hard_lable
0,0,0,If,1,"[0.037740984641406065, 0.2788702527442295, 0.0...",ADP
1,0,1,you,7,"[0.0062730078597622975, 0.0062730078597622975,...",PART
2,0,2,can,10,"[0.006273007859762292, 0.006273007859762292, 0...",VERB
3,0,3,see,10,"[0.014855213984839731, 0.014855213984839731, 0...",VERB
4,0,4,only,0,"[0.11545587797864408, 0.04247384387074182, 0.3...",ADJ


### Feature Generation :

In [18]:
def format_data(data):
  """
  Input: dataframe
  Function: Concatenate the token and token label as a pair
  Output: return the list containing token pair
  """
  flag, start= 0,0
  end=0
  sentences=[]
  for i in range(len(data)):
    if data['Twitt_ID'][i]!= flag:
      end= i-1
      txt = list(data['Token'][start:end])
      txt_lable= list(data['Hard_label'][start:end])

      sent=[]
      for j in range(len(txt)):
        sent.append([txt[j], txt_lable[j]])      
      sentences.append(sent)
      start=i
      flag=flag+1

  return sentences

In [15]:
def word2features(sent, i):
    word = sent[i][0]
    # State feature: current token
    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.lower()': word.lower(),
        'word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word.lower()),
        'word.ispunctuation': (word in string.punctuation),
        'word.isdigit()': word.isdigit(),
    }

    # Transition Feature: token one position before
    if i > 0:
      word1 = sent[i-1][0]
      features.update({          
            '-1:word': word1,
            '-1:len(word)': len(word1),
            '-1:word.lower()': word1.lower(),
            '-1:word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word1.lower()),
            '-1:word[:3]': word1[:3],
            '-1:word[:2]': word1[:2],
            '-1:word[-3:]': word1[-3:],
            '-1:word[-2:]': word1[-2:],
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.ispunctuation': (word1 in string.punctuation),})
          
    else:
      features['BOS'] = True #beginning of sentence feature

    # Transition Feature: token two position before
    if i > 1:
        word2 = sent[i-2][0]
        features.update({
            '-2:word': word2,
            '-2:len(word)': len(word2),
            '-2:word.lower()': word2.lower(),
            '-2:word[:3]': word2[:3],
            '-2:word[:2]': word2[:2],
            '-2:word[-3:]': word2[-3:],
            '-2:word[-2:]': word2[-2:],
            '-2:word.isdigit()': word2.isdigit(),
            '-2:word.ispunctuation': (word2 in string.punctuation),})

    # Transition Feature: token one position ahead
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word': word1,
            '+1:len(word)': len(word1),
            '+1:word.lower()': word1.lower(),
            '+1:word[:3]': word1[:3],
            '+1:word[:2]': word1[:2],
            '+1:word[-3:]': word1[-3:],
            '+1:word[-2:]': word1[-2:],
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.ispunctuation': (word1 in string.punctuation),
        })

    else:
        features['EOS'] = True # end of sentence feature

    # Transition Feature: token two position ahead
    if i < len(sent) - 2:
      word2 = sent[i+2][0]
      features.update({
              '+2:word': word2,
              '+2:len(word)': len(word2),
              '+2:word.lower()': word2.lower(),
              '+2:word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word2.lower()),
              '+2:word[:3]': word2[:3],
              '+2:word[:2]': word2[:2],
              '+2:word[-3:]': word2[-3:],
              '+2:word[-2:]': word2[-2:],
              '+2:word.isdigit()': word2.isdigit(),
              '+2:word.ispunctuation': (word2 in string.punctuation),
          })

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [word[1] for word in sent]

def sent2tokens(sent):
    return [word[0] for word in sent]


In [19]:
#formatting the data into sentences
train_sents = format_data(train_df)
dev_sents = format_data(dev_df)

In [26]:
train_sents[1]

[['RT', 'X'],
 ['@USER', 'NOUN'],
 [':', 'PUNCT'],
 ['wat', 'PRON'],
 ['muhfuckaz', 'NOUN'],
 ['wearin', 'VERB'],
 ['4', 'NUM'],
 ['the', 'DET'],
 ['lingerie', 'ADJ'],
 ['party', 'NOUN']]

In [20]:
print('Train data: ', len(train_sents), '| Dev Data', len(dev_sents))

Train data:  599 | Dev Data 199


In [21]:
#extracting features from all the sentences
Xtrain = [sent2features(s) for s in train_sents]
ytrain = [sent2labels(s) for s in train_sents]

Xtest = [sent2features(s) for s in dev_sents]
ytest = [sent2labels(s) for s in dev_sents]

## **CRF Training** 

### **Using Hard Labels**

In [159]:
c1=[0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
c2= [0, 0.1, 0.15, 0.2,0.25, 0.3, 0.35, 0.4]

In [162]:
print("training tagger...")
for i in c1:
  for j in c2:
    crf = sklearn_crfsuite.CRF(algorithm = 'lbfgs',
                               c1 = i,
                               c2 = j,
                               max_iterations = 100,
                               all_possible_transitions=True)    
    
    try:
      crf.fit(Xtrain, ytrain)
    except AttributeError:
      pass

    ypred = crf.predict(Xtrain)
    #obtaining metrics such as accuracy, etc. on the train set
    labels = list(crf.classes_)
    print('C1: ', i, '| C2: ', j)
    print('F1 score train set = {}'.format(metrics.flat_f1_score(ytrain, ypred, average='weighted', labels=labels)))
    print('Accuracy  train set = {}\n'.format(metrics.flat_accuracy_score(ytrain, ypred)))
    
    ypred_d = crf.predict(Xtest)
    print('F1 score test set = {}'.format(metrics.flat_f1_score(ytest, ypred_d,average='weighted', labels=labels)))
    print('Accuracy on test set = {}'.format(metrics.flat_accuracy_score(ytest, ypred_d)))

    print('----------------------------------')
    

training tagger...
C1:  0 | C2:  0
F1 score train set = 0.995591457610168
Accuracy  train set = 0.9955872809863725

F1 score test set = 0.7333302657889865
Accuracy on test set = 0.747693399574166
----------------------------------
C1:  0 | C2:  0.1
F1 score train set = 0.9902597247754735
Accuracy  train set = 0.9902660609993511

F1 score test set = 0.7385793576696729
Accuracy on test set = 0.7562100780695529
----------------------------------
C1:  0 | C2:  0.15
F1 score train set = 0.992721763223498
Accuracy  train set = 0.9927319922128488

F1 score test set = 0.7341394003554922
Accuracy on test set = 0.7526614620298083
----------------------------------
C1:  0 | C2:  0.2
F1 score train set = 0.9880482727874919
Accuracy  train set = 0.9880597014925373

F1 score test set = 0.7368323640157265
Accuracy on test set = 0.752306600425834
----------------------------------
C1:  0 | C2:  0.25
F1 score train set = 0.9933723837582755
Accuracy  train set = 0.9933809214795587

F1 score test set = 0

#### Traing on best hyperparameter

In [22]:
crf = sklearn_crfsuite.CRF(algorithm = 'lbfgs',
                               c1 = 0.25,
                               c2 = 0.1,
                               max_iterations = 100,
                               all_possible_transitions=True)    
    
try:
  crf.fit(Xtrain, ytrain)
except AttributeError:
  pass

#### **Training Evaluation**

In [23]:
#obtaining metrics such as accuracy, etc. on the train set
labels = list(crf.classes_)
#labels.remove('X')

ypred = crf.predict(Xtrain)
print('F1 score on the train set = {}\n'.format(metrics.flat_f1_score(ytrain, ypred, average='weighted', labels=labels)))
print('Accuracy on the train set = {}\n'.format(metrics.flat_accuracy_score(ytrain, ypred)))

print('----------------------------------')
yt, yp=[], []
for i in range(len(ytrain)):
  yt.extend(ytrain[i])
  yp.extend(ypred[i])

print(classification_report(yt, yp))

F1 score on the train set = 0.9994807602610388

Accuracy on the train set = 0.9994808565866321

----------------------------------
              precision    recall  f1-score   support

         ADJ       1.00      1.00      1.00       561
         ADP       1.00      1.00      1.00       787
         ADV       1.00      1.00      1.00       239
       CCONJ       1.00      1.00      1.00       138
         DET       1.00      1.00      1.00       281
        NOUN       1.00      1.00      1.00      2057
         NUM       1.00      1.00      1.00        40
        PART       1.00      1.00      1.00       830
        PRON       1.00      1.00      1.00       295
       PUNCT       1.00      1.00      1.00       893
        VERB       1.00      1.00      1.00      1263
           X       1.00      1.00      1.00       321

    accuracy                           1.00      7705
   macro avg       1.00      1.00      1.00      7705
weighted avg       1.00      1.00      1.00      7705



#### **Devset Evaluation**

In [24]:
#obtaining metrics such as accuracy, etc. on the test set
ypred_d = crf.predict(Xtest)
print('F1 score on the test set = {}\n'.format(metrics.flat_f1_score(ytest, ypred_d,
                      average='weighted', labels=labels)))
print('Accuracy on the test set = {}\n'.format(metrics.flat_accuracy_score(ytest, ypred_d)))

print('----------------------------------')
yt, yp=[], []
for i in range(len(ytest)):
  yt.extend(ytest[i])
  yp.extend(ypred_d[i])

print(classification_report(yt, yp))

F1 score on the test set = 0.7479061747621223

Accuracy on the test set = 0.7636621717530163

----------------------------------
              precision    recall  f1-score   support

         ADJ       0.54      0.54      0.54       165
         ADP       0.76      0.87      0.81       238
         ADV       0.83      0.40      0.54       150
       CCONJ       0.81      0.85      0.83        52
         DET       1.00      0.53      0.69       195
        NOUN       0.77      0.89      0.82       748
         NUM       0.60      0.26      0.36        47
        PART       0.64      0.93      0.76       211
        PRON       0.66      0.36      0.46       138
       PUNCT       0.79      0.98      0.87       278
        VERB       0.85      0.88      0.86       452
           X       0.81      0.40      0.53       144

    accuracy                           0.76      2818
   macro avg       0.76      0.66      0.67      2818
weighted avg       0.77      0.76      0.75      2818



#### **Transation Summary**

In [155]:
#obtaining the most likely and the least likely transitions 
from collections import Counter

def print_transitions(transition_features):
    for (label_from, label_to), weight in transition_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top 10 likely transitions - \n")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop 10 unlikely transitions - \n")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Top 10 likely transitions - 

X      -> X       1.249807
PRON   -> PUNCT   1.141814
ADJ    -> NOUN    0.946512
PART   -> VERB    0.943096
ADP    -> DET     0.752923
CCONJ  -> PART    0.733016
NOUN   -> CCONJ   0.671694
VERB   -> DET     0.644489
ADV    -> VERB    0.634640
VERB   -> ADV     0.628565

Top 10 unlikely transitions - 

ADP    -> X       -0.577970
ADJ    -> X       -0.596109
NOUN   -> ADJ     -0.611641
ADJ    -> PART    -0.635384
PUNCT  -> ADJ     -0.645086
PART   -> X       -0.646575
X      -> ADP     -0.770626
ADV    -> X       -0.772202
NOUN   -> DET     -0.928222
VERB   -> X       -1.084295


### **Using Soft Distribution**

In [27]:
def format_data_soft(data):
  """
  Input: dataframe
  Function: Concatenate the token and token label as a pair
  Output: return the list containing token pair
  """
  flag, start= 0,0
  end=0
  sentences=[]
  for i in range(len(data)):
    if data['Twitt_ID'][i]!= flag:
      end= i-1
      txt = list(data['Token'][start:end])
      txt_label= list(data['Hard_label'][start:end])
      soft_label= list(data['Soft'][start:end])

      sent=[]
      for j in range(len(txt)):
        sent.append([txt[j], txt_label[j], soft_label[j]])      
      sentences.append(sent)
      start=i
      flag=flag+1

  return sentences

In [30]:
#formatting the data into sentences
train_sents = format_data_soft(train_df)
dev_sents = format_data_soft(dev_df)

In [49]:
def word2features_soft(sent, i):
    word = sent[i][0]
    distr = np.array(sent[i][2]) #soft_label distribution

    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.lower()': word.lower(),
        'word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word.lower()),
        'word.ispunctuation': (word in string.punctuation),
        'word.isdigit()': word.isdigit(),
        'var': np.var(distr), #added variance of soft label of word in feature
        'std': np.std(distr)  #added st.dev of soft label of word in feature
    }

    if i > 0:
      word1 = sent[i-1][0]
      distr1 = np.array(sent[i-1][2])
      features.update({          
            '-1:word': word1,
            '-1:len(word)': len(word1),
            '-1:word.lower()': word1.lower(),
            '-1:word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word1.lower()),
            '-1:word[:3]': word1[:3],
            '-1:word[:2]': word1[:2],
            '-1:word[-3:]': word1[-3:],
            '-1:word[-2:]': word1[-2:],
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.ispunctuation': (word1 in string.punctuation),
            '-1:var': np.var(distr1), #added variance of soft label of word in feature
            '-1:std': np.std(distr1), #added st.dev of soft label of word in feature         
            })
      
          
    else:
      features['BOS'] = True

    if i > 1:
        word2 = sent[i-2][0]
        distr2 = np.array(sent[i-2][2])
        features.update({
            '-2:word': word2,
            '-2:len(word)': len(word2),
            '-2:word.lower()': word2.lower(),
            '-2:word[:3]': word2[:3],
            '-2:word[:2]': word2[:2],
            '-2:word[-3:]': word2[-3:],
            '-2:word[-2:]': word2[-2:],
            '-2:word.isdigit()': word2.isdigit(),
            '-2:word.ispunctuation': (word2 in string.punctuation),
            '-2:var': np.var(distr2), #added variance of soft label of word in feature
            '-2:std': np.std(distr2), #added st.dev of soft label of word in feature
            })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        distr1 = np.array(sent[i+1][2])
        features.update({
            '+1:word': word1,
            '+1:len(word)': len(word1),
            '+1:word.lower()': word1.lower(),
            '+1:word[:3]': word1[:3],
            '+1:word[:2]': word1[:2],
            '+1:word[-3:]': word1[-3:],
            '+1:word[-2:]': word1[-2:],
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.ispunctuation': (word1 in string.punctuation),
            '+1:var': np.var(distr1), #added variance of soft label of word in feature
            '+1:std': np.std(distr1), #added st.dev of soft label of word in feature
        })

    else:
        features['EOS'] = True

    if i < len(sent) - 2:
      word2 = sent[i+2][0]
      distr2 = np.array(sent[i+2][2])
      features.update({
              '+2:word': word2,
              '+2:len(word)': len(word2),
              '+2:word.lower()': word2.lower(),
              '+2:word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word2.lower()),
              '+2:word[:3]': word2[:3],
              '+2:word[:2]': word2[:2],
              '+2:word[-3:]': word2[-3:],
              '+2:word[-2:]': word2[-2:],
              '+2:word.isdigit()': word2.isdigit(),
              '+2:word.ispunctuation': (word2 in string.punctuation),
              '+2:var': np.var(distr2), #added variance of soft label of word in feature
              '+2:std': np.std(distr2), #added st.dev of soft label of word in feature
          })

    return features

def sent2features(sent):
    return [word2features_soft(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [word[1] for word in sent]

def sent2tokens(sent):
    return [word[0] for word in sent]


In [50]:
#extracting features from all the sentences
Xtrain = [sent2features(s) for s in train_sents]
ytrain = [sent2labels(s) for s in train_sents]

Xtest = [sent2features(s) for s in dev_sents]
ytest = [sent2labels(s) for s in dev_sents]

In [53]:
c1=[0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
c2= [0, 0.1, 0.15, 0.2,0.25, 0.3, 0.35, 0.4]

In [54]:
print("training tagger...")
for i in c1:
  for j in c2:
    crf = sklearn_crfsuite.CRF(algorithm = 'lbfgs',
                               c1 = i,
                               c2 = j,
                               max_iterations = 100,
                               all_possible_transitions=True)    
    
    try:
      crf.fit(Xtrain, ytrain)
    except AttributeError:
      pass

    ypred = crf.predict(Xtrain)
    #obtaining metrics such as accuracy, etc. on the train set
    labels = list(crf.classes_)
    print('C1: ', i, '| C2: ', j)
    print('F1 score train set = {}'.format(metrics.flat_f1_score(ytrain, ypred, average='weighted', labels=labels)))
    print('Accuracy  train set = {}\n'.format(metrics.flat_accuracy_score(ytrain, ypred)))
    
    ypred_d = crf.predict(Xtest)
    print('F1 score test set = {}'.format(metrics.flat_f1_score(ytest, ypred_d,average='weighted', labels=labels)))
    print('Accuracy on test set = {}'.format(metrics.flat_accuracy_score(ytest, ypred_d)))

    print('----------------------------------')
    

training tagger...
C1:  0 | C2:  0
F1 score train set = 0.9940237668613786
Accuracy  train set = 0.9940298507462687

F1 score test set = 0.7350477287440113
Accuracy on test set = 0.7512420156139106
----------------------------------
C1:  0 | C2:  0.1
F1 score train set = 0.9920822673238652
Accuracy  train set = 0.9920830629461389

F1 score test set = 0.7362883189987486
Accuracy on test set = 0.7544357700496807
----------------------------------
C1:  0 | C2:  0.15
F1 score train set = 0.991561789113483
Accuracy  train set = 0.9915639195327709

F1 score test set = 0.7340524912565379
Accuracy on test set = 0.748403122782115
----------------------------------
C1:  0 | C2:  0.2
F1 score train set = 0.9937685294264134
Accuracy  train set = 0.9937702790395847

F1 score test set = 0.7351534808587241
Accuracy on test set = 0.7505322924059616
----------------------------------
C1:  0 | C2:  0.25
F1 score train set = 0.9950622994368954
Accuracy  train set = 0.9950681375730045

F1 score test set =

In [51]:
crf = sklearn_crfsuite.CRF(algorithm = 'lbfgs',
                               c1 = 0.25,
                               c2 = 0.1,
                               max_iterations = 100,
                               all_possible_transitions=True)    
    
try:
  crf.fit(Xtrain, ytrain)
except AttributeError:
  pass

In [47]:
#obtaining metrics such as accuracy, etc. on the train set
labels = list(crf.classes_)
#labels.remove('X')

ypred = crf.predict(Xtrain)
print('F1 score on the train set = {}\n'.format(metrics.flat_f1_score(ytrain, ypred, average='weighted', labels=labels)))
print('Accuracy on the train set = {}\n'.format(metrics.flat_accuracy_score(ytrain, ypred)))

print('----------------------------------')
yt, yp=[], []
for i in range(len(ytrain)):
  yt.extend(ytrain[i])
  yp.extend(ypred[i])

print(classification_report(yt, yp))

F1 score on the train set = 0.9996105782198832

Accuracy on the train set = 0.999610642439974

----------------------------------
              precision    recall  f1-score   support

         ADJ       1.00      1.00      1.00       561
         ADP       1.00      1.00      1.00       787
         ADV       1.00      1.00      1.00       239
       CCONJ       1.00      1.00      1.00       138
         DET       1.00      1.00      1.00       281
        NOUN       1.00      1.00      1.00      2057
         NUM       1.00      1.00      1.00        40
        PART       1.00      1.00      1.00       830
        PRON       1.00      1.00      1.00       295
       PUNCT       1.00      1.00      1.00       893
        VERB       1.00      1.00      1.00      1263
           X       1.00      1.00      1.00       321

    accuracy                           1.00      7705
   macro avg       1.00      1.00      1.00      7705
weighted avg       1.00      1.00      1.00      7705



In [52]:
#obtaining metrics such as accuracy, etc. on the test set
ypred_d = crf.predict(Xtest)
print('F1 score on the test set = {}\n'.format(metrics.flat_f1_score(ytest, ypred_d,
                      average='weighted', labels=labels)))
print('Accuracy on the test set = {}\n'.format(metrics.flat_accuracy_score(ytest, ypred_d)))

print('----------------------------------')
yt, yp=[], []
for i in range(len(ytest)):
  yt.extend(ytest[i])
  yp.extend(ypred_d[i])

print(classification_report(yt, yp))

F1 score on the test set = 0.7419833425834459

Accuracy on the test set = 0.7572746628814763

----------------------------------
              precision    recall  f1-score   support

         ADJ       0.50      0.53      0.51       165
         ADP       0.76      0.86      0.81       238
         ADV       0.81      0.39      0.52       150
       CCONJ       0.81      0.83      0.82        52
         DET       0.99      0.53      0.69       195
        NOUN       0.76      0.88      0.82       748
         NUM       0.60      0.26      0.36        47
        PART       0.65      0.94      0.77       211
        PRON       0.62      0.36      0.45       138
       PUNCT       0.79      0.98      0.87       278
        VERB       0.84      0.87      0.85       452
           X       0.83      0.40      0.54       144

    accuracy                           0.76      2818
   macro avg       0.75      0.65      0.67      2818
weighted avg       0.77      0.76      0.74      2818



## **References**
1.   scikit-learn. 2022. User guide: contents. [online] Available at: <https://scikit-learn.org/stable/user_guide.html> [Accessed 21 August 2022].

2.   Sklearn-crfsuite.readthedocs.io. 2022. sklearn-crfsuite — sklearn-crfsuite 0.3 documentation. [online] Available at: <https://sklearn-crfsuite.readthedocs.io/en/latest/> [Accessed 21 August 2022].




## **Installation**

In [1]:
!pip install sklearn_crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 28.3 MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6


In [2]:
!pip install scikit-learn==0.22.2 --user

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.22.2
  Downloading scikit_learn-0.22.2-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 24.3 MB/s 
Installing collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.22.2 which is incompatible.
imbalanced-learn 0.8.1 requires scikit-learn>=0.24, but you have scikit-learn 0.22.2 which is incompatible.[0m
Successfully installed scikit-learn-0.22.2
