In [132]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [133]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [134]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [135]:
def split_data(X,y,testset_size=0.3):
    """Split train-test data""" 
    return train_test_split(X, y, test_size=testset_size, random_state=0)

In [136]:
def train_model(X_train, y_train):
    # Create CRF model
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )

    # Fit CRF model
    crf.fit(X_train, y_train)
    
    return crf

In [137]:
def evaluate_performance(y_test, y_pred,avg='weighted'):
    """evaluate model performance"""
    accuracy=metrics.flat_f1_score(y_test, y_pred, average=avg)
    precison=metrics.flat_precision_score(y_test, y_pred, average=avg)
    recall=metrics.flat_recall_score(y_test, y_pred, average=avg)
    f1score=metrics.flat_f1_score(y_test, y_pred, average=avg)
    return {"accuracy":accuracy, "precison":precison, "recall":recall, "f1score":f1score}

In [138]:
# Load dataset
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")
df.head() 

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [139]:
# Get Sentence
getter = SentenceGetter(df)
sentences = getter.sentences

# get features and labels
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [140]:
len(X)

47959

In [141]:
# Split data
X_train, X_test, y_train, y_test = split_data(X,y)

In [142]:
crf=train_model(X_train, y_train)

In [143]:
# Predict the labels
y_pred = crf.predict(X_test)

# evaluate the performance
evaluate_performance(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.9353594409301975,
 'precison': 0.9365881697278384,
 'recall': 0.9373088685015291,
 'f1score': 0.9353594409301975}

In [127]:
X_features = [(word2features(s, i)) for i in range(len(s)) for s in sentences]
y = [s for s in sent2labels(sent) for sent in sentences]

In [94]:
df_features=pd.DataFrame(X_features)
X_features_list=df_features.values.tolist()

In [103]:
X = df.drop('Tag', axis=1)
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(X[:100].to_dict('records'))

In [122]:
df_features.

Unnamed: 0,bias,word.lower(),word[-3:],word[-2:],word.isupper(),word.istitle(),word.isdigit(),postag,postag[:2],BOS,EOS
0,1.0,thousands,nds,ds,False,True,False,NNS,NN,True,True
1,1.0,iranian,ian,an,False,True,False,JJ,JJ,True,True
2,1.0,helicopter,ter,er,False,True,False,NN,NN,True,True
3,1.0,they,hey,ey,False,True,False,PRP,PR,True,True
4,1.0,u.n.,.N.,N.,True,True,False,NNP,NN,True,True
...,...,...,...,...,...,...,...,...,...,...,...
47954,1.0,opposition,ion,on,False,True,False,NNP,NN,True,True
47955,1.0,on,On,On,False,True,False,IN,IN,True,True
47956,1.0,following,ing,ng,False,True,False,VBG,VB,True,True
47957,1.0,since,nce,ce,False,True,False,IN,IN,True,True


## Perceptron

In [144]:
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split


df = df.fillna(method='ffill')
df1=df[:10000]
X = df1.drop('Tag', axis=1)
y = df1.Tag.values
classes=df1.Tag.unique()

dv = DictVectorizer(sparse=False)
X = dv.fit_transform(X.to_dict('records'))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)

per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

y_pred=per.predict(X_test)

# evaluate the performance
evaluate_performance(y_test, y_pred)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s


-- Epoch 1-- Epoch 1

-- Epoch 1
-- Epoch 1-- Epoch 1
-- Epoch 1

-- Epoch 1
-- Epoch 1
Norm: 23.56, NNZs: 362, Bias: -3.000000, T: 7000, Avg. loss: 0.043429
Total training time: 0.14 seconds.
-- Epoch 1Norm: 8.83, NNZs: 69, Bias: -4.000000, T: 7000, Avg. loss: 0.004714
Total training time: 0.13 seconds.

-- Epoch 1
Norm: 5.83, NNZs: 31, Bias: -2.000000, T: 7000, Avg. loss: 0.001857
Total training time: 0.16 seconds.
-- Epoch 1
Norm: 5.29, NNZs: 20, Bias: -2.000000, T: 7000, Avg. loss: 0.001143
Total training time: 0.15 seconds.
-- Epoch 1
Norm: 19.77, NNZs: 257, Bias: -5.000000, T: 7000, Avg. loss: 0.030000
Total training time: 0.17 seconds.
-- Epoch 1
Norm: 26.57, NNZs: 384, Bias: -4.000000, T: 7000, Avg. loss: 0.041571
Total training time: 0.16 seconds.
-- Epoch 1
Norm: 18.41, NNZs: 231, Bias: -3.000000, T: 7000, Avg. loss: 0.024000
Total training time: 0.16 seconds.
-- Epoch 1

[Parallel(n_jobs=-1)]: Done   4 out of  17 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   6 out of  17 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   8 out of  17 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  17 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  17 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:    0.2s finished



Norm: 17.35, NNZs: 173, Bias: -3.000000, T: 7000, Avg. loss: 0.011857
Total training time: 0.17 seconds.
-- Epoch 1
Norm: 7.42, NNZs: 49, Bias: -3.000000, T: 7000, Avg. loss: 0.002571
Total training time: 0.10 seconds.
-- Epoch 1
Norm: 6.16, NNZs: 32, Bias: -2.000000, T: 7000, Avg. loss: 0.001571
Total training time: 0.11 seconds.
Norm: 8.77, NNZs: 74, Bias: -3.000000, T: 7000, Avg. loss: 0.006143
Norm: 21.63, NNZs: 320, Bias: -4.000000, T: 7000, Avg. loss: 0.032857
Total training time: 0.10 seconds.
Total training time: 0.13 seconds.
Norm: 18.38, NNZs: 232, Bias: -4.000000, T: 7000, Avg. loss: 0.026143
Total training time: 0.10 seconds.
Norm: 7.94, NNZs: 52, Bias: -3.000000, T: 7000, Avg. loss: 0.003714
Total training time: 0.13 seconds.
Norm: 3.16, NNZs: 10, Bias: -2.000000, T: 7000, Avg. loss: 0.000143
Total training time: 0.11 seconds.
Norm: 6.08, NNZs: 37, Bias: -3.000000, T: 7000, Avg. loss: 0.003286
Total training time: 0.09 seconds.
Norm: 24.21, NNZs: 352, Bias: 2.000000, T: 7

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.41489255682524484,
 'precison': 0.4251889733367007,
 'recall': 0.417436974789916,
 'f1score': 0.41489255682524484}

In [146]:
df1.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
9995,Sentence: 457,war,NN,O
9996,Sentence: 457,crimes,NNS,O
9997,Sentence: 457,tribunal,NN,O
9998,Sentence: 457,in,IN,O
9999,Sentence: 457,The,DT,O


## SGDClassifier

In [119]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(X_train, y_train)

y_pred=per.predict(X_test)

# evaluate the performance
evaluate_performance(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.41489255682524484,
 'precison': 0.4251889733367007,
 'recall': 0.417436974789916,
 'f1score': 0.41489255682524484}

## PassiveAggressiveClassifier

In [130]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

pa =PassiveAggressiveClassifier()
pa.fit(X_train, y_train)

y_pred=per.predict(X_test)

# evaluate the performance
evaluate_performance(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.41489255682524484,
 'precison': 0.4251889733367007,
 'recall': 0.417436974789916,
 'f1score': 0.41489255682524484}

## MultinomialNB

In [121]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB(alpha=0.01)
nb.fit(X_train, y_train)


y_pred=per.predict(X_test)

# evaluate the performance
evaluate_performance(y_test, y_pred)


  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.41489255682524484,
 'precison': 0.4251889733367007,
 'recall': 0.417436974789916,
 'f1score': 0.41489255682524484}