In [38]:
import pandas as pd, numpy as np
from itertools import chain
from sklearn.model_selection import train_test_split

In [41]:
# Data preparation
data = pd.read_csv('D:/AI Projects/NLP-Projects/Name Entity Recognition/Dataset/ner_dataset.csv', encoding= 'unicode_escape')
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


The **Sentence#**, **Word** and **POS**([Part of Speech](https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/)) columns will represent the **feature X**, and the **Tag** column will represent right **label Y**

In [36]:
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    elif token_or_tag == 'tag':
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

#Replace the NaN value in Sentence# with related Sentence number
data_fillna = data.fillna(method='ffill', axis=0)

data_fillna

# Groupby and collect columns
data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

data_group['String'] = data_group['Sentence #'].apply(lambda x: x[:10])
data_group['Integer'] = data_group['Sentence #'].apply(lambda x: int(x[10:]))
data_group = data_group.sort_values(by=['String', 'Integer'])
data_group

  data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx,String,Integer
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[26523, 28117, 3907, 1689, 4854, 29174, 940, 2...","[11, 11, 11, 11, 11, 11, 8, 11, 11, 11, 11, 11...",Sentence:,1
11111,Sentence: 2,"[Families, of, soldiers, killed, in, the, conf...","[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[35172, 28117, 1327, 12213, 25292, 22705, 9596...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...",Sentence:,2
22222,Sentence: 3,"[They, marched, from, the, Houses, of, Parliam...","[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo...","[18967, 4854, 29783, 22705, 24844, 28117, 5445...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8...",Sentence:,3
33333,Sentence: 4,"[Police, put, the, number, of, marchers, at, 1...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[31962, 19543, 22705, 5032, 28117, 27113, 2627...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...",Sentence:,4
42404,Sentence: 5,"[The, protest, comes, on, the, eve, of, the, a...","[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,...","[9626, 29005, 10637, 15063, 22705, 34031, 2811...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8...",Sentence:,5
...,...,...,...,...,...,...,...,...
42173,Sentence: 47955,"[Indian, border, security, forces, are, accusi...","[JJ, NN, NN, NNS, VBP, VBG, PRP$, JJ, NNS, IN,...","[B-gpe, O, O, O, O, O, O, B-gpe, O, O, O, O, O...","[34718, 15980, 1039, 31025, 12067, 3242, 17563...","[12, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 1...",Sentence:,47955
42174,Sentence: 47956,"[Indian, officials, said, no, one, was, injure...","[JJ, NNS, VBD, DT, NN, VBD, VBN, IN, NNP, POS,...","[B-gpe, O, O, O, O, O, O, O, B-tim, O, O, O, O...","[34718, 29279, 4367, 2856, 29865, 3927, 18460,...","[12, 11, 11, 11, 11, 11, 11, 11, 15, 11, 11, 1...",Sentence:,47956
42175,Sentence: 47957,"[Two, more, landed, in, fields, belonging, to,...","[CD, JJR, VBD, IN, NNS, VBG, TO, DT, JJ, NN, .]","[O, O, O, O, O, O, O, O, O, O, O]","[17979, 33767, 6910, 25292, 639, 32995, 29056,...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]",Sentence:,47957
42176,Sentence: 47958,"[They, say, not, all, of, the, rockets, explod...","[PRP, VBP, RB, DT, IN, DT, NNS, VBD, IN, NN, .]","[O, O, O, O, O, O, O, O, O, O, O]","[18967, 9806, 33459, 30711, 28117, 22705, 1311...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]",Sentence:,47958


In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [3]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [6]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]