## Imports

In [50]:
import pandas as pd
import nltk
import numpy as np

## Some Function Definitions

In [3]:
def read_text(path):
    """Return the full contents of the text file at ``path`` as a single string."""
    with open(path, mode='r') as handle:
        contents = handle.read()
    return contents

In [4]:
def write_text(path, text):
    """Write ``text`` to the file at ``path``, replacing any existing contents."""
    with open(path, mode='w') as handle:
        handle.write(text)

In [92]:
def get_dictionary_from_text(text):
    """Parse tab-separated ``token<TAB>pos<TAB>BIO`` lines into column lists.

    The text is split on ``'\\n'``. Every resulting line produces exactly one
    entry in each of the three lists, so the lists always have equal length.
    An empty line is kept as a row of three empty strings.

    A trailing ``'\\r'`` is removed from each line before parsing, so text with
    Windows (CRLF) line endings yields the same result as LF-only text.

    Args:
        text: The file contents as a single string.

    Returns:
        A dict with keys ``'token'``, ``'pos'`` and ``'BIO'``, each mapping to
        a list of strings with one entry per input line.

    Raises:
        ValueError: If a non-empty line does not contain exactly three
            tab-separated fields. The message includes the 1-based line number.
    """
    columns = ('token', 'pos', 'BIO')
    data = {name: [] for name in columns}

    for line_no, raw_line in enumerate(text.split('\n'), start=1):
        # Without this, a CRLF blank line is '\r' (not '') and fails to parse,
        # and the last field of every row would carry a stray '\r'.
        line = raw_line.rstrip('\r')

        if line == '':
            fields = ('', '', '')
        else:
            fields = line.split('\t')
            if len(fields) != len(columns):
                raise ValueError(
                    'line %d: expected %d tab-separated fields, got %d: %r'
                    % (line_no, len(columns), len(fields), raw_line)
                )

        for name, value in zip(columns, fields):
            data[name].append(value)

    return data

## Loading Training Data

In [5]:
train_text = read_text('../WSJ_CHUNK_FILES/WSJ_02-21.pos-chunk')

In [93]:
train_dict = get_dictionary_from_text(train_text)

In [94]:
train_df = pd.DataFrame.from_dict(train_dict)

In [95]:
train_df.head()

Unnamed: 0,token,pos,BIO
0,,,
1,In,IN,O
2,an,DT,B-NP
3,Oct.,NNP,I-NP
4,19,CD,I-NP


## Testing Stemmer and Lemmatizer

### Stemmer

In [20]:
from nltk.stem.porter import PorterStemmer

In [21]:
porter_stemmer = PorterStemmer()

In [23]:
porter_stemmer.stem('provision')

'provis'

In [29]:
%timeit train_df.token.sample(1000).apply(porter_stemmer.stem)

116 ms ± 5.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Lemmatizer

In [24]:
from nltk.stem import WordNetLemmatizer

In [25]:
wordnet_lemmatizer = WordNetLemmatizer()

In [26]:
wordnet_lemmatizer.lemmatize('dogs')

'dog'

In [27]:
train_df.shape

(1979761, 3)

In [30]:
%timeit train_df.token.sample(1000).apply(wordnet_lemmatizer.lemmatize)

110 ms ± 13.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Create Features

- token (the original token)
- pos
- lemma
- stem
- prev_token
- prev_pos
- prev_lemma
- prev_stem
- prev_prev_token
- prev_prev_pos
- prev_prev_lemma
- prev_prev_stem
- next_token
- next_pos
- next_lemma
- next_stem
- next_next_token
- next_next_pos
- next_next_lemma
- next_next_stem
- BIO

In [37]:
# Start the feature table from an explicit copy of the two input columns.
# New columns are added to features_df in later cells; copying here keeps those
# writes off a slice of train_df and avoids pandas' SettingWithCopyWarning.
features_df = train_df[['token', 'pos']].copy()

In [38]:
features_df.shape

(1979761, 2)

In [39]:
features_df.head()

Unnamed: 0,token,pos
0,,
1,In,IN
2,an,DT
3,Oct.,NNP
4,19,CD


In [40]:
# Lemmatize each distinct token once and map the result back onto every row.
# This gives the same values as applying the lemmatizer row by row, but calls
# it once per unique token instead of once per row.
lemma_by_token = {
    tok: wordnet_lemmatizer.lemmatize(tok) for tok in features_df.token.unique()
}
features_df['lemma'] = features_df.token.map(lemma_by_token)

In [45]:
# Stem each distinct token once and map the result back onto every row.
# This gives the same values as applying the stemmer row by row, but calls it
# once per unique token instead of once per row.
stem_by_token = {
    tok: porter_stemmer.stem(tok) for tok in features_df.token.unique()
}
features_df['stem'] = features_df.token.map(stem_by_token)

In [47]:
# Spot-check random rows to compare each token with its lemma and stem.
features_df.sample(100)

Unnamed: 0,token,pos,lemma,stem
1369880,the,DT,the,the
1919983,of,IN,of,of
330929,%,NN,%,%
1872342,Patent,NNP,Patent,patent
1807377,assets,NNS,asset,asset
1441269,--,:,--,--
1777161,on,IN,on,on
337566,",",",",",",","
1080379,.,.,.,.
1347682,the,DT,the,the


In [71]:
def get_shifted_column(col, pad=0, fill=''):
    """Shift the values of ``col`` by ``pad`` positions, padding with ``fill``.

    The result always has the same length as ``col``. Values pushed past either
    end are dropped and the vacated positions are filled with ``fill``.

    Args:
        col: A sliceable sequence of values, e.g. a NumPy array.
        pad: Number of positions to shift. A positive value moves values toward
            later positions (position ``i`` receives the value from ``i - pad``).
            A negative value moves them toward earlier positions. Zero returns
            ``col`` unchanged.
        fill: Value placed in the vacated positions. Defaults to ``''``.

    Returns:
        ``col`` itself when ``pad`` is 0, otherwise a new NumPy array built with
        ``np.concatenate``.
    """
    if pad == 0:
        return col

    length = len(col)
    # Clamp the shift: without this, |pad| > len(col) would produce |pad|
    # padding values and change the length of the result.
    shift = min(abs(pad), length)
    padding = [fill] * shift

    if pad > 0:
        return np.concatenate([padding, col[:length - shift]])
    return np.concatenate([col[shift:], padding])

### Add previous- and next-token context features

In [79]:
features_df.head(1)

Unnamed: 0,token,pos,lemma,stem
0,,,,


In [82]:
# Context from the row immediately before each token (shift by +1).
for feature in ('token', 'pos', 'lemma', 'stem'):
    features_df['prev_' + feature] = get_shifted_column(features_df[feature].values, +1)

In [84]:
# Context from two rows before each token (shift by +2).
for feature in ('token', 'pos', 'lemma', 'stem'):
    features_df['prev_prev_' + feature] = get_shifted_column(features_df[feature].values, +2)

In [87]:
# Context from the row immediately after each token (shift by -1).
for feature in ('token', 'pos', 'lemma', 'stem'):
    features_df['next_' + feature] = get_shifted_column(features_df[feature].values, -1)

In [88]:
# Context from two rows after each token (shift by -2).
for feature in ('token', 'pos', 'lemma', 'stem'):
    features_df['next_next_' + feature] = get_shifted_column(features_df[feature].values, -2)

In [96]:
features_df['BIO'] = train_df.BIO.values

In [97]:
features_df.head()

Unnamed: 0,token,pos,lemma,stem,prev_token,prev_pos,prev_lemma,prev_stem,prev_prev_token,prev_prev_pos,...,prev_prev_stem,next_token,next_pos,next_lemma,next_stem,next_next_token,next_next_pos,next_next_lemma,next_next_stem,BIO
0,,,,,,,,,,,...,,In,IN,In,In,an,DT,an,an,
1,In,IN,In,In,,,,,,,...,,an,DT,an,an,Oct.,NNP,Oct.,oct.,O
2,an,DT,an,an,In,IN,In,In,,,...,,Oct.,NNP,Oct.,oct.,19,CD,19,19,B-NP
3,Oct.,NNP,Oct.,oct.,an,DT,an,an,In,IN,...,In,19,CD,19,19,review,NN,review,review,I-NP
4,19,CD,19,19,Oct.,NNP,Oct.,oct.,an,DT,...,an,review,NN,review,review,of,IN,of,of,I-NP


In [None]:
# Save Generated Training Data