## Imports

In [50]:
import pandas as pd
import nltk
import numpy as np

## Some Function Definitions

In [3]:
def read_text(path):
    """Return the entire contents of the text file at ``path`` as one string."""
    with open(path, mode='r') as handle:
        contents = handle.read()
    return contents

In [4]:
def write_text(path, text):
    """Write ``text`` to the file at ``path``.

    The file is opened in ``'w'`` mode, so existing contents are replaced.
    """
    with open(path, mode='w') as handle:
        handle.write(text)

In [92]:
def get_dictionary_from_text(text):
    """Parse ``token<TAB>pos<TAB>BIO`` lines into three parallel lists.

    The text is split on ``'\\n'``. Every resulting line produces exactly one
    entry in each list, so an empty line (including the empty string that
    follows a trailing newline) is kept as a row of three empty strings.

    Args:
        text: Raw file contents, one tab-separated record per line.

    Returns:
        A dict with keys ``'token'``, ``'pos'`` and ``'BIO'``, each mapping to
        a list with one entry per input line.

    Raises:
        ValueError: If a non-empty line does not have exactly three
            tab-separated fields. The message names the offending line.
    """
    data = {'token': [], 'pos': [], 'BIO': []}
    for line_no, raw_line in enumerate(text.split('\n'), start=1):
        # Drop a trailing carriage return so CRLF files parse like LF files;
        # otherwise a blank line arrives as '\r' and labels end in '\r'.
        line = raw_line.rstrip('\r')
        if line == '':
            fields = ['', '', '']
        else:
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    'line %d: expected 3 tab-separated fields, got %d: %r'
                    % (line_no, len(fields), raw_line)
                )
        tok, pos, chk = fields
        data['token'].append(tok)
        data['pos'].append(pos)
        data['BIO'].append(chk)

    return data

## Loading Training Data

In [5]:
# Load the whole training file into memory as a single string.
train_text = read_text('../WSJ_CHUNK_FILES/WSJ_02-21.pos-chunk')

In [93]:
# Split the raw text into parallel 'token' / 'pos' / 'BIO' lists, one entry
# per input line (empty lines become rows of empty strings).
train_dict = get_dictionary_from_text(train_text)

In [94]:
# One DataFrame column per dictionary key: token, pos, BIO.
train_df = pd.DataFrame(data=train_dict)

In [95]:
train_df.head()

Unnamed: 0,token,pos,BIO
0,,,
1,In,IN,O
2,an,DT,B-NP
3,Oct.,NNP,I-NP
4,19,CD,I-NP


## Testing Stemmer and Lemmatizer

### Stemmer

In [20]:
from nltk.stem.porter import PorterStemmer

In [21]:
# Single stemmer instance, reused below for the timing check and for the
# `stem` feature column.
porter_stemmer = PorterStemmer()

In [23]:
porter_stemmer.stem('provision')

'provis'

In [29]:
%timeit train_df.token.sample(1000).apply(porter_stemmer.stem)

116 ms ± 5.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Lemmatizer

In [24]:
from nltk.stem import WordNetLemmatizer

In [25]:
# Single lemmatizer instance, reused below for the timing check and for the
# `lemma` feature column.
wordnet_lemmatizer = WordNetLemmatizer()

In [26]:
wordnet_lemmatizer.lemmatize('dogs')

'dog'

In [27]:
train_df.shape

(1979761, 3)

In [30]:
%timeit train_df.token.sample(1000).apply(wordnet_lemmatizer.lemmatize)

110 ms ± 13.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Create Features

- token (the original token)
- pos
- lemma
- stem
- prev_token
- prev_pos
- prev_lemma
- prev_stem
- prev_prev_token
- prev_prev_pos
- prev_prev_lemma
- prev_prev_stem
- next_token
- next_pos
- next_lemma
- next_stem
- next_next_token
- next_next_pos
- next_next_lemma
- next_next_stem
- BIO

In [37]:
# Start from the first two columns (token, pos). Take an explicit copy:
# later cells add columns to this frame, and assigning into a slice of
# train_df triggers pandas' SettingWithCopyWarning.
features_df = train_df.iloc[:, :2].copy()

In [38]:
features_df.shape

(1979761, 2)

In [39]:
features_df.head()

Unnamed: 0,token,pos
0,,
1,In,IN
2,an,DT
3,Oct.,NNP
4,19,CD


In [40]:
# Lemmatize each distinct token once and map the result back onto the rows.
# The column has ~2M rows but far fewer distinct tokens, so this avoids
# repeating the same lemmatizer call for every occurrence.
# NOTE(review): lemmatize() is called without a POS argument, so the `pos`
# column is not used here - confirm that is intended.
_lemma_by_token = {
    tok: wordnet_lemmatizer.lemmatize(tok)
    for tok in features_df['token'].unique()
}
features_df['lemma'] = features_df['token'].map(_lemma_by_token)

In [45]:
# Stem each distinct token once and map the result back onto the rows,
# instead of re-running the stemmer for every repeated occurrence.
_stem_by_token = {
    tok: porter_stemmer.stem(tok)
    for tok in features_df['token'].unique()
}
features_df['stem'] = features_df['token'].map(_stem_by_token)

In [47]:
# Spot-check a random sample of rows with the new lemma/stem columns.
features_df.sample(100)

Unnamed: 0,token,pos,lemma,stem
1369880,the,DT,the,the
1919983,of,IN,of,of
330929,%,NN,%,%
1872342,Patent,NNP,Patent,patent
1807377,assets,NNS,asset,asset
1441269,--,:,--,--
1777161,on,IN,on,on
337566,",",",",",",","
1080379,.,.,.,.
1347682,the,DT,the,the


In [71]:
def get_shifted_column(col, pad=0):
    """Shift a column by ``pad`` positions, filling vacated slots with ''.

    A positive ``pad`` moves values towards the end (row ``i`` receives the
    value from row ``i - pad``); a negative ``pad`` moves them towards the
    start (row ``i`` receives the value from row ``i + |pad|``). The result
    always has the same length as ``col``.

    Args:
        col: Sliceable sequence with a length, e.g. the ``.values`` array of
            a DataFrame column.
        pad: Number of positions to shift; ``0`` leaves the column as is.

    Returns:
        ``col`` itself when there is nothing to shift (``pad == 0`` or an
        empty column), otherwise a new array built with ``np.concatenate``.
    """
    size = len(col)
    if pad == 0 or size == 0:
        return col

    # Clamp the shift to the column length: with a larger offset the slice
    # is empty and the padding alone would make the result longer than col.
    shift = min(abs(pad), size)
    blanks = [''] * shift
    if pad > 0:
        return np.concatenate([blanks, col[:size - shift]])
    return np.concatenate([col[shift:], blanks])

### Add previous and next token features also

In [79]:
features_df.head(1)

Unnamed: 0,token,pos,lemma,stem
0,,,,


In [82]:
# Previous-row context: each base column shifted down by one position, so
# row i holds the values of row i - 1 (blank for the first row).
for _base in ('token', 'pos', 'lemma', 'stem'):
    features_df['prev_' + _base] = get_shifted_column(features_df[_base].values, +1)

In [84]:
# Context from two rows back: row i holds the values of row i - 2.
# NOTE: the shift is purely positional over the whole frame, so this window
# can reach past a blank separator row into the rows before it.
for _base in ('token', 'pos', 'lemma', 'stem'):
    features_df['prev_prev_' + _base] = get_shifted_column(features_df[_base].values, +2)

In [87]:
# Next-row context: each base column shifted up by one position, so row i
# holds the values of row i + 1 (blank for the last row).
for _base in ('token', 'pos', 'lemma', 'stem'):
    features_df['next_' + _base] = get_shifted_column(features_df[_base].values, -1)

In [88]:
# Context from two rows ahead: row i holds the values of row i + 2.
# NOTE: the shift is purely positional over the whole frame, so this window
# can reach past a blank separator row into the rows after it.
for _base in ('token', 'pos', 'lemma', 'stem'):
    features_df['next_next_' + _base] = get_shifted_column(features_df[_base].values, -2)

In [96]:
# Append the BIO label as the last column. Using the raw `.values` array
# assigns by position rather than by index alignment.
features_df['BIO'] = train_df['BIO'].values

In [104]:
features_df.head(3)

Unnamed: 0,token,pos,lemma,stem,prev_token,prev_pos,prev_lemma,prev_stem,prev_prev_token,prev_prev_pos,...,prev_prev_stem,next_token,next_pos,next_lemma,next_stem,next_next_token,next_next_pos,next_next_lemma,next_next_stem,BIO
0,,,,,,,,,,,...,,In,IN,In,In,an,DT,an,an,
1,In,IN,In,In,,,,,,,...,,an,DT,an,an,Oct.,NNP,Oct.,oct.,O
2,an,DT,an,an,In,IN,In,In,,,...,,Oct.,NNP,Oct.,oct.,19,CD,19,19,B-NP


In [105]:
features_df.loc[features_df['prev_token'] == '', :]

Unnamed: 0,token,pos,lemma,stem,prev_token,prev_pos,prev_lemma,prev_stem,prev_prev_token,prev_prev_pos,...,prev_prev_stem,next_token,next_pos,next_lemma,next_stem,next_next_token,next_next_pos,next_next_lemma,next_next_stem,BIO
0,,,,,,,,,,,...,,In,IN,In,In,an,DT,an,an,
1,In,IN,In,In,,,,,,,...,,an,DT,an,an,Oct.,NNP,Oct.,oct.,O
51,Ms.,NNP,Ms.,ms.,,,,,.,.,...,.,Haag,NNP,Haag,haag,plays,VBZ,play,play,B-NP
57,Rolls-Royce,NNP,Rolls-Royce,rolls-royc,,,,,.,.,...,.,Motor,NNP,Motor,motor,Cars,NNPS,Cars,car,B-NP
78,The,DT,The,the,,,,,.,.,...,.,luxury,NN,luxury,luxuri,auto,NN,auto,auto,B-NP
91,Howard,NNP,Howard,howard,,,,,U.S.,NNP,...,u.s.,Mosher,NNP,Mosher,mosher,",",",",",",",",B-NP
121,BELL,NNP,BELL,bell,,,,,.,.,...,.,INDUSTRIES,NNP,INDUSTRIES,industri,Inc.,NNP,Inc.,inc.,B-NP
137,The,DT,The,the,,,,,.,.,...,.,new,JJ,new,new,rate,NN,rate,rate,B-NP
147,A,DT,A,A,,,,,.,.,...,.,record,NN,record,record,date,NN,date,date,B-NP
156,Bell,NNP,Bell,bell,,,,,.,.,...,.,",",",",",",",",based,VBN,based,base,B-NP


In [102]:
features_df.tail(20)

Unnamed: 0,token,pos,lemma,stem,prev_token,prev_pos,prev_lemma,prev_stem,prev_prev_token,prev_prev_pos,...,prev_prev_stem,next_token,next_pos,next_lemma,next_stem,next_next_token,next_next_pos,next_next_lemma,next_next_stem,BIO
1979741,,,,,.,.,.,.,bidders,NNS,...,bidder,That,DT,That,that,could,MD,could,could,
1979742,That,DT,That,that,,,,,.,.,...,.,could,MD,could,could,cost,VB,cost,cost,B-NP
1979743,could,MD,could,could,That,DT,That,that,,,...,,cost,VB,cost,cost,him,PRP,him,him,O
1979744,cost,VB,cost,cost,could,MD,could,could,That,DT,...,that,him,PRP,him,him,the,DT,the,the,O
1979745,him,PRP,him,him,cost,VB,cost,cost,could,MD,...,could,the,DT,the,the,chance,NN,chance,chanc,B-NP
1979746,the,DT,the,the,him,PRP,him,him,cost,VB,...,cost,chance,NN,chance,chanc,to,TO,to,to,B-NP
1979747,chance,NN,chance,chanc,the,DT,the,the,him,PRP,...,him,to,TO,to,to,influence,VB,influence,influenc,I-NP
1979748,to,TO,to,to,chance,NN,chance,chanc,the,DT,...,the,influence,VB,influence,influenc,the,DT,the,the,O
1979749,influence,VB,influence,influenc,to,TO,to,to,chance,NN,...,chanc,the,DT,the,the,outcome,NN,outcome,outcom,O
1979750,the,DT,the,the,influence,VB,influence,influenc,to,TO,...,to,outcome,NN,outcome,outcom,and,CC,and,and,B-NP


## Save Generated Training Data

In [99]:
def save_df_as_text(df, path):
    """Write ``df`` to ``path`` as tab-separated text.

    Neither the header row nor the index column is written, so each output
    line contains only the cell values of one row.
    """
    export_options = {'sep': '\t', 'header': False, 'index': False}
    df.to_csv(path, **export_options)

In [100]:
# Persist the feature table as headerless, tab-separated text.
save_df_as_text(df=features_df, path='../WSJ_CHUNK_FILES/training.feature_1')

In [98]:
# Same headerless, tab-separated export as above, written to a second path.
# Goes through save_df_as_text so both files share one set of options.
save_df_as_text(features_df, '../WSJ_CHUNK_FILES/training.feature.csv')