In [56]:
import pandas as pd
import numpy as np
import string
import gensim
import os
# NOTE(review): machine-specific absolute path. The relative 'data/...' paths used
# by the preprocessing cells below are resolved against this working directory,
# so this line must be adapted before running the notebook on another machine.
os.chdir(r'C:\Users\anaverageone\htlt_env\Machine_Learning')

In [38]:
def extract_features_and_labels(inputfile):
    """
    Read a tab-separated CoNLL-style file and return its rows with extra token features.

    The file must have no header line and four tab-separated fields per row:
    token, POS tag, chunk tag and gold label. Blank lines are skipped by the
    parser and rows with a missing field are dropped. Every field is read as a
    literal string: quote characters are not interpreted and values such as
    "NA", "null" or "None" are kept as tokens instead of being treated as
    missing data.

    Added feature columns:
      * Cap: True when the token is entirely upper case (``str.isupper``).
      * Digit: True when the token consists only of digits (``str.isdigit``).
      * Preceding_Token: the token of the previous kept row ("." for the first row).
      * Proceeding_Token: the token of the next kept row ("." for the last row).

    Neighbouring tokens are taken from adjacent rows of the table, so they do
    not stop at the blank lines of the input file.

    :param inputfile: the path to the original data file
    :type inputfile: string
    :return df: a dataframe with the columns Token, POS, Chunk, Cap, Digit,
        Preceding_Token, Proceeding_Token and GOLD (in that order); it is empty
        when no complete row is found
    :rtype: pandas.DataFrame
    """
    import csv  # local import: only needed for the QUOTE_NONE constant below

    # Read every field verbatim:
    #  - QUOTE_NONE keeps a bare '"' token from opening a quoted field,
    #  - dtype=str keeps numeric-looking tokens as strings so str methods work,
    #  - only truly empty fields count as missing, so tokens like "NA" survive.
    raw = pd.read_csv(
        inputfile,
        sep='\t',
        header=None,
        names=['Token', 'POS', 'Chunk', 'GOLD'],
        dtype=str,
        quoting=csv.QUOTE_NONE,
        keep_default_na=False,
        na_values=[''],
    )

    # Drop incomplete rows (fewer than four non-empty fields); the index is kept as read.
    complete = raw.dropna()
    tokens = complete['Token'].tolist()

    # Padding with "." and slicing shifts the token list by one position and
    # yields an empty list for empty input, so the lengths always match the frame.
    featured = complete.assign(
        Cap=[token.isupper() for token in tokens],
        Digit=[token.isdigit() for token in tokens],
        Preceding_Token=(['.'] + tokens)[:-1],
        Proceeding_Token=(tokens + ['.'])[1:],
    )

    return featured[['Token', 'POS', 'Chunk', 'Cap', 'Digit',
                     'Preceding_Token', 'Proceeding_Token', 'GOLD']]

In [40]:
# Preprocess the original training set and save it as the corresponding conll file.
# Both paths are relative to the current working directory; the output is written
# tab-separated, with a header row and without the dataframe index.
train_data = 'data/conll2003.train.conll'
train = extract_features_and_labels(train_data)
train.to_csv('data/preprocessed_train_set.conll', sep='\t', index=False)

train  # Display the preprocessed training set (the full frame, not a 10-row slice like the dev/test cells)

Unnamed: 0,Token,POS,Chunk,Cap,Digit,Preceding_Token,Proceeding_Token,GOLD
0,EU,NNP,B-NP,True,False,.,rejects,B-ORG
1,rejects,VBZ,B-VP,False,False,EU,German,O
2,German,JJ,B-NP,False,False,rejects,call,B-MISC
3,call,NN,I-NP,False,False,German,to,O
4,to,TO,B-VP,False,False,call,boycott,O
...,...,...,...,...,...,...,...,...
203616,three,CD,I-NP,False,False,Division,Swansea,O
203617,Swansea,NN,B-NP,False,False,three,1,B-ORG
203618,1,CD,I-NP,False,True,Swansea,Lincoln,O
203619,Lincoln,NNP,I-NP,False,False,1,2,B-ORG


In [41]:
# Preprocess the original development set and save it as the corresponding conll file.
# Both paths are relative to the current working directory; the output is written
# tab-separated, with a header row and without the dataframe index.
dev_data = 'data/conll2003.dev.conll'
dev = extract_features_and_labels(dev_data)
dev.to_csv('data/preprocessed_dev_set.conll', sep='\t', index=False)
# Show the first 10 rows of the preprocessed development set.
dev[:10]

Unnamed: 0,Token,POS,Chunk,Cap,Digit,Preceding_Token,Proceeding_Token,GOLD
0,CRICKET,NNP,B-NP,True,False,.,-,O
1,-,:,O,False,False,CRICKET,LEICESTERSHIRE,O
2,LEICESTERSHIRE,NNP,B-NP,True,False,-,TAKE,B-ORG
3,TAKE,NNP,I-NP,True,False,LEICESTERSHIRE,OVER,O
4,OVER,IN,B-PP,True,False,TAKE,AT,O
5,AT,NNP,B-NP,True,False,OVER,TOP,O
6,TOP,NNP,I-NP,True,False,AT,AFTER,O
7,AFTER,NNP,I-NP,True,False,TOP,INNINGS,O
8,INNINGS,NNP,I-NP,True,False,AFTER,VICTORY,O
9,VICTORY,NN,I-NP,True,False,INNINGS,.,O


In [42]:
# Preprocess the original test set and save it as the corresponding conll file.
# Both paths are relative to the current working directory; the output is written
# tab-separated, with a header row and without the dataframe index.
test_data = 'data/conll2003.test.conll'
test = extract_features_and_labels(test_data)
test.to_csv('data/preprocessed_test_set.conll', sep='\t', index=False)
# Show the first 10 rows of the preprocessed test set.
test[:10]

Unnamed: 0,Token,POS,Chunk,Cap,Digit,Preceding_Token,Proceeding_Token,GOLD
0,SOCCER,NN,B-NP,True,False,.,-,O
1,-,:,O,False,False,SOCCER,JAPAN,O
2,JAPAN,NNP,B-NP,True,False,-,GET,B-LOC
3,GET,VB,B-VP,True,False,JAPAN,LUCKY,O
4,LUCKY,NNP,B-NP,True,False,GET,WIN,O
5,WIN,NNP,I-NP,True,False,LUCKY,",",O
6,",",",",O,False,False,WIN,CHINA,O
7,CHINA,NNP,B-NP,True,False,",",IN,B-PER
8,IN,IN,B-PP,True,False,CHINA,SURPRISE,O
9,SURPRISE,DT,B-NP,True,False,IN,DEFEAT,O
