In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the NER corpus. Malformed rows are skipped with a warning.
# `error_bad_lines` was removed in pandas 2.0 in favour of `on_bad_lines`,
# so try the current keyword first and fall back for older pandas releases
# (which reject the unknown keyword with a TypeError).
try:
    data = pd.read_csv("ner.csv", encoding="ISO-8859-1", on_bad_lines="warn")
except TypeError:
    data = pd.read_csv("ner.csv", encoding="ISO-8859-1", error_bad_lines=False)

# Drop rows with any missing value and verify that none remain.
data = data.dropna()
assert not data.isnull().values.any(), "null values remain after dropna()"

# First 20000 rows are used for fitting, the next 10000 for validation.
data_small = data[:20000]
data_valid = data[20000:30000]

# Every column except the label is a predictor.
preds = [column for column in data.columns.values if column != 'tag']
y_small = data_small['tag']
x_small = data_small[preds]

# Split into train and test data
x_train, x_test, y_train, y_test = train_test_split(x_small, y_small, test_size=0.2, random_state=0)
print(x_train.shape)
print(y_train.shape)

# Vocabularies are taken from the full (cleaned) frame, not only from data_small.
pos_list = list(set(data['pos']))
shape_list = list(set(data['shape']))
word_list = list(set(data['word']))
tag_list = list(set(data['tag'].values))

print(len(word_list))


Skipping line 281837: expected 25 fields, saw 34



(16000, 24)
(16000,)
30172


In [3]:
# INITIAL STATE PROBABILITIES

# Fraction of the rows in data_small that carry each tag.
n_small_rows = len(data_small)
initial_tag_probs = {}
for tag in tag_list:
    rows_with_tag = data_small[data_small['tag'] == tag]
    initial_tag_probs[tag] = 1.0 * len(rows_with_tag) / n_small_rows

print(initial_tag_probs)

# Baseline predictor: label every row of data_small as 'O'.
pred_b = ['O'] * len(data_small)

print()
print("BASELINE ALL 'O' CLASSIFIER")
baseline_accuracy = accuracy_score(data_small['tag'], pred_b)
print("Accuracy Score: " + str(baseline_accuracy))
baseline_f1 = f1_score(data_small['tag'], pred_b, labels=tag_list, average="weighted")
print("F1 Score: " + str(baseline_f1))

{u'I-art': 0.0011, u'B-nat': 0.00045, u'B-gpe': 0.02535, u'B-art': 0.00185, u'I-tim': 0.0025, u'B-org': 0.01895, u'I-per': 0.01775, u'B-geo': 0.02555, u'I-org': 0.01395, u'I-geo': 0.00395, u'O': 0.8556, u'I-eve': 0.0007, u'B-eve': 0.0009, u'I-gpe': 0.0013, u'B-tim': 0.0156, u'I-nat': 0.00025, u'B-per': 0.01425}
()
BASELINE ALL 'O' CLASSIFIER
Accuracy Score: 0.8556
F1 Score: 0.789018495365


  'precision', 'predicted', average, warn_for)


In [4]:
# TRANSITION PROBABILITIES, from tag to tag

# transition_probs[tag][prev] is the share of rows labelled `tag` whose
# 'prev-iob' value equals `prev` (no smoothing).
prev_tag_values = list(set(data_small['prev-iob']))

transition_probs = {}
for tag1 in tag_list:
    rows_for_tag = data_small[data_small['tag'] == tag1]
    prev_column = rows_for_tag['prev-iob']
    n_rows_for_tag = len(rows_for_tag)
    transition_probs[tag1] = dict(
        (tag2, len(rows_for_tag[prev_column == tag2]) * 1.0 / n_rows_for_tag)
        for tag2 in prev_tag_values
    )

# Same table rebuilt over tag_list with a small additive pseudo-count (alpha)
# so that unseen (tag, previous tag) pairs get a non-zero probability.
alpha = 0.01

transition_probs2 = {}
for tag1 in tag_list:
    rows_for_tag = data_small[data_small['tag'] == tag1]
    prev_column = rows_for_tag['prev-iob']
    smoothed_total = len(rows_for_tag) + alpha * len(tag_list)
    transition_probs2[tag1] = dict(
        (tag2, (alpha + len(rows_for_tag[prev_column == tag2]) * 1.0) / smoothed_total)
        for tag2 in tag_list
    )
    
#print(transition_probs)


In [5]:
# EMISSION PROBABILITIES (this is the part we will use ML for)

alpha = 0.01

# word_emission_probs[word][tag]: additively smoothed share of the rows for
# `word` (within data_small) that carry `tag`.
word_emission_probs = {}
for word in list(set(data_small['word'])):
    word_data = data_small[data_small['word'] == word]
    # Float denominator: row count plus one pseudo-count of alpha per tag.
    smoothed_total = len(word_data) + alpha*len(tag_list)
    tag_probs = {}
    for tag in tag_list:
        tag_data = word_data[word_data['tag'] == tag]
        # alpha belongs in the numerator: (count + alpha) / (total + alpha * n_tags).
        # The previous form added alpha to the already-divided ratio, so the
        # values for one word did not sum to 1.
        tag_probs[tag] = (alpha + len(tag_data)) / smoothed_total
    word_emission_probs[word] = tag_probs

In [6]:
alpha = 0.01

# pos_emission_probs[pos][tag]: additively smoothed share of the rows with
# part-of-speech `pos` (within data_small) that carry `tag`.
pos_emission_probs = {}
for pos in pos_list:
    pos_data = data_small[data_small['pos'] == pos]
    # Float denominator: row count plus one pseudo-count of alpha per tag.
    smoothed_total = len(pos_data) + alpha*len(tag_list)
    tag_probs = {}
    for tag in tag_list:
        tag_data = pos_data[pos_data['tag'] == tag]
        # alpha belongs in the numerator: (count + alpha) / (total + alpha * n_tags).
        tag_probs[tag] = (alpha + len(tag_data)) / smoothed_total
    # Key by the current POS value; the earlier code keyed by a stale `word`
    # variable left over from another cell, overwriting a single entry.
    pos_emission_probs[pos] = tag_probs

In [7]:
alpha = 0.01

# shape_emission_probs[shape][tag]: additively smoothed share of the rows with
# word shape `shape` (within data_small) that carry `tag`.
shape_emission_probs = {}
for shape in shape_list:
    shape_data = data_small[data_small['shape'] == shape]
    # Float denominator: row count plus one pseudo-count of alpha per tag.
    smoothed_total = len(shape_data) + alpha*len(tag_list)
    tag_probs = {}
    for tag in tag_list:
        tag_data = shape_data[shape_data['tag'] == tag]
        # alpha belongs in the numerator: (count + alpha) / (total + alpha * n_tags).
        tag_probs[tag] = (alpha + len(tag_data)) / smoothed_total
    # Key by the current shape value; the earlier code keyed by a stale `word`
    # variable left over from another cell, overwriting a single entry.
    shape_emission_probs[shape] = tag_probs

In [8]:
# ML section, takes in tag and returns feature prediction

# predictors, one hot encode for training
predictor = data_small['tag']
pred_final = pd.get_dummies(predictor)

# One-hot encode the validation tags with exactly the training columns.
# Encoding the validation slice on its own can yield fewer (or differently
# ordered) columns when a tag is absent from it, which would silently
# misalign the features passed to predict().
valid_final = pd.get_dummies(data_valid['tag']).reindex(columns=pred_final.columns, fill_value=0)

response_1 = data_small['word']
response_2 = data_small['pos']
response_3 = data_small['shape']


def _fit_and_report(title, train_response, valid_response):
    """Fit a random forest from one-hot tags to one feature column and print accuracies.

    title          -- header line printed before the scores
    train_response -- feature column of data_small used as the fitting target
    valid_response -- the same feature column taken from data_valid

    Returns (fitted classifier, training predictions, validation predictions).
    """
    model = RandomForestClassifier()
    model.fit(pred_final, train_response)
    train_pred = model.predict(pred_final)
    print(title)
    print("Train Acc: " + str(accuracy_score(train_pred, train_response)))
    valid_pred = model.predict(valid_final)
    print("Valid Acc: " + str(accuracy_score(valid_pred, valid_response)))
    return model, train_pred, valid_pred


# `pred` / `pred2` keep holding the most recent train / validation predictions.
classify, pred, pred2 = _fit_and_report("CLASSIFYING TAG --> POS", response_2, data_valid['pos'])
classify2, pred, pred2 = _fit_and_report("CLASSIFYING TAG --> WORD", response_1, data_valid['word'])
classify3, pred, pred2 = _fit_and_report("CLASSIFYING TAG --> SHAPE", response_3, data_valid['shape'])





CLASSIFYING TAG --> POS
Train Acc: 0.25735
Valid Acc: 0.2623
CLASSIFYING TAG --> WORD
Train Acc: 0.06225
Valid Acc: 0.0596
CLASSIFYING TAG --> SHAPE
Train Acc: 0.83445
Valid Acc: 0.8215


In [9]:
# Rebuild the one-hot encoding of the training tags and preview its first rows.
predictor = data_small.loc[:, 'tag']
pred_final = pd.get_dummies(data=predictor)
pred_final.head(5)

Unnamed: 0,B-art,B-eve,B-geo,B-gpe,B-nat,B-org,B-per,B-tim,I-art,I-eve,I-geo,I-gpe,I-nat,I-org,I-per,I-tim,O
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [10]:
# One row per entry of tag_list, one-hot encoded; preview the first rows.
tag_frame = pd.DataFrame(tag_list)
target = pd.get_dummies(data=tag_frame)
target.head(5)

Unnamed: 0,0_B-art,0_B-eve,0_B-geo,0_B-gpe,0_B-nat,0_B-org,0_B-per,0_B-tim,0_I-art,0_I-eve,0_I-geo,0_I-gpe,0_I-nat,0_I-org,0_I-per,0_I-tim,0_O
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [11]:
def features_from_tag(test_data):
    """Fit tag -> feature classifiers on data_small and tabulate per-tag feature probabilities.

    Three random forests are fitted on the one-hot encoded 'tag' column of
    data_small, one each for the 'word', 'pos' and 'shape' columns. Each model
    is then queried once per tag in tag_list with predict_proba.

    test_data -- currently unused; kept so existing calls keep working.

    Returns a 4-tuple:
      emission_probs -- dict: tag -> [word_probs, pos_probs, shape_probs], where
                        each element maps a class label to its predicted probability
      word classes, pos classes, shape classes -- the classes_ arrays of the three models
    """
    tag_onehot = pd.get_dummies(data_small['tag'])

    word_model = RandomForestClassifier()
    pos_model = RandomForestClassifier()
    shape_model = RandomForestClassifier()

    word_model.fit(tag_onehot, data_small['word'])
    pos_model.fit(tag_onehot, data_small['pos'])
    shape_model.fit(tag_onehot, data_small['shape'])

    # One query row per tag, in tag_list order, using exactly the columns the
    # models were fitted on (same names, same order). A tag that never occurs
    # in data_small has no column and therefore yields an all-zero row.
    target = pd.get_dummies(pd.Series(tag_list)).reindex(columns=tag_onehot.columns, fill_value=0)

    word_proba = word_model.predict_proba(target)
    pos_proba = pos_model.predict_proba(target)
    shape_proba = shape_model.predict_proba(target)

    emission_probs = {}
    for i, tag in enumerate(tag_list):
        word_preds = dict(zip(word_model.classes_, word_proba[i]))
        pos_preds = dict(zip(pos_model.classes_, pos_proba[i]))
        shape_preds = dict(zip(shape_model.classes_, shape_proba[i]))

        # Row i of `target` encodes tag_list[i], so that is the key to use.
        # Re-deriving the key from set(data_small['tag']) relied on two
        # independently built sets iterating in the same order.
        emission_probs[tag] = [word_preds, pos_preds, shape_preds]

    return emission_probs, word_model.classes_, pos_model.classes_, shape_model.classes_
    

# Fit the tag -> feature models and keep the per-tag probability tables in `f`,
# plus the word / POS / shape class labels known to the respective models.
# Note: features_from_tag does not read its argument; it fits on data_small.
f, final_word_list, final_pos_list, final_shape_list = features_from_tag(data_valid)
# Show the POS probability table predicted for the 'O' tag.
print(f['O'][1])    
    

{u'PRP$': 0.0089220369693593458, u'VBG': 0.021212983413126614, u'VBD': 0.043422368456786112, u'``': 0.003913410978814722, u'VBN': 0.036630834286552519, u',': 0.03353431476800374, u'VBP': 0.020815102977365812, u'WDT': 0.0032597265065999019, u'JJ': 0.068888722474133357, u'WP': 0.0024248124350451567, u'VBZ': 0.027569796022639249, u'DT': 0.11161884561362109, u'RP': 0.0026228080561461944, u'$': 0.0013434493480027013, u'NN': 0.15877304924469465, u'POS': 0.010930919799457352, u'.': 0.053138638223153532, u'TO': 0.025726521829419112, u'PRP': 0.018806379996655034, u'RB': 0.022685700421624953, u';': 9.3578874612571824e-05, u':': 0.0010166839485315367, u'NNS': 0.086957967825612298, u'NNP': 0.0016943207104910811, u'VB': 0.028416428758626794, u'WRB': 0.003569436748889797, u'RRB': 0.00061322433632206037, u'CC': 0.024268637460954673, u'PDT': 0.00017515726216841713, u'RBS': 0.00039158358535533889, u'RBR': 0.0012091936537193539, u'CD': 0.021400854735720447, u'NNPS': 7.0016489137145205e-05, u'EX': 0.0009

In [12]:
# try with knn
# doesn't seem to be better than random forest

#classify = KNeighborsClassifier(1000)
#classify.fit(pred_final, response_2)
#pred = classify.predict(pred_final)
#print("CLASSIFYING TAG --> POS")
#print("Train Acc: " + str(accuracy_score(pred, response_2)))
#pred2 = classify.predict(pd.get_dummies(data_valid['tag']))
#print("Valid Acc: " + str(accuracy_score(pred2, data_valid['pos'])))


In [13]:
def _greedy_tag_predictions(rows):
    """Return one predicted tag per row of `rows` (a DataFrame), in row order.

    Each tag in tag_list is scored as
        P(word|tag) * P(pos|tag) * P(shape|tag) * initial_tag_probs[tag]
        * transition_probs[tag][prev-iob]
    using the tables in the notebook-level `f`; the word factor is left out
    when the word is unknown to the word model. The previous tag is read from
    the row's 'prev-iob' column. The first tag in tag_list with the strictly
    highest score wins. Rows whose POS or shape is unknown to the models are
    labelled 'O'.

    Raises KeyError if a row's 'prev-iob' value has no entry in transition_probs.
    """
    # Every tag shares the same class labels, so the 'O' tables serve as the
    # vocabulary; dict membership avoids rebuilding a key list for every row.
    word_table = f['O'][0]
    pos_table = f['O'][1]
    shape_table = f['O'][2]

    predictions = []
    for _, row in rows.iterrows():
        word = row['word']
        pos = row['pos']
        shape = row['shape']
        prev_tag = row['prev-iob']

        best_tag = 'O'
        if pos in pos_table and shape in shape_table:
            word_known = word in word_table
            # All scores are >= 0, so the first tag always replaces this sentinel.
            best_prob = -1
            for tag in tag_list:
                # p(e|x)
                word_prob = f[tag][0][word] if word_known else 1.0
                emission = 1.0*word_prob*f[tag][1][pos]*f[tag][2][shape] * initial_tag_probs[tag]

                # transition model
                prob = emission * transition_probs[tag][prev_tag]

                if prob > best_prob:
                    best_prob = prob
                    best_tag = tag

        predictions.append(best_tag)
    return predictions


def viterbi_prediction():
    """Tag data_small and data_valid row by row and print accuracy and weighted F1 for each.

    Despite the name, no sequence decoding is performed: every row is scored
    independently, with the previous tag taken from its 'prev-iob' column.
    Reads the notebook-level f, tag_list, initial_tag_probs, transition_probs,
    data_small and data_valid. Returns None.
    """
    train_prediction = _greedy_tag_predictions(data_small)
    print("Training Accuracy: " + str(accuracy_score(train_prediction, data_small['tag'])))
    print("Training F1 Score: " + str(f1_score(data_small['tag'], train_prediction, labels=tag_list, average="weighted")))

    valid_prediction = _greedy_tag_predictions(data_valid)
    print("Validation Accuracy: " + str(accuracy_score(valid_prediction, data_valid['tag'])))
    print("Validation F1 Score: " + str(f1_score(data_valid['tag'], valid_prediction, labels=tag_list, average="weighted")))
    

In [14]:
# Run the tagger on data_small and data_valid and print accuracy / weighted F1 for both.
viterbi_prediction()

Training Accuracy: 0.9815
Training F1 Score: 0.981618788639
Validation Accuracy: 0.9334
Validation F1 Score: 0.934761335015


In [16]:
# List the distinct 'shape' values that occur in the validation slice.
print(list(set(data_valid['shape'])))

[u'mixedcase', u'lowercase', u'camelcase', u'uppercase', u'capitalized', u'number', u'abbreviation', u'punct', u'other', u'ending-dot', u'contains-hyphen']


In [None]:
# NOTE(review): bare name without parentheses - this evaluates to the function
# object and does not run the prediction. Use viterbi_prediction() if a re-run
# was intended, otherwise this cell can be removed.
viterbi_prediction