# EECS 498 - Assignment 1 - SBD.py

### <span style="color:red">Major Task:</span> write a Python program SBD.py that detects sentence boundaries in text.

### <span style="color:red">Result:</span> built a program that predicts if a period (.) is the end of a sentence or not with 99.2% accuracy.

### 1) Read in & process data

In [16]:
import csv
import sys
import pandas as pd
import numpy as np
import sklearn

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Default corpus locations, used when no usable paths are supplied on the
# command line (for example inside a notebook kernel, where argv carries
# kernel options rather than data paths).
DEFAULT_TRAIN_PATH = 'SBD.train'
DEFAULT_TEST_PATH = 'SBD.test'

# Usage: SBD.py <train file> <test file>
# Only trust argv when it holds two positional values; option-style values
# (starting with '-') are not file names.
cli_args = sys.argv[1:]
if len(cli_args) >= 2 and not cli_args[0].startswith('-'):
    file1, file2 = cli_args[0], cli_args[1]
else:
    file1, file2 = DEFAULT_TRAIN_PATH, DEFAULT_TEST_PATH

# read_csv expects a path (or file object), not the file's text, so hand it
# the path directly. Each row is "<line number> <token> <label>" separated by
# whitespace. QUOTE_NONE keeps '"' tokens literal, and keep_default_na=False
# keeps tokens such as "NA" or "null" as strings instead of NaN.
trainData = pd.read_csv(file1, header=None, delimiter=r"\s+",
                        quoting=csv.QUOTE_NONE, keep_default_na=False)
testData = pd.read_csv(file2, header=None, delimiter=r"\s+",
                       quoting=csv.QUOTE_NONE, keep_default_na=False)


# name the test columns; keep the full frame (with line numbers) for output
testData.columns = ['line', 'word', 'label']
savedTest = testData
# drop() returns a new frame, so savedTest still has the 'line' column
testData = testData.drop('line', axis=1)

# name the train columns and drop the line numbers
trainData.columns = ['line', 'word', 'label']
trainData = trainData.drop('line', axis=1)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


### 2) Create Feature Vectors

In [17]:
## Extract the following features from the data for each instance of a '.' with an NEOS/EOS label:

## -------- core features -----------
## L - word to the left of the '.'
## R - word to the right of the '.'
## LLENGTH - length of L is less than 3 (bool)
## LCAP - L is capitalized (bool)
## RCAP - R is capitalized (bool)

## ----- my self-selected features ------
## LALLCAPS - L is all capital letters (bool)
## RQUOTE - R is a quotation (") mark (bool)
## RLENGTH - length of R is less than 5 (bool)

def get_features_and_labels(data):
    """Extract a feature dict and a label for every sentence-boundary candidate.

    A candidate is any row whose 'label' is 'EOS' or 'NEOS'; all other rows
    are only used as the right-hand neighbour of a candidate.

    Args:
        data: DataFrame with string columns 'word' and 'label'. Rows are
            processed in their positional order, so the frame's index values
            do not matter.

    Returns:
        A tuple (features_list, labels_list). features_list holds one dict
        per candidate with the keys LEFT, RIGHT, LLENGTH, LCAP, RCAP,
        LALLCAPS, RQUOTE and RLENGTH; labels_list holds the matching
        'EOS'/'NEOS' labels in the same order.
    """
    # Pull the columns into plain lists once. Positional access avoids the
    # slow per-cell .loc lookups and does not assume a default 0..n-1 index
    # when looking up the next row.
    words = data['word'].tolist()
    labels = data['label'].tolist()
    last_pos = len(words) - 1

    features_list = []
    labels_list = []

    for pos, (word, label) in enumerate(zip(words, labels)):
        if label not in ('EOS', 'NEOS'):
            continue

        # 5 core features
        left = word[:-1]  # remove trailing period
        # Track "has a right neighbour" separately from the 'LAST'
        # placeholder so a real token spelled "LAST" is still measured.
        has_right = pos < last_pos
        right = words[pos + 1] if has_right else 'LAST'

        features_list.append({
            'LEFT': left,
            'RIGHT': right,
            'LLENGTH': len(left) < 3,
            # [:1] yields '' for an empty word, and ''.isupper() is False
            'LCAP': left[:1].isupper(),
            'RCAP': has_right and right[:1].isupper(),
            # 3 self-chosen features
            'LALLCAPS': left.isupper(),
            'RQUOTE': right == '"',
            'RLENGTH': has_right and len(right) < 5,
        })
        labels_list.append(label)

    return features_list, labels_list

### 3) Implement Decision Tree Algorithm

In [18]:
def build_decision_tree():
    """Train an entropy-based decision tree and score it on the test data.

    Reads the module-level ``trainData`` and ``testData`` frames, turns them
    into feature dicts and labels with ``get_features_and_labels``, fits a
    ``DecisionTreeClassifier`` on the training candidates and predicts the
    test candidates. The test accuracy is printed as a percentage.

    Returns:
        The classifier's predictions for the test candidates, expressed in
        the integer codes assigned by the LabelEncoder.
    """
    # extract features and labels from train & test data
    X_train, Y_train = get_features_and_labels(trainData)
    X_test, Y_test = get_features_and_labels(testData)

    # encode features & labels
    dictVec = DictVectorizer()
    labelEn = LabelEncoder()
    # Fit the encoders on the training data only; the test data is merely
    # transformed so both sets share one feature space and label coding.
    X_train = dictVec.fit_transform(X_train)
    Y_train = labelEn.fit_transform(Y_train)
    X_test = dictVec.transform(X_test)
    Y_test = labelEn.transform(Y_test)

    entropy_clf = DecisionTreeClassifier(criterion="entropy")
    entropy_clf.fit(X_train, Y_train)
    Y_predict = entropy_clf.predict(X_test)

    # print accuracy; a single pre-formatted string prints the same text
    # under both Python 2 and Python 3
    print("Accuracy is  %s" % (accuracy_score(Y_test, Y_predict) * 100))

    # return predictions
    return Y_predict

In [19]:
# Train the tree, print the test accuracy, and keep the predicted
# (label-encoded) test labels for later use.
Y_predict = build_decision_tree()

Accuracy is  99.1875746714


In [15]:
#def generate_output():

    # insert in prediction labels to original test data
#    indexList = savedTest.index[(savedTest['label'] == 'NEOS') or (savedTest['label'] == 'EOS')].tolist()
#    count = 0
#    for index in enumerate(savedTest['label']):
#        if ((label == 'NEOS') or (label == 'EOS')):
#            savedTest[index] = Y_predict[count]
#            count += 1
#    
#    # output predictions into txt file
#    np.savetxt(r'SBD.test.out', savedTest.values, fmt='%d')