In [13]:
# ENGLISH_STOP_WORDS is imported from its public location in
# `sklearn.feature_extraction.text`; the private
# `sklearn.feature_extraction.stop_words` module was deprecated in
# scikit-learn 0.22 and removed in later releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import spacy
import string

# Characters treated as punctuation when filtering tokens.
punctuations = string.punctuation

# Load the small English pipeline by its full package name. The bare 'en'
# shortcut link only resolves on spaCy 2.x (spaCy 3 removed shortcut links),
# so it is kept purely as a fallback for older installs.
try:
    parser = spacy.load('en_core_web_sm')
except OSError:
    parser = spacy.load('en')


# Text-cleaning transformer used as the first pipeline step
# (it only applies clean_text; spaCy is not involved at this stage)
class Predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to every document."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(item)`` for each item of ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Report no parameters: always returns an empty dict."""
        return {}


# Normalise a raw string before it is tokenised
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [14]:
# Create spacy tokenizer that parses a sentence and generates tokens
# these can also be replaced by word vectors
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with spaCy and return the list of kept lemmas.

    Each token is replaced by its lower-cased, stripped lemma. Tokens whose
    lemma is the "-PRON-" placeholder (spaCy 2.x pronoun lemma) fall back to
    their lower-cased surface form. A lemma is dropped when it is empty, is
    listed in ``STOPWORDS``, or consists solely of characters from
    ``punctuations``.

    Parameters
    ----------
    sentence : str
        Raw text handed to the module-level ``parser``.

    Returns
    -------
    list of str
        The surviving lemmas, in document order.
    """
    kept = []
    for tok in parser(sentence):
        if tok.lemma_ == "-PRON-":
            lemma = tok.lower_
        else:
            lemma = tok.lemma_.lower().strip()

        # Whitespace-only tokens collapse to "" after strip(); drop them
        # explicitly rather than relying on "" being a substring of any string.
        if not lemma or lemma in STOPWORDS:
            continue

        # Test character by character: a plain `lemma in punctuations` is a
        # substring check against the punctuation string, so multi-character
        # tokens such as "..." or "--" would slip through as features.
        if all(ch in punctuations for ch in lemma):
            continue

        kept.append(lemma)
    return kept


# Create vectorizer object to generate feature vectors (unigram counts),
# using the custom spaCy tokenizer
vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
)

# LinearSVC's solver shuffles the samples internally; pin the seed so that
# re-running the notebook reproduces the same model.
classifier = LinearSVC(random_state=0)

In [15]:
# Assemble the three stages: clean -> tokenize/vectorize -> classify
pipe = Pipeline([
    ("cleaner", Predictors()),
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])

# Sample data: (sentence, label) pairs
train = [
    ('I was born in the New York but I am now residing in Singapore.', 'pos'),
    ('I am a U.S. resident.', 'pos'),
    ('I am a U.S. green card holder. ', 'pos'),
    ('We stay in Texas most of the time.', 'pos'),
    ('My dad is a U.S. citizen.', 'pos'),
    ('I was not born in the U.S.', 'neg'),
    ('My Singapore number is 97372309', 'neg'),
    ("I have not been to New York", 'neg'),
    ('My dad is not a U.S. citizen', 'neg'),
    ('My hold mail address is in Hongkong', 'neg'),
]
test = [
    ('I live in New York', 'pos'),
    ('I was born in Singapore', 'neg'),
    ("I don't have a U.S. account", 'neg'),
    ("We reside in the U.S.", 'pos'),
    ('My wife was born in New York but now lives in Singapore.', 'pos'),
    ("My dad is not a U.S. citizen", 'neg'),
]

# Split each set into parallel lists of sentences and labels
train_texts = [text for text, _ in train]
train_labels = [label for _, label in train]
test_texts = [text for text, _ in test]
test_labels = [label for _, label in test]

# Fit on the training sentences, then predict the held-out ones
pipe.fit(train_texts, train_labels)
pred_data = pipe.predict(test_texts)

# Show every test pair next to the label the model assigned
for sample, pred in zip(test, pred_data):
    print(sample, pred)

print("Accuracy:", accuracy_score(test_labels, pred_data))

('I live in New York', 'pos') neg
('I was born in Singapore', 'neg') neg
("I don't have a U.S. account", 'neg') neg
('We reside in the U.S.', 'pos') pos
('My wife was born in New York but now lives in Singapore.', 'pos') neg
('My dad is not a U.S. citizen', 'neg') neg
Accuracy: 0.6666666666666666
