In [1]:
import seaborn as sns
# Apply seaborn's default plot styling for the rest of the notebook.
sns.set()

In [2]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; later cells call nlp(text) to
# turn raw strings into parsed documents.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline over a short sample text (spaCy wants a unicode string).
doc = nlp('Here is a text document. spaCy requires it to be a unicode string.')

# doc.sents yields the sentences one at a time; print each on its own line.
print(*doc.sents, sep='\n')

# Indexing the doc gives back the individual token at that position.
word_index = 3
print("Word 3:", doc[word_index])

Here is a text document.
spaCy requires it to be a unicode string.
Word 3: text


In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Spit out (slightly cleaned up) sentences from a Wikipedia article.
def wikipedia_to_sents(url):
    """
    Download a Wikipedia article and return its text as a list of sentences.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    list of str
        The sentences spaCy finds in the article's <p> paragraphs, with
        [n]-style reference markers removed.  Sentences of two tokens or
        fewer are discarded.

    Raises
    ------
    ValueError
        If the page contains no element with id 'mw-content-text'.
    """
    # Use the response as a context manager so the connection is closed as
    # soon as the page has been parsed.
    with urlopen(url) as response:
        content = BeautifulSoup(response, 'lxml').find(attrs={'id': 'mw-content-text'})
    if content is None:
        raise ValueError("No 'mw-content-text' element found at {}".format(url))

    # The text is littered by references like [n].  Drop them.
    def drop_refs(s):
        # Raw string: '\[' and '\d' are invalid escapes in an ordinary literal.
        return re.sub(r'\[\d+\]', '', s)

    paragraphs = [drop_refs(p.text) for p in content.find_all('p')]
    return [s.text for paragraph in paragraphs for s in nlp(paragraph).sents if len(s) > 2]

# One article per meaning of "apple": the fruit and the company.
FRUIT_URL = "http://en.wikipedia.org/wiki/Apple"
COMPANY_URL = "http://en.wikipedia.org/wiki/Apple_Inc."

fruit_sents = wikipedia_to_sents(FRUIT_URL)
company_sents = wikipedia_to_sents(COMPANY_URL)

In [5]:
import numpy as np
# fruit_sents is a flat list of strings, so its shape is a 1-tuple holding the
# number of sentences.
np.shape(fruit_sents)

(263,)

In [6]:
# Peek at the first few sentences only; displaying the whole list floods the
# notebook with hundreds of lines of output.
company_sents[:5]

['\nCoordinates: 37°20′06″N 122°00′32″W\ufeff / \ufeff37.3349°N 122.0090°W\ufeff / 37.3349; -122.0090\n',
 'Apple Inc. is an American multinational technology company that specializes in consumer electronics, software and online services headquartered in Cupertino, California, United States.',
 "Apple is the largest information technology company by revenue (totaling US$365.8 billion in 2021) and as of May 2022, it is the world's second most valuable company, the fourth-largest personal computer vendor by unit sales and second-largest mobile phone manufacturer.",
 'It is one of the Big Five American information technology companies, alongside Alphabet, Amazon, Meta, and Microsoft.',
 "Apple was founded as Apple Computer Company on April 1, 1976, by Steve Jobs, Steve Wozniak and Ronald Wayne to develop and sell Wozniak's Apple I personal computer.",
 "It was incorporated by Jobs and Wozniak as Apple Computer, Inc. in 1977 and the company's next computer, the Apple II became a best selle

In [7]:
# Five consecutive sentences from near the end of the company article
# (positions 105 through 101 counted from the end of the list).
company_sents[-105:-100]

['According to The New York Times, in the 1980s Apple was among the first tech companies to designate overseas salespeople in high-tax countries in a manner that allowed the company to sell on behalf of low-tax subsidiaries on other continents, sidestepping income taxes.',
 'In the late 1980s, Apple was a pioneer of an accounting technique known as the "Double Irish with a Dutch sandwich," which reduces taxes by routing profits through Irish subsidiaries and the Netherlands and then to the Caribbean.\n',
 'British Conservative Party Member of Parliament Charlie Elphicke published research on October 30, 2012, which showed that some multinational companies, including Apple Inc., were making billions of pounds of profit in the UK, but were paying an effective tax rate to the UK Treasury of only 3 percent, well below standard corporation tax.',
 'He followed this research by calling on the Chancellor of the Exchequer George Osborne to force these multinationals, which also included Google

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary from every sentence of both articles and count how often
# each vocabulary word occurs in each sentence.
corpus_sents = fruit_sents + company_sents
bag_of_words_vectorizer = CountVectorizer()
counts = bag_of_words_vectorizer.fit_transform(corpus_sents)

# (number of sentences, vocabulary size)
print(counts.shape)

(879, 4433)


In [9]:
# counts is a **sparse** matrix.  Converting it to a dense array shows what it
# actually looks like -- mostly zeros, but there are non-zero entries, really!
dense_counts = counts.toarray()
print(dense_counts)
print()
# Printing the sparse matrix itself just describes the non-zero entries.
print(counts)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

  (0, 381)	3
  (0, 432)	2
  (0, 2240)	1
  (0, 1411)	1
  (0, 1778)	1
  (0, 3136)	1
  (0, 721)	1
  (0, 4128)	1
  (0, 2498)	1
  (0, 1329)	1
  (1, 432)	1
  (1, 2498)	1
  (1, 4129)	1
  (1, 456)	2
  (1, 1135)	1
  (1, 4388)	1
  (1, 393)	1
  (1, 4022)	2
  (1, 2633)	1
  (1, 4349)	1
  (1, 1919)	1
  (1, 3770)	1
  (1, 2084)	1
  (1, 1835)	1
  (2, 2240)	1
  :	:
  (877, 3183)	1
  (877, 3360)	1
  (877, 331)	1
  (877, 2689)	1
  (877, 2548)	1
  (877, 1054)	1
  (877, 4287)	1
  (877, 3501)	1
  (877, 2395)	1
  (877, 1428)	1
  (878, 432)	2
  (878, 4022)	2
  (878, 2084)	1
  (878, 4074)	1
  (878, 1934)	1
  (878, 2116)	1
  (878, 1453)	1
  (878, 1077)	1
  (878, 1876)	1
  (878, 4385)	1
  (878, 4194)	1
  (878, 960)	1
  (878, 3501)	1
  (878, 4051)	1
  (878, 472)	1


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS

# Restrict the vocabulary to the 300 most frequent terms and weight by tf-idf.
ng_tfidf = TfidfVectorizer(max_features=300)
ng_tfidf.fit(fruit_sents + company_sents)
# get_feature_names() was removed in scikit-learn 1.2.  Its replacement returns
# an ndarray, so convert the slice to a plain list to keep the printed form.
print(ng_tfidf.get_feature_names_out()[100:105].tolist())
print(ng_tfidf.transform(fruit_sents + company_sents))

['fruit', 'generally', 'golden', 'government', 'grown']
  (0, 266)	0.3339572646676034
  (0, 207)	0.3768935052344948
  (0, 160)	0.3633460249864958
  (0, 132)	0.20851273528055003
  (0, 100)	0.2891776056930707
  (0, 55)	0.18924604915531656
  (0, 30)	0.21380435139195805
  (0, 25)	0.6370054010844136
  (1, 295)	0.34292864671804807
  (1, 267)	0.3348622418015332
  (1, 252)	0.18340022265472483
  (1, 173)	0.2998782027158974
  (1, 160)	0.3685896348723833
  (1, 117)	0.11768956671223807
  (1, 104)	0.37511697982221487
  (1, 72)	0.37511697982221487
  (1, 33)	0.4362241613194968
  (1, 30)	0.10844492906812103
  (1, 26)	0.11626767578403316
  (2, 288)	0.3753033016756095
  (2, 284)	0.36181298147681573
  (2, 266)	0.3325482193446823
  (2, 252)	0.09001417170234922
  (2, 242)	0.35596354076089687
  (2, 231)	0.36822031883518686
  :	:
  (875, 53)	0.2271975344657579
  (875, 15)	0.20427935337442263
  (875, 10)	0.21535569312904337
  (876, 252)	0.2903680579792504
  (876, 50)	0.956915038498904
  (877, 264)	0.109192471



In [11]:
# Extend spaCy's stop-word set with three extra fragments.
# NOTE(review): presumably these are the pieces left when contractions such as
# "'ll" / "'ve" are tokenized, plus a pronoun placeholder lemma -- confirm.
STOP_WORDS = STOP_WORDS | {'ll', 've', 'pron'}

In [12]:
# Membership of a few number words in the stop-word set, printed in order.
print(*(number_word in STOP_WORDS for number_word in ('six', 'seven', 'eight')))

True False True


In [13]:
# Count the 300 most frequent words, ignoring stop words and "apple" itself
# (it appears in both articles, so it cannot tell them apart).
# scikit-learn expects stop_words as a list; newer releases reject a set.
counter = CountVectorizer(max_features=300,
                          stop_words=sorted(STOP_WORDS.union({'apple'})))
counter.fit(fruit_sents + company_sents)
# get_feature_names() was removed in scikit-learn 1.2; .tolist() keeps the
# printed output a plain Python list.
print(counter.get_feature_names_out().tolist())

# Now we can use it with that vectorizer, like so...
counter.transform(fruit_sents + company_sents)

['000', '10', '100', '13', '14', '19', '1984', '1985', '1997', '20', '2001', '2006', '2007', '2008', '2010', '2011', '2012', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '30', '500', 'according', 'allowed', 'america', 'american', 'announced', 'app', 'apples', 'applications', 'apps', 'april', 'asia', 'audio', 'august', 'away', 'based', 'began', 'best', 'better', 'billion', 'board', 'brand', 'business', 'called', 'came', 'carbon', 'central', 'century', 'ceo', 'china', 'cider', 'city', 'climate', 'companies', 'company', 'computer', 'computers', 'conditions', 'consumer', 'consumers', 'continues', 'cook', 'cost', 'created', 'cultivar', 'cultivars', 'cultivated', 'data', 'davidson', 'day', 'december', 'design', 'designed', 'desktop', 'despite', 'developed', 'development', 'device', 'devices', 'different', 'digital', 'disease', 'early', 'efforts', 'electronics', 'employees', 'end', 'energy', 'enforcement', 'europe', 'european', 'features', 'following', 'food', 'form



<879x300 sparse matrix of type '<class 'numpy.int64'>'
	with 3951 stored elements in Compressed Sparse Row format>

In [14]:
# Same idea with bigrams only: ngram_range=(2, 2) makes every feature a pair of
# consecutive (non-stop) words.
# Text is lowercased before stop words are applied, so 'apple' alone covers
# 'Apple' too.  stop_words must be a list for newer scikit-learn releases.
ng_counter = CountVectorizer(max_features=300,
                             ngram_range=(2, 2),
                             stop_words=sorted(STOP_WORDS.union({'apple'})))
ng_counter.fit(fruit_sents + company_sents)

# get_feature_names() was removed in scikit-learn 1.2.  Fetch the vocabulary
# once and reuse it for both prints.
ng_feature_names = ng_counter.get_feature_names_out().tolist()
print(ng_feature_names)
print()
print(len(ng_feature_names))

['000 m2', '000 square', '000 time', '000 units', '100 000', '100 million', '100 renewable', '13 billion', '17th century', '1976 steve', '19th century', '2011 jobs', '2016 update', '2017 announced', '2019 update', '2020 announced', '2021 update', '2022 update', '21 2016', '27 2010', '30 2019', '3349 122', '37 3349', '500 known', '500 list', '65 billion', '86 million', 'according report', 'active use', 'allowed company', 'anti competitive', 'app store', 'apps app', 'apps earth', 'april 1976', 'april 2022', 'april 24', 'arctic apples', 'arsenic free', 'asia europe', 'athena aphrodite', 'atmosphere facilities', 'audio editor', 'audio player', 'august 2018', 'august 24', 'backlit lcd', 'board directors', 'boardroom coup', 'brand loyalty', 'broad line', 'carbon dioxide', 'cash reserves', 'central asia', 'ceo michael', 'ceo steve', 'ceo tim', 'chain management', 'chairman board', 'chief operating', 'child labor', 'chinese government', 'climate counts', 'commonly known', 'company history', 'c



In [15]:
# Lemmas for several inflected forms of the same verb.
for verb_forms in ('carry carries carrying carried', 'eat eating eaten ate'):
    print([token.lemma_ for token in nlp(verb_forms)])

# Lemmatize a few complete sentences and show the lemmas as a single string.
sample_text = ("The quick brown fox jumped over the lazy dog.  "
               "I can't believe it's not butter.  "
               "I tried to ford the river and my unfortunate oxen died.")
sample_lemmas = [token.lemma_ for token in nlp(sample_text)]
print(' '.join(sample_lemmas))

['carry', 'carry', 'carry', 'carry']
['eat', 'eat', 'eaten', 'eat']
the quick brown fox jump over the lazy dog .   I can not believe it be not butter .   I try to ford the river and my unfortunate oxen die .


In [16]:
def tokenize_lemma(text):
    """Return the lower-cased lemma of every spaCy token in ``text``, in order."""
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_.lower())
    return lemmas

# The vectorizer compares stop words against lemmatized tokens, so the stop
# words themselves are pushed through the same tokenizer first.
stop_words_lemma = set(tokenize_lemma(' '.join(STOP_WORDS)))

# scikit-learn expects stop_words as a list; newer releases reject a set.
ng_stem_tfidf = TfidfVectorizer(max_features=300,
                                stop_words=sorted(stop_words_lemma.union({'apple'})),
                                tokenizer=tokenize_lemma,
                                token_pattern=None        # Is ignored, since tokenizer is specified
                               )
ng_stem_tfidf = ng_stem_tfidf.fit(fruit_sents + company_sents)

# get_feature_names() was removed in scikit-learn 1.2; .tolist() keeps the
# vocabulary a plain Python list.
ng_stem_vocab = ng_stem_tfidf.get_feature_names_out().tolist()
print(ng_stem_vocab)

['\n', '"', '$', '%', "'s", '(', ')', ',', '-', '.', '1', '10', '100', '19', '1997', '2', '2006', '2007', '2008', '2010', '2011', '2012', '2014', '2015', '2016', '2017', '2019', '2020', '2021', '3', '30', '6', ':', ';', ']', 'accord', 'advertisement', 'allow', 'america', 'american', 'announce', 'app', 'application', 'april', 'attempt', 'audio', 'august', 'away', 'base', 'beat', 'begin', 'big', 'billion', 'board', 'brand', 'build', 'business', 'campaign', 'campus', 'carbon', 'cause', 'center', 'central', 'century', 'ceo', 'change', 'china', 'claim', 'climate', 'clone', 'color', 'come', 'commit', 'company', 'computer', 'condition', 'consumer', 'contain', 'continue', 'control', 'cook', 'corporation', 'cost', 'country', 'create', 'cultivar', 'cultivate', 'customer', 'datum', 'day', 'december', 'design', 'desktop', 'develop', 'development', 'device', 'different', 'disease', 'early', 'eat', 'effort', 'electronic', 'employee', 'end', 'energy', 'europe', 'european', 'executive', 'feature', 'fi



In [17]:
# Two sentences using "ford" in different roles: lower-case after "to" in s1,
# capitalized as part of a name in s2.
s1 = "I tried to ford the river, and my unfortunate oxen died."
s2 = "Henry Ford built factories to facilitate the construction of the Ford automobile."

In [18]:
# (text, coarse part of speech, fine-grained tag) for every token of s1.
[(token.text, token.pos_, token.tag_) for token in nlp(s1)]

[('I', 'PRON', 'PRP'),
 ('tried', 'VERB', 'VBD'),
 ('to', 'PART', 'TO'),
 ('ford', 'VERB', 'VB'),
 ('the', 'DET', 'DT'),
 ('river', 'NOUN', 'NN'),
 (',', 'PUNCT', ','),
 ('and', 'CCONJ', 'CC'),
 ('my', 'PRON', 'PRP$'),
 ('unfortunate', 'ADJ', 'JJ'),
 ('oxen', 'NOUN', 'NN'),
 ('died', 'VERB', 'VBD'),
 ('.', 'PUNCT', '.')]

In [19]:
# (text, coarse part of speech, fine-grained tag) for every token of s2.
[(token.text, token.pos_, token.tag_) for token in nlp(s2)]

[('Henry', 'PROPN', 'NNP'),
 ('Ford', 'PROPN', 'NNP'),
 ('built', 'VERB', 'VBD'),
 ('factories', 'NOUN', 'NNS'),
 ('to', 'PART', 'TO'),
 ('facilitate', 'VERB', 'VB'),
 ('the', 'DET', 'DT'),
 ('construction', 'NOUN', 'NN'),
 ('of', 'ADP', 'IN'),
 ('the', 'DET', 'DT'),
 ('Ford', 'PROPN', 'NNP'),
 ('automobile', 'NOUN', 'NN'),
 ('.', 'PUNCT', '.')]

In [20]:
def wikipedia_to_paragraphs(url):
    """
    Retrieves a URL from wikipedia, and returns a list of paragraphs
    (based on the 'p' html paragraph tag).

    A handful of known articles are read from saved copies under
    ``small_data/``.  Any other URL, or a known URL whose saved copy is
    absent, is downloaded instead.

    Parameters
    ----------
    url : str
        Address of the article.

    Returns
    -------
    list of str
        Text of every non-empty <p> element inside the article's
        'mw-content-text' element, with [n]-style reference markers removed.

    Raises
    ------
    ValueError
        If the page contains no element with id 'mw-content-text'.
    """
    files_by_url = {
      "http://en.wikipedia.org/wiki/Ford_(crossing)": "ford_crossing.txt",
      "http://en.wikipedia.org/wiki/Ford": "ford_car.txt",
      "http://en.wikipedia.org/wiki/Apple": "apple_fruit.txt",
      "http://en.wikipedia.org/wiki/Apple_Inc.": "apple_inc.txt"
    }

    # Look the file name up explicitly instead of wrapping the parsing in a
    # try/except KeyError, so only an unknown URL (not an unrelated KeyError)
    # selects the download path.
    html = None
    saved_name = files_by_url.get(url)
    if saved_name is not None:
        try:
            with open("small_data/{}".format(saved_name), encoding='utf-8') as wiki_file:
                html = wiki_file.read()
        except FileNotFoundError:
            # No saved copy on disk: fall back to downloading the article.
            html = None

    if html is None:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(url) as response:
            html = response.read()

    soup = BeautifulSoup(html, 'lxml').find(attrs={'id':'mw-content-text'})
    if soup is None:
        raise ValueError("No 'mw-content-text' element found at {}".format(url))

    # The text is littered by references like [n].  Drop them.
    def drop_refs(s):
        # Raw string: '\[' and '\d' are invalid escapes in an ordinary literal.
        return re.sub(r'\[\d+\]', '', s)

    paragraph_texts = (p.text for p in soup.find_all('p'))
    return [drop_refs(text) for text in paragraph_texts if text != '']

In [21]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class AdHocFeatures(BaseEstimator, TransformerMixin):
    """
    Given a keyword (e.g., "apple"), will transform documents into an
    encoding of several ad hoc features of each occurrence of the keyword:
        - If the keyword is capitalized
        - If it is plural
        - If it is possessive (in the stupid sense of being followed by 's)
        - If the keyword is a verb (e.g., for Ford vs ford)

    Each document becomes one row of four counts, in the order listed above.
    A token counts as an occurrence of the keyword when its lower-cased lemma
    equals the lower-cased lemma of the keyword.  The comparison ignores case
    because spaCy can return cased lemmas; a case-sensitive comparison would
    skip capitalized occurrences such as "Apple".

    Parameters
    ----------
    keyword : str
        The word whose occurrences are described.
    """

    # Number of columns produced by transform_doc().
    N_FEATURES = 4

    def __init__(self, keyword):
        # scikit-learn convention: keep constructor arguments unmodified so
        # that get_params() / clone() round-trip.  The lemma is derived lazily
        # in _keyword_lemma() instead.
        self.keyword = keyword

    def _keyword_lemma(self):
        """Return the lower-cased lemma of ``self.keyword`` (cached per keyword value)."""
        cached = getattr(self, '_lemma_cache', None)
        if cached is None or cached[0] != self.keyword:
            cached = (self.keyword, nlp(self.keyword)[0].lemma_.lower())
            self._lemma_cache = cached
        return cached[1]

    def _is_keyword(self, word):
        """True when the token's lemma matches the keyword's lemma, ignoring case."""
        return word.lemma_.lower() == self._keyword_lemma()

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Return an integer array with one row of N_FEATURES counts per document in X."""
        rows = [self.transform_doc(x) for x in X]
        # reshape keeps the result two-dimensional even when X is empty.
        return np.asarray(rows, dtype=int).reshape(-1, self.N_FEATURES)

    def feature_posessive(self, doc):
        """Count keyword occurrences in a parsed doc that are directly followed by "'s"."""
        ## N.B. spaCy will tokenize "Apple's" as ["Apple", "'s"]
        hits = [i for i, word in enumerate(doc) if self._is_keyword(word)]
        return sum((i + 1) < len(doc) and doc[i+1].text == "'s" for i in hits)

    def transform_doc(self, row):
        """Parse one document string and return its four feature counts as a list."""
        doc = nlp(row)
        words = [word for word in doc if self._is_keyword(word)]
        return [sum(word.is_title for word in words),                # capitalized
                sum(word.tag_ in ('NNS', 'NNPS') for word in words), # plural
                self.feature_posessive(doc),                         # possessive
                sum(word.pos_ == 'VERB' for word in words)]          # used as a verb

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion

def make_classifier(base_word, meaning1, meaning2):
    """
    Given
        - a base word (e.g., "apple", "ford") that can have ambiguous meaning
        - a pair meaning1 = (name1, url1) of a label for the first meaning, and a Wikipedia URL for it
        - a pair meaning2 = ... for the other meaning
    Returns a classifier that predicts the meaning

    The classifier is a fitted Pipeline: tf-idf features over lemmatized
    unigrams and bigrams are combined with the AdHocFeatures counts for the
    base word and fed to a multinomial naive Bayes model.  Training documents
    are the paragraphs of the two articles, labelled name1 / name2.
    """
    name1, url1 = meaning1
    name2, url2 = meaning2
    para1 = wikipedia_to_paragraphs(url1)
    para2 = wikipedia_to_paragraphs(url2)

    # Balance the classes: keep the same number of paragraphs from each article.
    minlen = min(len(para1), len(para2))
    para1, para2 = para1[:minlen], para2[:minlen]

    def tokenize_lemma(text):
        return [w.lemma_.lower() for w in nlp(text)]

    # Stop words must be in the same form as the tokens they are compared
    # against, so both they and the base word go through the tokenizer.
    stop_words_lemma = set(tokenize_lemma(' '.join(STOP_WORDS)))
    stop_words_lemma.update(tokenize_lemma(base_word))

    features = FeatureUnion([('stem_vectorizer',
                              TfidfVectorizer(ngram_range=(1,2),
                                              # scikit-learn expects a list here;
                                              # newer releases reject a set.
                                              stop_words=sorted(stop_words_lemma),
                                              tokenizer=tokenize_lemma,
                                              # Is ignored, since tokenizer is specified
                                              token_pattern=None)),
                             ('ad_hoc', AdHocFeatures(base_word))])
    pipe = Pipeline([('features', features),
                     ('classifier', MultinomialNB())])

    # Build the training data
    train_res  = [name1] * len(para1) + [name2] * len(para2)

    return pipe.fit(para1 + para2, train_res)

In [None]:
# Train a fruit-vs-company classifier for "apple" and try it on a few sentences.
base_word = "apple"
fruit_meaning = ("fruit", "http://en.wikipedia.org/wiki/Apple")
company_meaning = ("company", "http://en.wikipedia.org/wiki/Apple_Inc.")
options = [fruit_meaning, company_meaning]

test_sentences = [
    "I'm baking a pie with my granny smith apples.",
    "I looked up the recipe on my Apple iPhone.",
    "The apple pie recipe is on my desk.",
    "How is Apple's stock doing?",
    "I'm drinking apple juice.",
    "I have three apples.",
    "Steve Jobs is the CEO of apple.",
    "Steve Jobs likes to eat apples."
]

apple_classifier = make_classifier(base_word, options[0], options[1])
print(apple_classifier.predict(test_sentences))