<a href="https://colab.research.google.com/github/alextanhongpin/blueprints-for-text-analytics-python/blob/master/05_feature_engineering_and_syntatic_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering and Syntatic Similarity

## Blueprint: Building your own Vectorizer

In [49]:
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]
tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]
vocabulary = set([w for s in tokenized_sentences for w in s])

In [50]:
import pandas as pd

pd.DataFrame([[w, i] for i, w in enumerate(vocabulary)])

Unnamed: 0,0,1
0,of,0
1,wisdom,1
2,was,2
3,it,3
4,times,4
5,foolishness,5
6,best,6
7,the,7
8,age,8
9,worst,9


## Vectorizing Documents

In [51]:
def onehot_encode(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]


onehot = [
    onehot_encode(tokenized_sentence) for tokenized_sentence in tokenized_sentences
]

for sentence, oh in zip(sentences, onehot):
    print(f"{oh}: {sentence}")

[1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1]: It was the best of times
[1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0]: it was the worst of times
[1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0]: it was the age of wisdom
[1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0]: it was the age of foolishness


**Out-of-vocabulary documents**

In [52]:
onehot_encode("the age of wisdom is the best of times".split())

[1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0]

In [53]:
onehot_encode("John likes to watch movies. Mary likes movies too.".split())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

### The Document–Term Matrix

In [54]:
pd.DataFrame(onehot, columns=vocabulary)

Unnamed: 0,of,wisdom,was,it,times,foolishness,best,the,age,worst,It
0,1,0,1,0,1,0,1,1,0,0,1
1,1,0,1,1,1,0,0,1,0,1,0
2,1,1,1,1,0,0,0,1,1,0,0
3,1,0,1,1,0,1,0,1,1,0,0


**Calculating similarities**

Calculate similarity between the first and second sentences.

In [55]:
sim = [onehot[0][i] & onehot[1][i] for i in range(0, len(vocabulary))]
sum(sim)

4

In [56]:
import numpy as np

np.dot(onehot[0], onehot[1])

4

### The Similarity Matrix

In [57]:
np.dot(onehot, np.transpose(onehot))

array([[6, 4, 3, 3],
       [4, 6, 4, 4],
       [3, 4, 6, 5],
       [3, 4, 5, 6]])

**One-Hot Encoding with scikit-learn**

In [58]:
from sklearn.preprocessing import MultiLabelBinarizer

lb = MultiLabelBinarizer()
lb.fit([vocabulary])
lb.transform(tokenized_sentences)

array([[1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0]])

In [59]:
pd.DataFrame(lb.transform(tokenized_sentences), columns=lb.classes_)

Unnamed: 0,It,age,best,foolishness,it,of,the,times,was,wisdom,worst
0,1,0,1,0,0,1,1,1,1,0,0
1,0,0,0,0,1,1,1,1,1,0,1
2,0,1,0,0,1,1,1,0,1,1,0
3,0,1,0,1,1,1,1,0,1,0,0


## Bag-of-Words Models


### Blueprint: Using scikit-learn's CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

In [None]:
more_sentences = sentences + [
    "John likes to watch movies. Mary like movies too.",
    "Mary also likes to watch football games.",
]

**Fitting the vocabulary**

In [None]:
cv.fit(more_sentences)

CountVectorizer()

In [None]:
cv.get_feature_names()

['age',
 'also',
 'best',
 'foolishness',
 'football',
 'games',
 'it',
 'john',
 'like',
 'likes',
 'mary',
 'movies',
 'of',
 'the',
 'times',
 'to',
 'too',
 'was',
 'watch',
 'wisdom',
 'worst']

**Transforming the documents to vectors**

In [None]:
dt = cv.transform(more_sentences)
dt

<6x21 sparse matrix of type '<class 'numpy.int64'>'
	with 39 stored elements in Compressed Sparse Row format>

In [None]:
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,like,likes,...,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,...,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,...,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,1,1,...,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


### Blueprint: Calculating Similarities

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Check the similarity between the first two sentences.
cosine_similarity(dt[0], dt[1])

array([[0.]])

In [None]:
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.833333,0.666667,0.666667,0.0,0.0
1,0.833333,1.0,0.666667,0.666667,0.0,0.0
2,0.666667,0.666667,1.0,0.833333,0.0,0.0
3,0.666667,0.666667,0.833333,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.455842
5,0.0,0.0,0.0,0.0,0.455842,1.0


## TF-IDF Models

### Optimized Document Vectors with TfidfTransformer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,like,likes,...,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321251,0.321251,0.263431,...,0.642503,0.0,0.0,0.0,0.263431,0.321251,0.0,0.263431,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.0,0.343777,...,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [None]:
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.362246
5,0.0,0.0,0.0,0.0,0.362246,1.0


## Introducing the ABC Dataset

In [3]:
!mkdir data
!ls data
!curl -L https://github.com/alextanhongpin/blueprints-for-text-analytics-python/blob/master/data/abcnews-date-text.csv.gz?raw=true -o data/abcnews-date-text.csv.gz
import pandas as pd 

headlines = pd.read_csv("data/abcnews-date-text.csv.gz", parse_dates=["publish_date"])
print(len(headlines))
headlines.head()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   177  100   177    0     0    811      0 --:--:-- --:--:-- --:--:--   811
100   196  100   196    0     0    531      0 --:--:-- --:--:-- --:--:--   531
100 18.3M  100 18.3M    0     0  10.4M      0  0:00:01  0:00:01 --:--:-- 49.8M
1103663


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95878 sparse matrix of type '<class 'numpy.float64'>'
	with 7001357 stored elements in Compressed Sparse Row format>

In [60]:
%%time
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: user 183 ms, sys: 125 ms, total: 308 ms
Wall time: 308 ms


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.05313723,
        0.05382327],
       [0.        , 0.        , 0.        , ..., 0.05313723, 1.        ,
        0.17069258],
       [0.        , 0.        , 0.        , ..., 0.05382327, 0.17069258,
        1.        ]])

In [6]:
%%time
from sklearn.metrics.pairwise import linear_kernel

linear_kernel(dt[0:10000], dt[0:10000])

CPU times: user 291 ms, sys: 360 ms, total: 650 ms
Wall time: 676 ms


### Blueprint: Removing Feature Dimensions

**Removing Stop Words**

In [7]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

print(len(stopwords))

326


In [8]:
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

  % sorted(inconsistent)


<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

**Minimum Frequency**

In [9]:
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

  % sorted(inconsistent)


<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

In [10]:
# When using float for min_df, the word has to occur in a minimum fraction of documents.
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=0.0001)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

  % sorted(inconsistent)


<1103663x6772 sparse matrix of type '<class 'numpy.float64'>'
	with 4816381 stored elements in Compressed Sparse Row format>

**Maximum Frequency**

In [11]:
# Eliminate all the words that appears in at least 10% of the headlines
tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.1)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

  % sorted(inconsistent)


<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

### Blueprints: Improving features by making them more specific

**Performing linguistic analysis**

In [12]:
# Google Colab is using the old version. Install and restart runtime.
# You need to enable the GPU runtime too.
#!pip install -U spacy

import spacy

spacy.prefer_gpu()

True

In [24]:
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")
nouns_adjective_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]


lemmas = []
navs = []
for doc in tqdm(nlp.pipe(headlines['headline_text'].values), total=len(headlines)):
  lemmas.append(" ".join([token.lemma_ for token in doc]))
  navs.append(" ".join([token.lemma_ for token in doc if token.pos_ in nouns_adjective_verbs]))
headlines['lemmas'] = lemmas
headlines['nav'] = navs

# The approach below is too slow.
# with tqdm(total=len(headlines)) as pbar:
#     for i, row in headlines.iterrows():
#         doc = nlp(str(row["headline_text"]))
#         headlines.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
#         headlines.at[i, "nav"] = " ".join(
#             [token.lemma_ for token in doc if token.pos_ in nouns_adjective_verbs]
#         )
#         pbar.update(1)

100%|██████████| 1103663/1103663 [14:04<00:00, 1307.07it/s]


**Blueprint: Using lemmas instead of Words for Vectorizing Documents**

In [28]:
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["lemmas"].map(str))
dt

  % sorted(inconsistent)


<1103663x86434 sparse matrix of type '<class 'numpy.float64'>'
	with 5578953 stored elements in Compressed Sparse Row format>

**Blueprint: Limit Word Types**

In [29]:
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

  % sorted(inconsistent)


<1103663x83491 sparse matrix of type '<class 'numpy.float64'>'
	with 5451900 stored elements in Compressed Sparse Row format>

**Blueprint: Remove Most Common Words**

In [30]:
top_10000 = pd.read_csv(
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt",
    header=None,
)
tfidf = TfidfVectorizer(stop_words=set(top_10000.iloc[:, 0].values))
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

<1103663x75455 sparse matrix of type '<class 'numpy.float64'>'
	with 1377264 stored elements in Compressed Sparse Row format>

**Blueprint: Adding Context via N-Grams**

In [31]:
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 2), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)

tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 3), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)

  % sorted(inconsistent)


(1103663, 559961)
67325400


  % sorted(inconsistent)


(1103663, 747988)
72360104


## Syntatic Similarity in the ABC Dataset

In [35]:
# There are "test" headlines in the corpus.
stopwords.add('test')
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 2), min_df=2, norm='l2')
dt = tfidf.fit_transform(headlines['headline_text'])
dt

  % sorted(inconsistent)


<1103663x559346 sparse matrix of type '<class 'numpy.float64'>'
	with 8405225 stored elements in Compressed Sparse Row format>

### Blueprint: Finding Most Similar Headlines to a Made-up Headline

In [36]:
made_up = tfidf.transform(['australia and new zealand discuss optimal apple size'])

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(made_up, dt)

In [39]:
import numpy as np

headlines.iloc[np.argmax(sim)]

publish_date     2015-06-04 00:00:00
headline_text       new zealand wool
lemmas              new zealand wool
nav                 new zealand wool
Name: 957797, dtype: object

### Blueprint: Finding the two most similar documents in a large corpus (Much more difficult)

In [41]:
%%time
np.dot(dt[0:10000], np.transpose(dt[0:10000]))

CPU times: user 30.6 ms, sys: 997 µs, total: 31.6 ms
Wall time: 32.9 ms


<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1817473 stored elements in Compressed Sparse Row format>

In [42]:
%%time

batch = 10000
max_sim = 0.0


max_a = None
max_b = None

for a in range(0, dt.shape[0], batch):
  for b in range(0, a+batch, batch):
    r = np.dot(dt[a:a+batch], np.transpose(dt[b:b+batch]))
    # Eliminate identical vectors by setting their similarity to np.nan which gets sorted out.
    r[r>0.9999] = np.nan
    sim = r.max()
    if sim > max_sim:
      # argmax returns a single value which we have to map to the two dimensions.
      (max_a, max_b) = np.unravel_index(np.argmax(r), r.shape)

      # Adjust offsets in corpus (this is a submatrix).
      max_a += a
      max_b += b
      max_sim = sim

CPU times: user 7min 16s, sys: 3.15 s, total: 7min 19s
Wall time: 7min 40s


In [43]:
print(headlines.iloc[max_a])
print(headlines.iloc[max_b])

publish_date                                2014-09-18 00:00:00
headline_text    vline fails to meet punctuality targets report
lemmas             vline fail to meet punctuality target report
nav                   vline fail meet punctuality target report
Name: 904965, dtype: object
publish_date                         2008-02-15 00:00:00
headline_text    vline fails to meet punctuality targets
lemmas             vline fail to meet punctuality target
nav                   vline fail meet punctuality target
Name: 364042, dtype: object


### Blueprint: Finding Related Words

In [45]:
tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=1000)
dt_word = tfidf_word.fit_transform(headlines['headline_text'])
dt_word

  % sorted(inconsistent)


<1103663x1132 sparse matrix of type '<class 'numpy.float64'>'
	with 2980495 stored elements in Compressed Sparse Row format>

In [46]:
r = cosine_similarity(dt_word.T, dt_word.T)
np.fill_diagonal(r, 0)

In [48]:
voc = tfidf_word.get_feature_names_out()
size = r.shape[0] # Quadratic.
for index in np.argsort(r.flatten())[::-1][0:40]:
  a = int(index/size)
  b = index%size
  if a > b: # Avoid repetitions.
    print('"%s" related to "%s"' % (voc[a], voc[b]))

"sri" related to "lanka"
"hour" related to "country"
"seekers" related to "asylum"
"springs" related to "alice"
"pleads" related to "guilty"
"hill" related to "broken"
"trump" related to "donald"
"violence" related to "domestic"
"climate" related to "change"
"driving" related to "drink"
"care" related to "aged"
"gold" related to "coast"
"royal" related to "commission"
"mental" related to "health"
"wind" related to "farm"
"flu" related to "bird"
"murray" related to "darling"
"world" related to "cup"
"north" related to "korea"
"hour" related to "2014"
