# Feature Engineering and Syntactic Similarity

## Blueprint: Building your own Vectorizer

In [40]:
# Toy corpus: four short sentences (note the capitalised "It" in the first one).
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]
# Whitespace tokenization only; the case of each token is left untouched.
tokenized_sentences = [sentence.split() for sentence in sentences]
# Every distinct token across all sentences; being a set, its order is arbitrary.
vocabulary = {token for tokens in tokenized_sentences for token in tokens}

In [9]:
import pandas as pd

# Number the vocabulary entries in the set's (arbitrary) iteration order.
vocabulary_rows = [[word, position] for position, word in enumerate(vocabulary)]
pd.DataFrame(vocabulary_rows)

Unnamed: 0,0,1
0,worst,0
1,times,1
2,foolishness,2
3,the,3
4,was,4
5,best,5
6,of,6
7,wisdom,7
8,age,8
9,It,9


## Vectorizing Documents

In [12]:
def onehot_encode(tokenized_sentence, vocab=None):
    """One-hot encode a tokenized sentence against a vocabulary.

    Parameters
    ----------
    tokenized_sentence : iterable of str
        Tokens of the document to encode.
    vocab : iterable of str, optional
        Words that define the vector positions, in iteration order.
        Defaults to the notebook-level ``vocabulary``.

    Returns
    -------
    list of int
        One entry per vocabulary word: 1 if the word occurs among the
        tokens, otherwise 0. Tokens outside the vocabulary are ignored.
    """
    if vocab is None:
        vocab = vocabulary
    # Materialise the tokens once: membership tests become constant-time and
    # one-shot iterables (generators) are not exhausted by the first lookup.
    present = set(tokenized_sentence)
    return [1 if w in present else 0 for w in vocab]


# Encode every tokenized sentence against the shared vocabulary.
onehot = list(map(onehot_encode, tokenized_sentences))

# Show each vector next to the sentence it encodes.
for oh, sentence in zip(onehot, sentences):
    print(f"{oh}: {sentence}")

[0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0]: It was the best of times
[1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1]: it was the worst of times
[0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1]: it was the age of wisdom
[0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1]: it was the age of foolishness


**Out-of-vocabulary documents**

In [14]:
# "is" is not part of the vocabulary, so it has no position in the vector.
partly_known_tokens = "the age of wisdom is the best of times".split()
onehot_encode(partly_known_tokens)

[0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0]

In [15]:
# None of these tokens occur in the vocabulary, so every position stays 0.
unseen_tokens = "John likes to watch movies. Mary likes movies too.".split()
onehot_encode(unseen_tokens)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

### The Document–Term Matrix

In [18]:
# Document-term matrix: one row per sentence, one column per vocabulary word.
# The unmodified set iterates in the same order onehot_encode used, so the
# column labels line up with the vector positions.
column_labels = list(vocabulary)
pd.DataFrame(onehot, columns=column_labels)

Unnamed: 0,worst,times,foolishness,the,was,best,of,wisdom,age,It,it
0,0,1,0,1,1,1,1,0,0,1,0
1,1,1,0,1,1,0,1,0,0,0,1
2,0,0,0,1,1,0,1,1,1,0,1
3,0,0,1,1,1,0,1,0,1,0,1


**Calculating similarities**

Calculate similarity between the first and second sentences.

In [21]:
# Element-wise AND keeps a 1 only where both sentences contain the word;
# summing the result counts the shared words.
sim = [first & second for first, second in zip(onehot[0], onehot[1])]
sum(sim)

4

In [23]:
import numpy as np

# The dot product of two binary vectors counts the positions where both are 1.
first_vec, second_vec = np.asarray(onehot[0]), np.asarray(onehot[1])
first_vec @ second_vec

4

### The Similarity Matrix

In [26]:
# Pairwise dot products between all sentence vectors (documents x documents).
onehot_matrix = np.asarray(onehot)
onehot_matrix @ onehot_matrix.T

array([[6, 4, 3, 3],
       [4, 6, 4, 4],
       [3, 4, 6, 5],
       [3, 4, 5, 6]])

**One-Hot Encoding with scikit-learn**

In [42]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit on a single "sample" that holds the whole vocabulary, so that every
# word becomes one class; fit() returns the fitted binarizer.
lb = MultiLabelBinarizer().fit([vocabulary])
lb.transform(tokenized_sentences)

array([[1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0]])

In [45]:
# Label the binarized columns with the classes learned during fitting.
binarized = lb.transform(tokenized_sentences)
pd.DataFrame(binarized, columns=lb.classes_)

Unnamed: 0,It,age,best,foolishness,it,of,the,times,was,wisdom,worst
0,1,0,1,0,0,1,1,1,1,0,0
1,0,0,0,0,1,1,1,1,1,0,1
2,0,1,0,0,1,1,1,0,1,1,0
3,0,1,0,1,1,1,1,0,1,0,0


## Bag-of-Words Models


### Blueprint: Using scikit-learn's CountVectorizer

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [47]:
# Extend the toy corpus with two further sentences (a new list is built;
# `sentences` itself is not modified).
more_sentences = [
    *sentences,
    "John likes to watch movies. Mary like movies too.",
    "Mary also likes to watch football games.",
]

**Fitting the vocabulary**

In [48]:
# Learn the vocabulary from all six sentences; the fitted vectorizer is the
# cell's displayed value.
cv.fit(more_sentences)

CountVectorizer()

In [49]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
# .tolist() turns the returned array into a plain list of Python strings.
feature_names = (
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
feature_names

['age',
 'also',
 'best',
 'foolishness',
 'football',
 'games',
 'it',
 'john',
 'like',
 'likes',
 'mary',
 'movies',
 'of',
 'the',
 'times',
 'to',
 'too',
 'was',
 'watch',
 'wisdom',
 'worst']

**Transforming the documents to vectors**

In [50]:
# Turn the six sentences into a sparse document-term matrix of token counts
# using the vocabulary learned by `cv`.
dt = cv.transform(more_sentences)
dt

<6x21 sparse matrix of type '<class 'numpy.int64'>'
	with 39 stored elements in Compressed Sparse Row format>

In [51]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# Densify the small count matrix so it can be shown with one column per word.
pd.DataFrame(dt.toarray(), columns=feature_names)

Unnamed: 0,age,also,best,foolishness,football,games,it,john,like,likes,...,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,...,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,...,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,1,1,...,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


### Blueprint: Calculating Similarities

In [53]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of the first two sentence vectors (rows 0 and 1 of `dt`).
first_row, second_row = dt[0], dt[1]
cosine_similarity(first_row, second_row)

array([[0.83333333]])

In [54]:
# Full pairwise similarity matrix: every sentence compared with every other.
pairwise_similarity = cosine_similarity(dt, dt)
pd.DataFrame(pairwise_similarity)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.833333,0.666667,0.666667,0.0,0.0
1,0.833333,1.0,0.666667,0.666667,0.0,0.0
2,0.666667,0.666667,1.0,0.833333,0.0,0.0
3,0.666667,0.666667,0.833333,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.455842
5,0.0,0.0,0.0,0.0,0.455842,1.0


## TF-IDF Models

### Optimized Document Vectors with TfidfTransformer

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts in `dt` with TF-IDF.
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)
# The column labels still come from the CountVectorizer that produced `dt`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
pd.DataFrame(tfidf_dt.toarray(), columns=feature_names)

Unnamed: 0,age,also,best,foolishness,football,games,it,john,like,likes,...,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,...,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321251,0.321251,0.263431,...,0.642503,0.0,0.0,0.0,0.263431,0.321251,0.0,0.263431,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.0,0.343777,...,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [57]:
# Pairwise cosine similarity computed on the TF-IDF weighted vectors.
tfidf_similarity = cosine_similarity(tfidf_dt, tfidf_dt)
pd.DataFrame(tfidf_similarity)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.362246
5,0.0,0.0,0.0,0.0,0.362246,1.0


## Introducing the ABC Dataset

In [61]:
# Load the ABC news headlines and parse the publication date column as dates.
headlines = pd.read_csv(
    "data/abcnews-date-text.csv.gz",
    parse_dates=["publish_date"],
)
# Row count first, then a preview of the first rows.
print(headlines.shape[0])
headlines.head()

1103663


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize all headlines with TF-IDF in one step.
# NOTE: this rebinds `tfidf` and `dt`, which earlier cells used for the toy corpus.
headline_texts = headlines["headline_text"]
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headline_texts)
dt

<1103663x95878 sparse matrix of type '<class 'numpy.float64'>'
	with 7001357 stored elements in Compressed Sparse Row format>

In [63]:
%%time
# Compare only the first 10,000 headlines with each other; the result is a
# dense 10,000 x 10,000 array.
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: user 380 ms, sys: 296 ms, total: 676 ms
Wall time: 923 ms


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16913596,
        0.16792138],
       [0.        , 0.        , 0.        , ..., 0.16913596, 1.        ,
        0.33258708],
       [0.        , 0.        , 0.        , ..., 0.16792138, 0.33258708,
        1.        ]])

In [66]:
%%time
from sklearn.metrics.pairwise import linear_kernel

# Plain dot products on the same 10,000-headline slice. The printed values are
# identical to the cosine similarities of the previous cell, presumably because
# the TF-IDF rows are already length-normalised (TODO confirm vectorizer norm).
linear_kernel(dt[0:10000], dt[0:10000])

CPU times: user 366 ms, sys: 319 ms, total: 685 ms
Wall time: 1.06 s


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16913596,
        0.16792138],
       [0.        , 0.        , 0.        , ..., 0.16913596, 1.        ,
        0.33258708],
       [0.        , 0.        , 0.        , ..., 0.16792138, 0.33258708,
        1.        ]])

### Blueprint: Removing Feature Dimensions

**Removing Stop Words**

In [67]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

# Number of entries in spaCy's English stop word collection.
print(len(stopwords))

326


In [68]:
# spaCy's STOP_WORDS is a set, but scikit-learn (>= 1.2) validates stop_words
# as 'english', a list, or None and rejects a set. Passing a sorted list works
# on old and new releases alike and keeps the argument deterministic.
tfidf = TfidfVectorizer(stop_words=sorted(stopwords))
dt = tfidf.fit_transform(headlines["headline_text"])
dt



<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

**Minimum Frequency**

In [71]:
# With an integer min_df, a word must occur in at least that many documents
# (here: 2) to be kept as a feature.
# stop_words is passed as a list: spaCy's STOP_WORDS is a set, which
# scikit-learn (>= 1.2) rejects during parameter validation.
tfidf = TfidfVectorizer(stop_words=sorted(stopwords), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

In [72]:
# With a float min_df, a word has to occur in at least that fraction of all
# documents (0.0001 of the headlines here) to be kept as a feature.
# stop_words is passed as a list: spaCy's STOP_WORDS is a set, which
# scikit-learn (>= 1.2) rejects during parameter validation.
tfidf = TfidfVectorizer(stop_words=sorted(stopwords), min_df=0.0001)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x6772 sparse matrix of type '<class 'numpy.float64'>'
	with 4816381 stored elements in Compressed Sparse Row format>

**Maximum Frequency**

In [73]:
# Ignore all words that appear in more than 10% of the headlines.
# stop_words is passed as a list: spaCy's STOP_WORDS is a set, which
# scikit-learn (>= 1.2) rejects during parameter validation.
tfidf = TfidfVectorizer(stop_words=sorted(stopwords), max_df=0.1)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>