### Feature Extraction and Preprocessing

In [9]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

** DictVectorizer **

In [None]:
onehot_encoder = DictVectorizer()
instances = [
    {'city': 'New York'},
    {'city': 'San Francisco'},
    {'city': 'Chapel Hill'} ]

print (onehot_encoder.fit_transform(instances).toarray())

** CountVectorizer **

In [7]:
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game'
]


vectorizer = CountVectorizer()
print (vectorizer.fit_transform(corpus).todense())
print (vectorizer.vocabulary_)


[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'played': 5, 'the': 6, 'duke': 1, 'in': 3, 'lost': 4, 'unc': 7, 'basketball': 0, 'game': 2}


In [11]:
# adding one more sentence in corpus

corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'This is Atul Singh'
]

vectorizer = CountVectorizer()
print (vectorizer.fit_transform(corpus).todense())
print (vectorizer.vocabulary_)

[[0 1 1 0 1 0 0 1 0 0 0 1]
 [0 1 1 1 0 0 1 0 0 1 0 0]
 [1 0 0 0 0 1 0 0 1 0 1 0]]
{'played': 7, 'lost': 6, 'the': 9, 'duke': 2, 'unc': 11, 'this': 10, 'singh': 8, 'is': 5, 'atul': 0, 'in': 4, 'basketball': 1, 'game': 3}


In [13]:
# checking the euclidean distance 

# converting sentence into CountVectorizer
counts = vectorizer.fit_transform(corpus).todense()

print("1 & 2", euclidean_distances(counts[0], counts[1]))
print("2 & 3", euclidean_distances(counts[1], counts[2]))
print("1 & 3", euclidean_distances(counts[0], counts[2]))

1 & 2 [[ 2.44948974]]
2 & 3 [[ 3.]]
1 & 3 [[ 3.]]


** Stop Word Filtering **

In [16]:
vectorizer = CountVectorizer(stop_words='english')  # added one option which remove the grammer words from corpus
print (vectorizer.fit_transform(corpus).todense())
print (vectorizer.vocabulary_)

print("1 & 2", euclidean_distances(counts[0], counts[1]))
print("2 & 3", euclidean_distances(counts[1], counts[2]))
print("1 & 3", euclidean_distances(counts[0], counts[2]))

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'played': 5, 'duke': 2, 'singh': 6, 'lost': 4, 'game': 3, 'unc': 7, 'basketball': 1, 'atul': 0}
1 & 2 [[ 2.44948974]]
2 & 3 [[ 3.]]
1 & 3 [[ 3.]]


** Stemming and Lemmatization **  

**Lemmatization** is the process of determining the lemma, or the morphological root, of an inflected word based on its context. Lemmas are the base forms of words that are used to key the word in a dictionary.

**Stemming** has a similar goal to lemmatization, but it does not attempt to produce the morphological roots of words. Instead, stemming removes all patterns of characters that appear to be affixes, resulting in a token that is not necessarily a valid word.

Lemmatization frequently requires a lexical resource, like WordNet, and the word's part of speech. Stemming 
algorithms frequently use rules instead of lexical resources to produce stems and can 
operate on any token, even without its context.

In [18]:
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

vectorizer = CountVectorizer(stop_words='english')  # added one option which remove the grammer words from corpus
print (vectorizer.fit_transform(corpus).todense())
print (vectorizer.vocabulary_)

[[1 0 0 1]
 [0 1 1 0]]
{'sandwich': 2, 'eaten': 1, 'ate': 0, 'sandwiches': 3}
