## Import Libraries

In [12]:
from nltk.corpus import names
import random
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## 1) Simple Text Classification

In [13]:
def gender_features(word):
    """Build the feature dict used by the name-gender classifier.

    Args:
        word: The name to extract features from.

    Returns:
        A dict with a single key, ``'last_letter'``, holding the final
        character of ``word``. For an empty string the value is ``''``.
    """
    # Slice instead of index so an empty name yields '' rather than IndexError.
    return {'last_letter': word[-1:]}

In [14]:
gender_features("Obama")

{'last_letter': 'a'}

In [15]:
# Total number of names in the corpus.
print(len(names.words()))

# Tag every name with the label of the corpus file it was read from;
# all male entries come first, followed by all female entries.
labelled_names = [
    (name, label)
    for label, fileid in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(fileid)
]

7944


In [16]:
# Shuffle reproducibly. Sorting first puts the list in a canonical order, so
# re-running this cell on an already-shuffled list still produces the same
# permutation; the locally seeded generator keeps the result identical across
# kernel restarts without touching the global `random` state.
labelled_names.sort()
random.Random(42).shuffle(labelled_names)

# Convert each (name, gender) pair into a (feature dict, gender) pair.
featuresets = [(gender_features(n), gender) for (n, gender) in labelled_names]

In [17]:
# The first 5000 shuffled examples train the model; the rest are held out.
train_set = featuresets[:5000]
test_set = featuresets[5000:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

### Now we can try to classify names

In [18]:
# Classify my own name first, then a friend's name.
for name in ("Aryan", "Shourya"):
    predicted = classifier.classify(gender_features(name))
    print("Name: {} -> Predicted Gender: {}".format(name, predicted))

Name: Aryan -> Predicted Gender: male
Name: Shourya -> Predicted Gender: female


In [19]:
# Score the trained classifier on the examples that were not used for training.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

0.7683423913043478


## 2) Count Vectorizer

In [20]:
# A single-document corpus; the literal is split across lines for readability
# only, the resulting string is unchanged.
corpus = [
    "Lorem Ipsum is simply dummy text of the printing and typesetting industry. "
    "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, "
    "when an unknown printer took a galley of type and scrambled it to make a type "
    "specimen book."
]

# binary=True records presence/absence of a token instead of its count.
vect = CountVectorizer(binary=True)
vect.fit(corpus)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [22]:
# Mapping from each token learned during fit to its column index.
vocab = vect.vocabulary_
vocab_size = len(vocab)
print(vocab_size)

31


In [23]:
# List every token with its column index, in alphabetical token order.
for token, column in sorted(vocab.items()):
    print("{}:{}".format(token, column))

1500s:0
an:1
and:2
been:3
book:4
dummy:5
ever:6
galley:7
has:8
industry:9
ipsum:10
is:11
it:12
lorem:13
make:14
of:15
printer:16
printing:17
scrambled:18
simply:19
since:20
specimen:21
standard:22
text:23
the:24
to:25
took:26
type:27
typesetting:28
unknown:29
when:30


In [24]:
# Encode a new sentence against the fitted vocabulary; words the vectorizer
# never saw during fit do not appear in the resulting row.
encoded = vect.transform(["This is a good optical illusion"])
print(encoded.toarray())

[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


## 3) Finding Similarity Between Documents

In [25]:
# NOTE(review): `vect` was fitted on the Lorem Ipsum corpus, so only tokens
# that occur in that vocabulary contribute to the vectors compared here.
first_doc = vect.transform(["Google Cloud Vision is a character recognition engine"]).toarray()
second_doc = vect.transform(["OCR is an optical character recognition engine"]).toarray()

similarity = cosine_similarity(first_doc, second_doc)
print(similarity)

[[0.70710678]]
