In [1]:
import pandas as pd
import numpy as np

# Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /Drive (only works inside Google Colab).
# NOTE(review): no later cell in the visible notebook reads from /Drive — confirm this cell is still needed.
drive.mount('/Drive')

Mounted at /Drive


# Vectorization

In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [3]:
# Toy corpora for the vectorization examples:
#   txt1 - one document, used only with transform()
#   txt2 - two documents, used to fit the vectorizers
txt1 = [
    "data science is a good subject but data analysis is not easy",
]
txt2 = [
    "Gopi is a good scientist",
    "Artificial Intelligence is a combination of art and science and it has a good future",
]


> ### Count Vectorizer

___

In [4]:
# Learn the vocabulary from the two documents in txt2. fit() only builds the
# token -> column mapping; the actual counting happens later in transform().
# With the default settings tokens are lower-cased and one-letter words such
# as "a" are dropped (see the vocabulary printed in the next cell).
vectorizer = CountVectorizer()
vectorizer.fit(txt2)

In [5]:
# Learned vocabulary: a dict mapping each token to its column index in the count matrix

vectorizer.vocabulary_

{'gopi': 6,
 'is': 9,
 'good': 5,
 'scientist': 13,
 'artificial': 2,
 'intelligence': 8,
 'combination': 3,
 'of': 11,
 'art': 1,
 'and': 0,
 'science': 12,
 'it': 10,
 'has': 7,
 'future': 4}

In [6]:
# Vocabulary as (token, column index) pairs in alphabetical order.
# Tokens are unique dict keys, so the default tuple ordering sorts by token.
sorted(vectorizer.vocabulary_.items())

[('and', 0),
 ('art', 1),
 ('artificial', 2),
 ('combination', 3),
 ('future', 4),
 ('good', 5),
 ('gopi', 6),
 ('has', 7),
 ('intelligence', 8),
 ('is', 9),
 ('it', 10),
 ('of', 11),
 ('science', 12),
 ('scientist', 13)]

In [7]:
# Show the corpus again so it can be compared with the count matrix in the next cell.
print(txt2)

['Gopi is a good scientist', 'Artificial Intelligence is a combination of art and science and it has a good future']


In [8]:
a = vectorizer.transform(txt2) # one row per document, one column per vocabulary term; try txt1 here to see words outside the fitted vocabulary being ignored
a.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1],
       [2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0]])

***

> ### TF-IDF
- TF (Term Frequency) counts how often a word occurs in a document
- IDF (Inverse Document Frequency) suppresses the effect of words that occur in many documents. With scikit-learn's default smoothing, a word that occurs in every document gets the minimum IDF value of 1

---

In [None]:
# Explanation

In [None]:
# Re-bind `vectorizer` to a TF-IDF vectorizer fitted on the same txt2 corpus.
# From here on the name no longer refers to the CountVectorizer used earlier.
vectorizer = TfidfVectorizer()
vectorizer.fit(txt2)

In [None]:
# Vocabulary as (token, column index) pairs in alphabetical order.
# Tokens are unique dict keys, so the default tuple ordering sorts by token.
sorted(vectorizer.vocabulary_.items())

[('and', 0),
 ('art', 1),
 ('artificial', 2),
 ('combination', 3),
 ('future', 4),
 ('good', 5),
 ('gopi', 6),
 ('has', 7),
 ('intelligence', 8),
 ('is', 9),
 ('it', 10),
 ('of', 11),
 ('science', 12),
 ('scientist', 13)]

In [None]:

# Per-term IDF weights, in vocabulary column order.
# "good" (column 5) and "is" (column 9) occur in both txt2 documents and
# therefore get the minimum value 1.0; every other term occurs in only one.
vectorizer.idf_

array([1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
       1.        , 1.40546511, 1.40546511, 1.40546511, 1.        ,
       1.40546511, 1.40546511, 1.40546511, 1.40546511])

In [None]:
print(txt1)
print(vectorizer.transform(txt1).toarray())


# Only "good" (column 5), "is" (column 9) and "science" (column 12) are non-zero:
# the other words of txt1 ("data", "analysis", ...) are not in the vocabulary
# learned from txt2 and are ignored. "good" and "is" both have IDF 1, but "is"
# occurs twice in txt1, so its weight is twice that of "good".

['data science is a good subject but data analysis is not easy']
[[0.         0.         0.         0.         0.         0.37863221
  0.         0.         0.         0.75726441 0.         0.
  0.53215436 0.        ]]


In [None]:
print(txt2)
print(vectorizer.transform(txt2).toarray())

# Each row is one document of txt2. A 0 means that vocabulary term does not occur
# in that particular document (every term occurs somewhere in txt2, because the
# vocabulary was learned from it).


['Gopi is a good scientist', 'Artificial Intelligence is a combination of art and science and it has a good future']
[[0.         0.         0.         0.         0.         0.40993715
  0.57615236 0.         0.         0.40993715 0.         0.
  0.         0.57615236]
 [0.53428425 0.26714212 0.26714212 0.26714212 0.26714212 0.19007382
  0.         0.26714212 0.26714212 0.19007382 0.26714212 0.26714212
  0.26714212 0.        ]]


___

---

### NLTK: It is a leading platform for building Python programs to work with human language data. It is a suite of libraries and programs for symbolic and statistical natural language processing for English written in the Python programming language

# Tokenizer using NLTK

In [9]:
import nltk
# Download the Punkt tokenizer data into the local nltk_data directory.
# NOTE(review): newer NLTK releases appear to look for the 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
# Two-sentence sample; the abbreviation "U.K." contains periods that must not
# be mistaken for sentence boundaries.
txt = "Apple is looking at buying U.K. startup for $1 billion. India is a great country"
sent_tokenize(txt)

['Apple is looking at buying U.K. startup for $1 billion.',
 'India is a great country']

In [None]:
# Split the same text into word-level tokens; "$" and the sentence-final "."
# come out as separate tokens while "U.K." stays in one piece.
word_tokenize(txt)

['Apple',
 'is',
 'looking',
 'at',
 'buying',
 'U.K.',
 'startup',
 'for',
 '$',
 '1',
 'billion',
 '.',
 'India',
 'is',
 'a',
 'great',
 'country']

# NER (Named Entity Recognition)

In [12]:
# pip install -U spacy
# python -m spacy download en_core_web_sm

In [13]:
import spacy

# spaCy provides a variety of linguistic annotations to give you insights into a text’s grammatical structure.
# This includes the word types, like the parts of speech, and how the words are related to each other.

In [14]:
# Load the small English pipeline (tokenizer, tagger, parser and NER).
# The en_core_web_sm model must already be installed — see the commented-out
# install commands at the start of this section.

nlp = spacy.load('en_core_web_sm')

In [15]:
# Sentence used for the NER and POS examples.
txt = "Apple is looking at buying U.K. startup for $1 billion"

In [16]:
# Run the spaCy pipeline on the sentence; the entity and token loops in the
# following cells read their annotations from this `doc` object.
doc = nlp(txt)
doc

Apple is looking at buying U.K. startup for $1 billion

In [17]:
# One line per named entity: text | label | human-readable label description.
for ent in doc.ents:
    print(f"{ent.text} | {ent.label_} | {spacy.explain(ent.label_)}")

Apple | ORG | Companies, agencies, institutions, etc.
U.K. | GPE | Countries, cities, states
$1 billion | MONEY | Monetary values, including unit


# POS
### Part-of-speech (POS) tagging is a popular Natural Language Processing process which refers to categorizing words in a text (corpus) in correspondence with a particular part of speech, depending on the definition of the word and its context. Part of Speech (hereafter referred to as POS) tags are useful for building parse trees, which are used in building NERs (most named entities are nouns) and extracting relations between words. POS tagging is also essential for building lemmatizers, which are used to reduce a word to its root form.

In [18]:
# One line per token: text | coarse part-of-speech tag | dependency relation.
for token in doc:
    print(f"Token: {token.text} | {token.pos_} | {token.dep_}")

Token: Apple | PROPN | nsubj
Token: is | AUX | aux
Token: looking | VERB | ROOT
Token: at | ADP | prep
Token: buying | VERB | pcomp
Token: U.K. | PROPN | dobj
Token: startup | NOUN | dep
Token: for | ADP | prep
Token: $ | SYM | quantmod
Token: 1 | NUM | compound
Token: billion | NUM | pobj


# Visualising Spacy

In [19]:
from spacy import displacy

In [20]:
# Same sentence as in the NER section, parsed again before visualising it.
txt = ("Apple is looking at buying U.K. startup for $1 billion")
doc = nlp(txt)

In [21]:
# Render the dependency parse of `doc` inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

In [22]:
# Render the named entities of `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

# Word2Vec (CBOW / SkipGram)
## Represents words in the form of vectors

## Using GENSIM

In [23]:
import gensim

In [24]:
# Pre-process one message with gensim's simple_preprocess:
# it lower-cases the text, strips punctuation and splits it into a list of tokens.
# Very short tokens are dropped as well (note that "n" and "e" are missing from the result).

msg_txt = gensim.utils.simple_preprocess("Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...")

In [25]:
# Inspect the token list produced by simple_preprocess.
msg_txt

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [26]:
model = gensim.models.Word2Vec(
    window=10, # 10 words to slide
    min_count=2, # minimum words for a sentence to be considered
)

In [27]:
msg_txt = ["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",
    "Ok I am going "]

In [28]:
model.build_vocab(msg_txt)



In [29]:
# Number of sentences seen by build_vocab (msg_txt has two entries).
model.corpus_count

2

In [30]:
# Train for five passes over the corpus; total_examples tells gensim how many
# sentences a single pass contains.
model.train(
    msg_txt,
    total_examples=model.corpus_count,
    epochs=5,
)

(76, 625)