In [None]:
#Building vocabulary (Word Tokenization)

sentence = 'My name is Akshay Santosh Rane.'
sentence.split()

['My', 'name', 'is', 'Akshay', 'Santosh', 'Rane.']

In [None]:
import numpy as np

token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))
','.join(vocab)
num_tokens = len(token_sequence)
vocab_size = len(vocab)
onehot_vectors = np.zeros((num_tokens, vocab_size), int)

for i, word in enumerate(token_sequence):
  onehot_vectors[i, vocab.index(word)] = 1

' '.join(vocab)

onehot_vectors

array([[0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0]])

In [None]:
import pandas as pd

pd.DataFrame(onehot_vectors, columns = vocab)

Unnamed: 0,Akshay,My,Rane.,Santosh,is,name
0,0,1,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,1,0
3,1,0,0,0,0,0
4,0,0,0,1,0,0
5,0,0,1,0,0,0


In [None]:
sentence_bow = {}
for token in sentence.split():
  sentence_bow[token] = 1

sorted(sentence_bow.items())



[('Akshay', 1),
 ('My', 1),
 ('Rane.', 1),
 ('Santosh', 1),
 ('is', 1),
 ('name', 1)]

In [None]:
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in sentence.split()])), columns=['sent']).T

In [None]:
df

Unnamed: 0,My,name,is,Akshay,Santosh,Rane.
sent,1,1,1,1,1,1


In [None]:
sentences = '''Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis auctor elementum leo, eu porta magna dignissim nec.\n'''
sentences += '''Duis elit turpis, hendrerit vel mauris a, eleifend pretium odio.\n'''
sentences += '''Praesent nisi turpis, condimentum sed nunc vel, ornare hendrerit odio. Sed at tellus ac enim blandit blandit.\n'''
sentences += ''' Nullam lobortis eleifend justo, sed consequat nibh dignissim eget. Mauris a justo eget eros sollicitudin semper. Sed ac imperdiet diam.\n'''

corpus = {}

for i, sent in enumerate(sentences.split('\n')):
  corpus['sent{}'.format(i)] = dict((tok, 1) for tok in sent.split())

df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T

In [None]:
df[df.columns[:10]]

Unnamed: 0,Lorem,ipsum,dolor,sit,"amet,",consectetur,adipiscing,elit.,Duis,auctor
sent0,1,1,1,1,1,1,1,1,1,1
sent1,0,0,0,0,0,0,0,0,1,0
sent2,0,0,0,0,0,0,0,0,0,0
sent3,0,0,0,0,0,0,0,0,0,0
sent4,0,0,0,0,0,0,0,0,0,0


In [None]:
#Dot product calculation to check the similarities
import numpy as np

v1 = np.array([1,2,3])
v2 = np.array([3,4,5])

v1.dot(v2)

(v1 * v2).sum()

26

In [None]:
#Measuring the bag_words overlap

df = df.T

df['sent0'].dot(df['sent1'])

1

In [None]:
#Tokenize with re

import re
sentence = """This will reshape the array to have a single dimension, while keeping the total number of elements unchanged."""

tokens = re.split(r'[-\s.,;?]+', sentence)
tokens

['This',
 'will',
 'reshape',
 'the',
 'array',
 'to',
 'have',
 'a',
 'single',
 'dimension',
 'while',
 'keeping',
 'the',
 'total',
 'number',
 'of',
 'elements',
 'unchanged',
 '']

In [None]:
#Compile function

pattern = re.compile(r'[-\s.,;?]+')
tokens = pattern.split(sentence)
tokens[-10:]

['dimension',
 'while',
 'keeping',
 'the',
 'total',
 'number',
 'of',
 'elements',
 'unchanged',
 '']

In [None]:
[x for x in tokens if x and x not in '-\t\n.,;?']

['This',
 'will',
 'reshape',
 'the',
 'array',
 'to',
 'have',
 'a',
 'single',
 'dimension',
 'while',
 'keeping',
 'the',
 'total',
 'number',
 'of',
 'elements',
 'unchanged']

In [None]:
#Tokenization using NLTK

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|[$[0-9.]+|\S+]')
tokenizer.tokenize(sentence)

['This',
 'will',
 'reshape',
 'the',
 'array',
 'to',
 'have',
 'a',
 'single',
 'dimension',
 'while',
 'keeping',
 'the',
 'total',
 'number',
 'of',
 'elements',
 'unchanged',
 '.']

In [None]:
#Best tokenizer is Treebank tokenizer from NLTK package
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['This',
 'will',
 'reshape',
 'the',
 'array',
 'to',
 'have',
 'a',
 'single',
 'dimension',
 ',',
 'while',
 'keeping',
 'the',
 'total',
 'number',
 'of',
 'elements',
 'unchanged',
 '.']

In [None]:
#casual_tokenize is used to deal with shorthand, informal, emoticon-laced text
from nltk.tokenize.casual import casual_tokenize

message = """RT @ akshay rane best day everrrrr at mumbai.
Awesommmmmeeeeeee day eveeerrrr:*"""

casual_tokenize(message)

#We can reduce the len
casual_tokenize(message, reduce_len=True, strip_handles = True)

['RT',
 '@',
 'akshay',
 'rane',
 'best',
 'day',
 'everrr',
 'at',
 'mumbai',
 '.',
 'Awesommmeee',
 'day',
 'eveeerrr',
 ':',
 '*']

In [None]:
#2-grams : pair of word eg. ice cream
#3-grams : triplet of word eg. beyond the pale

from nltk.util import ngrams

list(ngrams(tokens, 2))
list(ngrams(tokens, 3))

[('This', 'will', 'reshape'),
 ('will', 'reshape', 'the'),
 ('reshape', 'the', 'array'),
 ('the', 'array', 'to'),
 ('array', 'to', 'have'),
 ('to', 'have', 'a'),
 ('have', 'a', 'single'),
 ('a', 'single', 'dimension'),
 ('single', 'dimension', 'while'),
 ('dimension', 'while', 'keeping'),
 ('while', 'keeping', 'the'),
 ('keeping', 'the', 'total'),
 ('the', 'total', 'number'),
 ('total', 'number', 'of'),
 ('number', 'of', 'elements'),
 ('of', 'elements', 'unchanged'),
 ('elements', 'unchanged', '')]

In [None]:
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
print(len(stop_words))
stop_words[:7]

179


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [None]:
#Case Folding

normalized_tokens = [x.lower() for x in tokens]
print(normalized_tokens)

['this', 'will', 'reshape', 'the', 'array', 'to', 'have', 'a', 'single', 'dimension', 'while', 'keeping', 'the', 'total', 'number', 'of', 'elements', 'unchanged', '']


In [None]:
#Stemming

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
' '.join([stemmer.stem(w).strip("'") for w in "Dish wahsers's washed dishes".split()])

'dish wahsers wash dish'

In [None]:
#Lemmatization

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("better")

'better'

In [None]:
lemmatizer.lemmatize("better", pos="a")

'good'

In [None]:
lemmatizer.lemmatize("good", pos="a") # a is adjective
lemmatizer.lemmatize("goods", pos="a")
lemmatizer.lemmatize("goods", pos="n")  #n for noun
lemmatizer.lemmatize("goodness", pos="n")
lemmatizer.lemmatize("best", pos="a")

'best'

In [None]:
#Sentiment Analysis

#VADER - A rule based sentiment analyzer
#VADER - Valence Aware Dictionary for sEntiment Reasoning



In [None]:
#Sentiment analysis by Naive analysis

