# Tokenization

Tokenization is the process of breaking down the given text in natural language processing into the smallest unit in a sentence called a token. Punctuation marks, words, and numbers can be considered tokens.

In [6]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [7]:
text.split(' ')

['Hi',
 'Everyone!',
 'This',
 'is',
 'Hackers',
 'Realm.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing.',
 'We',
 'reached',
 '1000000',
 'views.']

In [8]:
from nltk import sent_tokenize, word_tokenize

In [9]:
# split the text into sentences
sent_tokens = sent_tokenize(text)
sent_tokens

['Hi Everyone!',
 'This is Hackers Realm.',
 'We are learning Natural Language Processing.',
 'We reached 1000000 views.']

In [10]:
# split the text into words
word_tokens = word_tokenize(text)
word_tokens

['Hi',
 'Everyone',
 '!',
 'This',
 'is',
 'Hackers',
 'Realm',
 '.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing',
 '.',
 'We',
 'reached',
 '1000000',
 'views',
 '.']

# Stemming

Stemming is the process of finding the root of words. A word stem need not be the same root as a dictionary-based morphological root, it just is an equal to or smaller form of the word.

In [13]:
from nltk.stem import PorterStemmer, SnowballStemmer
ps = PorterStemmer()

In [17]:
word = ('eats')
ps.stem(word)

'eat'

In [16]:
word = ('eating')
ps.stem(word)

'eat'

In [18]:
word = ('eaten')
ps.stem(word)

'eaten'

In [19]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [20]:
word_tokens = word_tokenize(text)

In [21]:
stemmed_sentence = " ".join(ps.stem(word) for word in word_tokens)
stemmed_sentence

'Hi everyon ! thi is hacker realm . We are learn natur languag process . We reach 1000000 view .'

# Lemmatization

Lemmatization is the process of finding the form of the related word in the dictionary. It is different from Stemming. It involves longer processes to calculate than Stemming.

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [30]:
lemmatizer.lemmatize('workers')

'worker'

In [31]:
lemmatizer.lemmatize('words')

'word'

In [37]:
lemmatizer.lemmatize('feet')

'foot'

In [39]:
lemmatizer.lemmatize('stripes', 'v')

'strip'

In [40]:
lemmatizer.lemmatize('stripes', 'n')

'stripe'

In [41]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [42]:
word_tokens = word_tokenize(text)

In [44]:
lemmatized_sentence = " ".join(lemmatizer.lemmatize(word.lower()) for word in word_tokens)
lemmatized_sentence

'hi everyone ! this is hacker realm . we are learning natural language processing . we reached 1000000 view .'

# Part of Speech Tagging (POS)

Part of Speech Tagging is a process of converting a sentence to forms — list of words, list of tuples (where each tuple is having a form (word, tag)). The tag in case of is a part-of-speech tag, and signifies whether the word is a noun, adjective, verb, and so on.

https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [45]:
from nltk import pos_tag

In [51]:
pos_tag(['fighting'])

[('fighting', 'VBG')]

In [46]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [47]:
word_tokens = word_tokenize(text)

In [52]:
pos_tag(word_tokens)

[('Hi', 'NNP'),
 ('Everyone', 'NN'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('Hackers', 'NNP'),
 ('Realm', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('are', 'VBP'),
 ('learning', 'VBG'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('reached', 'VBD'),
 ('1000000', 'CD'),
 ('views', 'NNS'),
 ('.', '.')]

# Text Preprocessing (Clean Data)

In [9]:
import pandas as pd
import string
df = pd.read_csv('data/Twitter Sentiments.csv')
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)
df.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation


## Convert to lowercase

In [10]:
df['clean_text'] = df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,factsguide: society now #motivation,factsguide: society now #motivation


## Removal of Punctuations

In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

In [14]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,factsguide: society now #motivation,factsguide society now motivation


## Removal of Stopwords

In [17]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [18]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [19]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Frequent Words

In [23]:
from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1
        
word_count.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

In [24]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [25]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Rare Words

In [30]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [31]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [32]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Special characters

In [33]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [34]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


## Stemming

In [35]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [36]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...
2,bihday your majesty,bihday majesty,bihday majesti
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv


## Lemmatization & POS Tagging

In [41]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

In [42]:
wordnet.NOUN

'n'

In [43]:
df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run,father dysfunctional selfish drag kid dysfunct...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty,bihday majesti,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv,factsguide society motivation


In [44]:
df.sample(frac=1).head(10)

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
21468,@user for real now: we will be playing @user ...,real playing czech republic china championship...,real play czech republ china championship wugc...,real play czech republic china championship wu...
9568,dear america. please don't let this influence ...,dear america please dont let influence vote tr...,dear america pleas dont let influenc vote trum...,dear america please dont let influence vote tr...
19804,finally... now on to other suppos~ #leagueof...,finally suppos leagueoflegends,final suppo leagueoflegend,finally suppos leagueoflegends
22323,@user @user @user @user feeling #worried.,feeling worried,feel worri,feel worry
20171,i am valued. #i_am #positive #affirmation,valued iam positive affirmation,valu iam posit affirm,value iam positive affirmation
29669,fathers day selfie â¤ï¸ #grandad #selfie #...,fathers selfie grandad selfie fathersday bless...,father selfi grandad selfi fathersday bless su...,father selfie grandad selfie fathersday bless ...
4360,when 8th #graders say they're for high #school,8th graders say theyre high school,8th grader say theyr high school,8th grader say theyre high school
15915,current mood ðð¦ #alone #anxiety #rain ...,current mood alone anxiety rain thistooshallpass,current mood alon anxieti rain thistooshallpass,current mood alone anxiety rain thistooshallpass
92,yes! received my acceptance letter for my mast...,yes received acceptance letter masters back oc...,ye receiv accept letter master back octob good...,yes receive acceptance letter master back octo...
18745,@user @user this so made me smile,made smile,made smile,make smile


## Removal of URLs

In [53]:
text = "https://www.hackersrealm.net is the URL of the channel Hackers Realm"

In [54]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [55]:
remove_url(text)

' is the URL of the channel Hackers Realm'

## Removal of HTML Tags

In [56]:
text = "<html><body> <h1>Hackers Realm</h1> <p>This is NLP text preprocessing tutorial</p> </body></html>"

In [57]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [58]:
remove_html_tags(text)

' Hackers Realm This is NLP text preprocessing tutorial '

## Spelling Correction

In [64]:
!pip install pyspellchecker

In [7]:
text = 'natur is a beuty'

In [8]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_text = spell.unknown(text.split())
    # print(misspelled_text)
    for word in text.split():
        if word in misspelled_text:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
            
    return " ".join(corrected_text)

In [9]:
correct_spellings(text)

'nature is a beauty'

# Feature Extraction from Text Data

## Bag of Words

A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things: A vocabulary of known words. A measure of the presence of known words.

In [5]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(stop_words='english')

In [7]:
# fit the data
bow.fit(text_data)

CountVectorizer(stop_words='english')

In [8]:
# get the vocabulary list
bow.get_feature_names()

['extraction',
 'feature',
 'good',
 'important',
 'interested',
 'nlp',
 'topic',
 'tutorial']

In [9]:
bow_features = bow.transform(text_data)
bow_features

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [10]:
bow_feature_array = bow_features.toarray()
bow_feature_array

array([[0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 2, 0, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [11]:
print(bow.get_feature_names())
for sentence, feature in zip(text_data, bow_feature_array):
    print(sentence)
    print(feature)

['extraction', 'feature', 'good', 'important', 'interested', 'nlp', 'topic', 'tutorial']
I am interested in NLP
[0 0 0 0 1 1 0 0]
This is a good tutorial with good topic
[0 0 2 0 0 0 1 1]
Feature extraction is very important topic
[1 1 0 1 0 0 1 0]


## TF-IDF (Term Frequency/Inverse Document Frequency)

TF-IDF stands for term frequency-inverse document frequency and it is a measure, used in the fields of information retrieval (IR) and machine learning, that can quantify the importance or relevance of string representations (words, phrases, lemmas, etc)  in a document amongst a collection of documents

In [12]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [14]:
# fit the data
tfidf.fit(text_data)

TfidfVectorizer(stop_words='english')

In [15]:
# get the vocabulary list
tfidf.vocabulary_

{'interested': 4,
 'nlp': 5,
 'good': 2,
 'tutorial': 7,
 'topic': 6,
 'feature': 1,
 'extraction': 0,
 'important': 3}

In [16]:
tfidf_features = tfidf.transform(text_data)
tfidf_features

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [17]:
tfidf_feature_array = tfidf_features.toarray()
tfidf_feature_array

array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        , 0.        ],
       [0.        , 0.        , 0.84678897, 0.        , 0.        ,
        0.        , 0.32200242, 0.42339448],
       [0.52863461, 0.52863461, 0.        , 0.52863461, 0.        ,
        0.        , 0.40204024, 0.        ]])

In [19]:
for sentence, feature in zip(text_data, tfidf_features):
    print(sentence)
    print(feature)

I am interested in NLP
  (0, 5)	0.7071067811865476
  (0, 4)	0.7071067811865476
This is a good tutorial with good topic
  (0, 7)	0.42339448341195934
  (0, 6)	0.3220024178194947
  (0, 2)	0.8467889668239187
Feature extraction is very important topic
  (0, 6)	0.4020402441612698
  (0, 3)	0.5286346066596935
  (0, 1)	0.5286346066596935
  (0, 0)	0.5286346066596935


## Word2vec

The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.

In [20]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [21]:
# text data
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [23]:
# initialize and fit the data
model = Word2Vec(common_texts, size=100, min_count=1)

In [25]:
model.wv['graph']

array([-0.00042112,  0.00126945, -0.00348724,  0.00373327,  0.00387501,
       -0.00306736, -0.00138952, -0.00139083,  0.00334137,  0.00413064,
        0.00045129, -0.00390373, -0.00159695, -0.00369461, -0.00036086,
        0.00444261, -0.00391653,  0.00447466, -0.00032617,  0.00056412,
       -0.00017338, -0.00464378,  0.00039338, -0.00353649,  0.0040346 ,
        0.00179682, -0.00186994, -0.00121431, -0.00370716,  0.00039535,
       -0.00117291,  0.00498948, -0.00243317,  0.00480749, -0.00128626,
       -0.0018426 , -0.00086148, -0.00347201, -0.0025697 , -0.00409948,
        0.00433477, -0.00424404,  0.00389087,  0.0024296 ,  0.0009781 ,
       -0.00267652, -0.00039598,  0.00188174, -0.00141169,  0.00143257,
        0.00363962, -0.00445332,  0.00499313, -0.00013036,  0.00411159,
        0.00307077, -0.00048517,  0.00491026, -0.00315512, -0.00091287,
        0.00465486,  0.00034458,  0.00097905,  0.00187424, -0.00452135,
       -0.00365111,  0.00260027,  0.00464861, -0.00243504, -0.00

In [26]:
model.wv.most_similar('graph')

[('interface', 0.1710839718580246),
 ('user', 0.08987751603126526),
 ('trees', 0.07364125549793243),
 ('minors', 0.045832667499780655),
 ('computer', 0.025292515754699707),
 ('system', 0.012846874073147774),
 ('human', -0.03873271495103836),
 ('survey', -0.06853737682104111),
 ('time', -0.07515352964401245),
 ('eps', -0.07798048853874207)]

## Word Embedding using Glove

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space

Download link: https://www.kaggle.com/datasets/danielwillgeorge/glove6b100dtxt

In [28]:
import pandas as pd
import string
from nltk.corpus import stopwords
df = pd.read_csv('data/Twitter Sentiments.csv')
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)

df['clean_text'] = df['tweet'].str.lower()

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))

import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))

df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids ...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit can t use cause ...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


In [34]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [30]:
# tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])

word_index = tokenizer.word_index
vocab_size = len(word_index)
vocab_size

39085

In [40]:
# word_index

In [31]:
max(len(data) for data in df['clean_text'])

131

In [32]:
# padding text data
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded_seq = pad_sequences(sequences, maxlen=131, padding='post', truncating='post')

In [33]:
padded_seq[0]

array([    1,    28, 15330,  2630,  6365,   184,  7786,   385,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [35]:
# create embedding index
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [36]:
embedding_index['good']

array([-0.030769 ,  0.11993  ,  0.53909  , -0.43696  , -0.73937  ,
       -0.15345  ,  0.081126 , -0.38559  , -0.68797  , -0.41632  ,
       -0.13183  , -0.24922  ,  0.441    ,  0.085919 ,  0.20871  ,
       -0.063582 ,  0.062228 , -0.051234 , -0.13398  ,  1.1418   ,
        0.036526 ,  0.49029  , -0.24567  , -0.412    ,  0.12349  ,
        0.41336  , -0.48397  , -0.54243  , -0.27787  , -0.26015  ,
       -0.38485  ,  0.78656  ,  0.1023   , -0.20712  ,  0.40751  ,
        0.32026  , -0.51052  ,  0.48362  , -0.0099498, -0.38685  ,
        0.034975 , -0.167    ,  0.4237   , -0.54164  , -0.30323  ,
       -0.36983  ,  0.082836 , -0.52538  , -0.064531 , -1.398    ,
       -0.14873  , -0.35327  , -0.1118   ,  1.0912   ,  0.095864 ,
       -2.8129   ,  0.45238  ,  0.46213  ,  1.6012   , -0.20837  ,
       -0.27377  ,  0.71197  , -1.0754   , -0.046974 ,  0.67479  ,
       -0.065839 ,  0.75824  ,  0.39405  ,  0.15507  , -0.64719  ,
        0.32796  , -0.031748 ,  0.52899  , -0.43886  ,  0.6740

In [41]:
# create embedding matrix
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [42]:
embedding_matrix.shape

(39086, 100)

# Named Entity Recognition

In [None]:
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [None]:
import spacy
from spacy import displacy

In [None]:
NER = spacy.load('en_core_web_sm')

In [None]:
text = 'Mark Zuckerberg is one of the founders of Facebook, a company from the United States'

In [None]:
ner_text = NER(text)

In [None]:
for word in ner_text.ents:
    print(word.text, word.label_)

Mark Zuckerberg PERSON
one CARDINAL
Facebook ORG
the United States GPE


In [None]:
spacy.explain('GPE')

'Countries, cities, states'

In [None]:
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [None]:
displacy.render(ner_text, style='ent', jupyter=True)

# Data Augmentation for Text

In [None]:
# uses
# 1. increase the dataset size by creating more samples
# 2. reduce overfitting
# 3. improve model generalization
# 4. handling imbalance dataset

In [None]:
!pip install nlpaug
!pip install sacremoses

In [2]:
import nlpaug.augmenter.word as naw

In [3]:
text = 'The quick brown fox jumps over a lazy dog'

### Synonym Replacement

In [10]:
syn_aug = naw.synonym.SynonymAug(aug_src='wordnet')
synonym_text = syn_aug.augment(text)
print('Synonym Text:', synonym_text)

Synonym Text: ['The flying brownness fox jumps over a lazy andiron']


### Random Substitution

In [11]:
sub_aug = naw.random.RandomWordAug(action='substitute')
substituted_text = sub_aug.augment(text)
print('Substituted Text:', substituted_text)

Substituted Text: ['_ _ brown fox jumps _ a lazy dog']


### Random Deletion

In [12]:
del_aug = naw.random.RandomWordAug(action='delete')
deletion_text = del_aug.augment(text)
print('Deletion Text:', deletion_text)

Deletion Text: ['Quick brown jumps over a lazy dog']


### Random Swap

In [13]:
swap_aug = naw.random.RandomWordAug(action='swap')
swap_text = swap_aug.augment(text)
print('Swap Text:', swap_text)

Swap Text: ['The quick brown jumps fox a lazy over dog']


### Back Translation

In [15]:
# translate original text to other language (german) and convert back to english language
back_trans_aug = naw.back_translation.BackTranslationAug()
back_trans_text = back_trans_aug.augment(text)
print('Back Translated Text:', back_trans_text)

Back Translated Text: ['The speedy brown fox jumps over a lazy dog']
