In [None]:
# `display` and `HTML` live in the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container to 80% of the browser window.
display(HTML("<style>.container {width:80% !important;}</style>"))

In [None]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. 
NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

### Tokenization

In [None]:
import nltk
nltk.download('punkt')
from nltk import tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
tokenize.sent_tokenize(raw_txt)

['Welcome to the world of Deep Learning for NLP!',
 "We're in this together, and we'll learn together.",
 'NLP is amazing, and Deep Learning makes it even more fun.',
 "Let's learn!"]

In [None]:
txt_sents = tokenize.sent_tokenize(raw_txt)

In [None]:
type(txt_sents), len(txt_sents)

(list, 4)

In [None]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]
type(txt_words), type(txt_words[0])

(list, list)

In [None]:
print(txt_words[:2])

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP', '!'], ['We', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


### Normalizing case

In [None]:
# Optional cell: lowercases the whole raw string in one go. The cells below
# lowercase and re-tokenize `txt_sents` instead, so this one can be skipped.
raw_txt = raw_txt.lower()

In [None]:
txt_sents = [sent.lower() for sent in txt_sents]
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [None]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

In [None]:
print(txt_words[:2])

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp', '!'], ['we', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


### Removing punctuation

In [None]:
from string import punctuation

In [None]:
list_punct = list(punctuation)
print(list_punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [None]:
def drop_punct(input_tokens, punct=None):
    """Return the tokens of ``input_tokens`` that are not punctuation marks.

    Parameters
    ----------
    input_tokens : iterable of str
        Tokens of a single sentence.
    punct : iterable of str, optional
        Tokens to treat as punctuation. When omitted, the notebook-level
        ``list_punct`` is used.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. Only tokens that are
        exactly equal to a punctuation entry are removed, so contractions
        such as ``"'re"`` are kept.
    """
    # A set gives constant-time membership tests; `list_punct` is only looked
    # up when the caller did not supply a collection of their own.
    punct_set = set(list_punct if punct is None else punct)
    return [token for token in input_tokens if token not in punct_set]

In [None]:
drop_punct(["let",".","us",".","go","!"])

['let', 'us', 'go']

In [None]:
# Strip punctuation tokens from every tokenized sentence.
txt_words_nopunct = list(map(drop_punct, txt_words))
print(txt_words_nopunct)

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp'], ['we', "'re", 'in', 'this', 'together', 'and', 'we', "'ll", 'learn', 'together'], ['nlp', 'is', 'amazing', 'and', 'deep', 'learning', 'makes', 'it', 'even', 'more', 'fun'], ['let', "'s", 'learn']]


### Removing stop words

In [None]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
list_stop = stopwords.words("english")
len(list_stop)

179

In [None]:
print(list_stop[:50])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be']


In [None]:
def drop_stop(input_tokens, stop_words=None):
    """Return the tokens of ``input_tokens`` that are not stop words.

    Parameters
    ----------
    input_tokens : iterable of str
        Tokens of a single sentence.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``list_stop`` is
        used.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. Matching is exact, so
        tokens are compared as-is (no case folding is applied here).
    """
    # A set gives constant-time membership tests; `list_stop` is only looked
    # up when the caller did not supply a collection of their own.
    stop_set = set(list_stop if stop_words is None else stop_words)
    return [token for token in input_tokens if token not in stop_set]

In [None]:
# Remove stop words from every punctuation-free sentence.
txt_words_nostop = list(map(drop_stop, txt_words_nopunct))

In [None]:
print(txt_words_nostop[0])

['welcome', 'world', 'deep', 'learning', 'nlp']


### Stemming

In [None]:
from nltk.stem import PorterStemmer

In [None]:
stemmer_p = PorterStemmer()

In [None]:
print(stemmer_p.stem("driving"))

drive


In [None]:
txt = "I mustered all my drive, drove to the driving school!"

In [None]:
tokens = tokenize.word_tokenize(txt)
print([stemmer_p.stem(word) for word in tokens])

['I', 'muster', 'all', 'my', 'drive', ',', 'drove', 'to', 'the', 'drive', 'school', '!']


### Lemmatization

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [None]:
lemmatizer.lemmatize("ponies")

'pony'

### Stemming Our Data

In [None]:
from nltk.stem import PorterStemmer

In [None]:
stemmer_p = PorterStemmer()

In [None]:
print([stemmer_p.stem(token) for token in txt_words_nostop[0]])

['welcom', 'world', 'deep', 'learn', 'nlp']


Applying stemmer to all the sentences

In [None]:
# Stem each token of each stop-word-free sentence with the Porter stemmer.
txt_words_stem = [list(map(stemmer_p.stem, sent)) for sent in txt_words_nostop]

In [None]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

## Representation

### Creating One-Hot Encoding for Our Data

In [None]:
print(txt_words_nostop)

[['welcome', 'world', 'deep', 'learning', 'nlp'], ["'re", 'together', "'ll", 'learn', 'together'], ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'], ['let', "'s", 'learn']]


In [None]:
target_terms = ["nlp","deep","learn"]

In [None]:
def get_onehot(sent, terms=None):
    """Encode one sentence as a presence/absence vector over a list of terms.

    Parameters
    ----------
    sent : list of str
        Tokens of a single sentence. Membership is tested with ``in``, so a
        term only matches a token that is exactly equal to it.
    terms : sequence of str, optional
        Terms that define the vector positions. When omitted, the
        notebook-level ``target_terms`` is used.

    Returns
    -------
    list of int
        One entry per term, in term order: 1 if the term occurs in ``sent``,
        otherwise 0.
    """
    # `target_terms` is only looked up when no explicit term list is given.
    vocab = target_terms if terms is None else terms
    return [1 if term in sent else 0 for term in vocab]

In [None]:
# One presence/absence row per sentence.
one_hot_mat = list(map(get_onehot, txt_words_nostop))

In [None]:
import numpy as np

In [None]:
np.array(one_hot_mat)

array([[1, 1, 0],
       [0, 0, 1],
       [1, 1, 0],
       [0, 0, 1]])

### Term Frequencies

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(max_features = 5)

In [None]:
vectorizer.fit(txt_sents)

CountVectorizer(max_features=5)

In [None]:
vectorizer.vocabulary_

{'and': 0, 'deep': 1, 'learn': 2, 'together': 3, 'we': 4}

In [None]:
# The vectorizer was already fitted on txt_sents above, so only transform here
# instead of refitting it with fit_transform.
txt_dtm = vectorizer.transform(txt_sents)

In [None]:
txt_dtm.toarray()

array([[0, 1, 0, 0, 0],
       [1, 0, 1, 2, 2],
       [1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0]])

In [None]:
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [None]:
def do_nothing(doc):
    """Identity function: hand ``doc`` back exactly as it was received."""
    unchanged = doc
    return unchanged

In [None]:
# The documents passed to this vectorizer are already lists of tokens, so both
# the preprocessing and the tokenizing steps are identity functions.
# token_pattern=None states explicitly that the default regex is not used when
# a custom tokenizer is supplied, which avoids scikit-learn's
# "token_pattern will not be used" warning.
vectorizer = CountVectorizer(max_features=5,
                             preprocessor=do_nothing,
                             tokenizer=do_nothing,
                             token_pattern=None)

In [None]:
txt_dtm = vectorizer.fit_transform(txt_words_stem)

In [None]:
txt_dtm.toarray()

array([[0, 1, 1, 1, 0],
       [1, 0, 1, 0, 2],
       [0, 1, 1, 1, 0],
       [0, 0, 1, 0, 0]])

In [None]:
vectorizer.vocabulary_

{"'ll": 0, 'deep': 1, 'learn': 2, 'nlp': 3, 'togeth': 4}

In [None]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

### Document Term Matrix with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer_tfidf = TfidfVectorizer(max_features=5)

In [None]:
vectorizer_tfidf.fit(txt_sents)

TfidfVectorizer(max_features=5)

In [None]:
vectorizer_tfidf.vocabulary_

{'and': 0, 'deep': 1, 'learn': 2, 'together': 3, 'we': 4}

In [None]:
txt_tfidf = vectorizer_tfidf.transform(txt_sents)

In [None]:
txt_tfidf.toarray()

array([[0.        , 1.        , 0.        , 0.        , 0.        ],
       [0.25932364, 0.        , 0.25932364, 0.65783832, 0.65783832],
       [0.70710678, 0.70710678, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ]])

In [None]:
vectorizer_tfidf.idf_

array([1.51082562, 1.51082562, 1.51082562, 1.91629073, 1.91629073])

## Training Our Own Embeddings

In [None]:
import gensim.downloader as api
from gensim.models import word2vec

In [None]:
#dataset = word2vec.Text8Corpus("text8")
#Another way of loading the data. if this doesn't work, you could use the text8 corpus local file
dataset = api.load("text8")



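Set the random seed to 1 so that the results are reproducible. Note that gensim's `Word2Vec` uses its own `seed` parameter (default 1) rather than NumPy's global seed, and training is fully deterministic only with a single worker thread (`workers=1`).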

In [None]:
np.random.seed(1)

In [None]:
# Word2Vec draws its randomness from its own `seed` argument, not from NumPy's
# global RNG. A deterministic run also needs a single worker thread, because
# multi-threaded training introduces ordering jitter (slower, but repeatable).
# Across interpreter launches, PYTHONHASHSEED must be fixed as well.
model = word2vec.Word2Vec(dataset, seed=1, workers=1)

In [None]:
print(model.wv["animal"])

[ 1.1026119  -1.3046589   0.70341945 -0.86909544 -0.751878   -0.26030096
 -1.1999269   1.2992241  -1.5108328  -0.19480833  1.0370378  -0.17457986
  2.0280316   0.49902165 -0.03292564 -1.8114314  -0.21682471 -0.05561074
 -0.8004332   1.5435028  -0.7652134   1.3158232   1.338854   -1.5350536
  0.6931977  -0.72015655  0.0849009   1.8945218  -1.7990437   2.1330843
 -2.690658   -1.3606167  -1.167938    0.08357526 -1.2912241   0.16515994
  0.8229116   1.8972722   1.9830761  -1.0277427   0.46957147  3.7972617
  3.2152545  -1.1564319  -1.0044686   1.1819247  -0.64589125 -0.03240976
  0.11076579  0.62604517  2.3929238   1.3312353   1.769057    1.6800108
 -0.708076   -0.0801698   0.93380237 -0.01668704  0.74874663 -1.3614036
  1.1550819   0.5030792  -1.4462464   1.6633688  -0.02611849  0.8749994
  0.00849956 -1.0964893  -0.5647828  -1.6154531  -0.35844564  1.5210787
 -1.0849271   1.3545982  -0.2932061  -0.7673198  -1.0533078  -1.0078735
  1.0722871  -0.08011077 -0.8076661  -0.4829715   0.3295717

In [None]:
len(model.wv["animal"])

100

In [None]:
model.wv.most_similar("animal")

[('insect', 0.7279007434844971),
 ('animals', 0.7273350954055786),
 ('aquatic', 0.6705514192581177),
 ('feces', 0.6685516834259033),
 ('humans', 0.6573395729064941),
 ('sentient', 0.6477888822555542),
 ('human', 0.6449308395385742),
 ('eating', 0.6439002752304077),
 ('insects', 0.6391392946243286),
 ('herd', 0.6378552913665771)]

In [None]:
model.wv.most_similar("happiness")

[('humanity', 0.7750765681266785),
 ('goodness', 0.7538900971412659),
 ('perfection', 0.7263875007629395),
 ('pleasure', 0.7254328727722168),
 ('righteousness', 0.7230576872825623),
 ('salvation', 0.7225526571273804),
 ('compassion', 0.719607949256897),
 ('desires', 0.7146188020706177),
 ('dignity', 0.7069608569145203),
 ('conscious', 0.7045603394508362)]

### Semantic Regularities in Word Embeddings

In [None]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.663352370262146),
 ('elizabeth', 0.6081100702285767),
 ('throne', 0.6074048280715942),
 ('prince', 0.5996178388595581),
 ('empress', 0.5897589921951294)]

In [None]:
model.wv.most_similar(positive=['uncle', 'woman'], negative=['man'], topn=5)

[('grandmother', 0.8136803507804871),
 ('wife', 0.8000144362449646),
 ('aunt', 0.7967590689659119),
 ('niece', 0.7898036241531372),
 ('widow', 0.7703402042388916)]

### Vectors for Phrases

In [None]:
v1 = model.wv['get']
v2 = model.wv['happy']
res1 = (v1+v2)/2

In [None]:
v1 = model.wv['make']
v2 = model.wv['merry']
res2 = (v1+v2)/2

In [None]:
model.wv.cosine_similarities(res1, [res2])

array([0.5595141], dtype=float32)

### Effect of Parameters - 'size' of the Vector

In [None]:
# Retrain with 30-dimensional vectors (the earlier model produced 100-dimensional
# ones). This rebinds `model`, replacing the previously trained model.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` — adjust this
# keyword if the notebook is run on a newer gensim.
model = word2vec.Word2Vec(dataset, size=30)

In [None]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('emperor', 0.8343724012374878),
 ('prince', 0.8293878436088562),
 ('throne', 0.792656421661377),
 ('empress', 0.7858843803405762),
 ('consul', 0.7856160998344421)]

### Effect of parameters - skipgram vs. CBOW

#### Rare terms - oeuvre

In [None]:
model = word2vec.Word2Vec(dataset)

In [None]:
model.wv.most_similar("oeuvre", topn=5)

[('ballets', 0.7220040559768677),
 ('fidei', 0.7210460305213928),
 ('phrasing', 0.7189481258392334),
 ('baglione', 0.716771125793457),
 ('elegiac', 0.7140164375305176)]

In [None]:
model_sg = word2vec.Word2Vec(dataset, sg=1)

In [None]:
model_sg.wv.most_similar("oeuvre", topn=5)

[('masterful', 0.831885039806366),
 ('librettos', 0.8253860473632812),
 ('inklings', 0.8242233395576477),
 ('enthralled', 0.8157076835632324),
 ('auteurs', 0.8127604722976685)]

### Training Word Vectors on Different Datasets

In [None]:
nltk.download('brown')
nltk.download('movie_reviews')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
from nltk.corpus import brown, movie_reviews

In [None]:
model_brown = word2vec.Word2Vec(brown.sents(), sg=1)
model_movie = word2vec.Word2Vec(movie_reviews.sents(), sg=1)

In [None]:
model_brown.wv.most_similar('money', topn=5)

[('care', 0.8494411706924438),
 ('job', 0.8448654413223267),
 ('friendship', 0.839860200881958),
 ('permission', 0.8297990560531616),
 ('chances', 0.8259839415550232)]

In [None]:
model_movie.wv.most_similar('money', topn=5)

[('cash', 0.746373176574707),
 ('risk', 0.7214367389678955),
 ('ransom', 0.7115482091903687),
 ('pay', 0.7035477161407471),
 ('bucks', 0.6899775862693787)]

### Using Pre-Trained Word Vectors

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
# Go to: https://nlp.stanford.edu/data/glove.6B.zip to download the pre-trained files

# Input: the 100-dimensional GloVe text file; output: the converted copy that is
# loaded with KeyedVectors.load_word2vec_format in the next cell.
glove_input_file = '/content/glove.6B.100d.txt'
word2vec_output_file = '/content/glove.6B.100d.w2vformat.txt'
# Writes word2vec_output_file; the displayed result is the (400000, 100) tuple below.
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [None]:
from gensim.models.keyedvectors import KeyedVectors
# Load the converted vectors from the path written by glove2word2vec, reusing
# the variable so the two cells cannot drift apart.
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [None]:
glove_model.most_similar("money", topn=5)

[('funds', 0.8508071303367615),
 ('cash', 0.848483681678772),
 ('fund', 0.7594833374023438),
 ('paying', 0.7415367364883423),
 ('pay', 0.7407673001289368)]

In [None]:
glove_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.7698541283607483),
 ('monarch', 0.6843380928039551),
 ('throne', 0.6755735874176025),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520534753799438)]

### Bias in Embeddings – A Word of Caution

In [None]:
model.wv.most_similar(positive=['woman', 'doctor'], negative=['man'], topn=5)

[('emmy', 0.7654274702072144),
 ('teacher', 0.7627977728843689),
 ('prizes', 0.7060180306434631),
 ('banquet', 0.7034530639648438),
 ('ig', 0.6995185613632202)]

In [None]:
model.wv.most_similar(positive=['woman', 'smart'], negative=['man'], topn=5)

[('stickers', 0.7511711120605469),
 ('combo', 0.7458405494689941),
 ('turntables', 0.7283222675323486),
 ('fancy', 0.7235152721405029),
 ('bowlers', 0.7217631340026855)]