In [22]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# BAG OF WORDS

In [41]:
text = [
    'Call be Selbie, the ruthless ruler of the dangerous city, Bermingham.',
    'I like playing call of duty a lot.',
    'I was singing twinkle twinkle little star'
]
print(text)

['Call be Selbie, the ruthless ruler of the dangerous city, Bermingham.', 'I like playing call of duty a lot.', 'I was singing twinkle twinkle little star']


In [42]:
def preprocess_text(list_of_texts):
    
    #lowercase the text
    text = list_of_texts.lower()
    text = re.sub(r'[^a-zA-Z0-9]',' ', text)
    
    tokens = word_tokenize(text)
    tokens = [i_token for i_token in tokens if i_token not in stopwords.words("english")]
    #stemmer
    stemmed = [PorterStemmer().stem(i_token) for i_token in tokens]
    
    #appy lemmatizer
    lemmtized = [WordNetLemmatizer().lemmatize(i_token, pos="n") for i_token in stemmed]
    lemmtized = [WordNetLemmatizer().lemmatize(i_token, pos="v") for i_token in lemmtized]
    
    return lemmtized

In [43]:
text = " ".join(text)
print(text)

Call be Selbie, the ruthless ruler of the dangerous city, Bermingham. I like playing call of duty a lot. I was singing twinkle twinkle little star


In [44]:
tokens = preprocess_text(text)

In [45]:
tokens

['call',
 'selbi',
 'ruthless',
 'ruler',
 'danger',
 'citi',
 'bermingham',
 'like',
 'play',
 'call',
 'duti',
 'lot',
 'sing',
 'twinkl',
 'twinkl',
 'littl',
 'star']

In [46]:
vocabularies = np.unique(tokens)

In [47]:
vocabularies

array(['bermingham', 'call', 'citi', 'danger', 'duti', 'like', 'littl',
       'lot', 'play', 'ruler', 'ruthless', 'selbi', 'sing', 'star',
       'twinkl'], dtype='<U10')

# Document Term Matrix

In [54]:
#document term matrix
def document_term_matrix(list_of_new_sentence, vocabularies):
    #Initializing an empty list for all the document term matrix
    list_of_document_term_matrix = []
    
    for i_new_sentence in list_of_new_sentence:
        
        #preprocess the sentence
        new_tokens = preprocess_text(i_new_sentence)
        
        #initializing a zeros array
        dtm = np.zeros(len(vocabularies))
        
        for i_token in new_tokens:
            for index, i_vocab in enumerate(vocabularies):
                if i_vocab == i_token:
                    dtm[index] += 1
        
        #append the document term matrix to the list of document term matrix
        list_of_document_term_matrix.append(dtm)
    
    #convert to dataframe
    dtn = pd.DataFrame(data=list_of_document_term_matrix, index = [list_of_new_sentence], columns= vocabularies)
    
    return dtn

In [55]:
list_of_new_sentences = [
    'Call be Selbie, the ruthless ruler of the dangerous city, Bermingham.',
    'I like playing call of duty a lot.',
    'I was singing twinkle twinkle little star'
]
print(list_of_new_sentences)

['Call be Selbie, the ruthless ruler of the dangerous city, Bermingham.', 'I like playing call of duty a lot.', 'I was singing twinkle twinkle little star']


In [56]:
stm = document_term_matrix(list_of_new_sentences, vocabularies)

In [57]:
stm

Unnamed: 0,bermingham,call,citi,danger,duti,like,littl,lot,play,ruler,ruthless,selbi,sing,star,twinkl
"Call be Selbie, the ruthless ruler of the dangerous city, Bermingham.",1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
I like playing call of duty a lot.,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
I was singing twinkle twinkle little star,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0


# TF-IDF

In [60]:
#cOPYING THE DOCUMENT-TERM MATRIX
tfidf = stm.copy()

#divide each item in a column to its corresponding document frequency
for index_i, i_col_name in enumerate(tfidf.columns):
    i_col = tfidf.iloc[:, index_i]
    i_col_sum = sum(i_col)
    
    #iterate through items in a column
    for index_2, i_tem in enumerate(i_col):
        tfidf.iloc[index_2, index_i] /= i_col_sum

In [61]:
tfidf

Unnamed: 0,bermingham,call,citi,danger,duti,like,littl,lot,play,ruler,ruthless,selbi,sing,star,twinkl
"Call be Selbie, the ruthless ruler of the dangerous city, Bermingham.",1.0,0.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
I like playing call of duty a lot.,0.0,0.5,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
I was singing twinkle twinkle little star,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


# One Hot Encoding

In [64]:
def one_hot_encoding(vocabularies):
    
    #Initializing a zeros array
    one_hot_encoded = np.zeros([len(vocabularies), len(vocabularies)])
    
    #convert to dataframe
    one_hot_encoded = pd.DataFrame(data = one_hot_encoded, index = [vocabularies], columns = [vocabularies])
    
    #Set 1 if the word exist in the corresponding place
    for i_vocab in vocabularies:
        one_hot_encoded.loc[i_vocab,i_vocab] = 1
    return one_hot_encoded

In [65]:
one_hot_encoded = one_hot_encoding(vocabularies)

In [66]:
one_hot_encoded

Unnamed: 0,bermingham,call,citi,danger,duti,like,littl,lot,play,ruler,ruthless,selbi,sing,star,twinkl
bermingham,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
call,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
citi,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
danger,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
duti,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
like,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
littl,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
play,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
ruler,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [67]:
one_hot_encoded.loc["like"].values

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

# Word Embedding Using Word2Vec

In [94]:
from urllib import request
import bs4
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [75]:
!pip install html5lib
!pip install lxml



In [77]:
data = request.urlopen("https://en.wikipedia.org/wiki/Artificial_intelligence").read()

#improve the html tags
data = bs4.BeautifulSoup(data, "html.parser")

#get all paragraphs
paragraphs = data.find_all("p")

#convert list of bs4 lines into list of strings
article = [i.text.strip() for i in paragraphs]

In [78]:
preprocessed_tokens = [preprocess_text(i) for i in article]

In [79]:
preprocessed_tokens

[[],
 ['artifici',
  'intellig',
  'ai',
  'intellig',
  'perceiv',
  'synthes',
  'infer',
  'inform',
  'demonstr',
  'machin',
  'oppos',
  'intellig',
  'display',
  'non',
  'human',
  'anim',
  'human',
  'exampl',
  'task',
  'do',
  'includ',
  'speech',
  'recognit',
  'comput',
  'vision',
  'translat',
  'natur',
  'languag',
  'well',
  'map',
  'input'],
 ['ai',
  'applic',
  'includ',
  'advanc',
  'web',
  'search',
  'engin',
  'e',
  'g',
  'googl',
  'search',
  'recommend',
  'system',
  'use',
  'youtub',
  'amazon',
  'netflix',
  'understand',
  'human',
  'speech',
  'siri',
  'alexa',
  'self',
  'drive',
  'car',
  'e',
  'g',
  'waymo',
  'autom',
  'decis',
  'make',
  'compet',
  'highest',
  'level',
  'strateg',
  'game',
  'system',
  'chess',
  'go',
  '1'],
 ['machin',
  'becom',
  'increasingli',
  'capabl',
  'task',
  'consid',
  'requir',
  'intellig',
  'often',
  'remov',
  'definit',
  'ai',
  'phenomenon',
  'know',
  'ai',
  'effect',
  '2',
  

In [85]:
#Apply word2Vec (cBOW) to our processed data
word2vec = Word2Vec(preprocessed_tokens, 
                    sg=0,
                    vector_size = 200,
                    window = 5,
                    min_count = 2,
                    epochs = 5)

In [88]:
#Get all vocabularies
vocabularies = word2vec.wv

In [89]:
v1 = word2vec.wv['artifici']

In [90]:
v1

array([ 4.64094745e-04, -5.08764107e-03,  1.66370824e-03,  1.60986197e-03,
        5.83336502e-03, -1.46264001e-03, -2.08575255e-03,  7.54519599e-03,
        1.23912847e-04,  1.33767282e-03, -3.92607506e-03, -6.21260516e-03,
       -4.42999974e-03,  6.01584511e-03, -4.90511674e-03,  2.58610561e-03,
       -5.08543430e-03,  3.27822141e-04, -9.17160767e-04, -7.47374352e-03,
        5.19167492e-03,  2.16707564e-03,  1.49909325e-03,  2.43359199e-03,
        2.84425594e-04,  2.92097381e-03,  2.93877302e-03, -7.26343598e-03,
       -3.64324567e-03,  1.09590546e-04,  2.72129313e-03, -1.95262057e-03,
        1.59570505e-03, -3.87849961e-03,  1.71696465e-03,  3.46128689e-03,
        3.92658031e-03, -2.78212712e-03, -5.06014377e-03, -7.04482058e-03,
       -2.02313624e-03, -3.63752292e-03, -2.63935793e-03,  4.15577367e-03,
        6.75959978e-03,  1.21441507e-03, -3.70365242e-03, -3.59735801e-03,
        5.30667976e-03,  2.97671603e-03, -1.46186817e-03, -4.19936283e-03,
       -1.70419959e-03,  

In [91]:
sim_words = word2vec.wv.similar_by_word("intellig",topn=10)

In [92]:
sim_words

[('ai', 0.4771723747253418),
 ('human', 0.4464598596096039),
 ('use', 0.4456796646118164),
 ('input', 0.4322156310081482),
 ('comput', 0.4079076945781708),
 ('may', 0.39933812618255615),
 ('agent', 0.3831430673599243),
 ('network', 0.3827890455722809),
 ('mind', 0.3808494806289673),
 ('would', 0.3688828647136688)]

In [93]:
#Get words that are similar to human but dissimilar to machine
result = word2vec.wv.most_similar(positive=["human"], negative = ["machin"])
result

[('whether', 0.2402685135602951),
 ('futur', 0.22539667785167694),
 ('artifici', 0.20639830827713013),
 ('race', 0.20545828342437744),
 ('inspir', 0.18519648909568787),
 ('capabl', 0.18086472153663635),
 ('person', 0.16968011856079102),
 ('comput', 0.1606058031320572),
 ('feel', 0.15891115367412567),
 ('discoveri', 0.15303699672222137)]

In [110]:
# #Applying PCA to our applied vectors

# x = word2vec.wv

# #Applying  PCA
# pca = PCA(n_components=2)
# result = pca.fit_transform(x)

# plt.figure(figsize=(50,50))
# plt.scatter(result[:,0],result[:,1])
# words = list(word2vec.wv)

# for i, word in enumerate(words):
#     plt.annotate(word, xy = (result[i,0], result[i,1]))
