In [1]:
# NLP Pipeline
## 1. Text Information
str_data = "<html><h2>What is nlp??? </h2></html> \nNatural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.\nThe study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers.\n(In this post), you will discover what natural language processing is and why it is so important.\nAfter reading this post, you will know => What natural language is and how it is different from other types of data."


In [2]:
## 2. Text Cleaning
from bs4 import BeautifulSoup
import string

def remove_html(text_data):
  soup = BeautifulSoup(text_data, 'lxml')
  return soup.get_text()

def remove_punctuation(text):
  sent = []
  for t in text.split(' '):
    no_punct = "".join([c for c in t if c not in string.punctuation])
    sent.append(no_punct)
  sentence = " ".join(s for s in sent)
  return sentence

def lower_sentence(text):
  return text.lower()

processed_text = remove_html(str_data)
rm_punc_sentence = remove_punctuation(processed_text)
sentence = lower_sentence(rm_punc_sentence)
print(sentence)

what is nlp  
natural language processing or nlp for short is broadly defined as the automatic manipulation of natural language like speech and text by software
the study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers
in this post you will discover what natural language processing is and why it is so important
after reading this post you will know  what natural language is and how it is different from other types of data


In [3]:
## 3. Tokenization & Text Lemmatization
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence.strip())
tok_lem_sentence = [(token.text, token.lemma_) for token in doc]
print(tok_lem_sentence[:15])

[('what', 'what'), ('is', 'be'), ('nlp', 'nlp'), (' \n', ' \n'), ('natural', 'natural'), ('language', 'language'), ('processing', 'processing'), ('or', 'or'), ('nlp', 'nlp'), ('for', 'for'), ('short', 'short'), ('is', 'be'), ('broadly', 'broadly'), ('defined', 'define'), ('as', 'as')]


In [4]:
## 4. Removing Stopwords  : 대명사
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(len(stop_words))

lem_sentence = [token.lemma_ for token in doc]
rmv_sw_sentence = [w for w in lem_sentence if not w in stop_words]
removed_word = [w for w in lem_sentence if not w in rmv_sw_sentence]
print(lem_sentence)
print(rmv_sw_sentence)
print(removed_word)

179
['what', 'be', 'nlp', ' \n', 'natural', 'language', 'processing', 'or', 'nlp', 'for', 'short', 'be', 'broadly', 'define', 'as', 'the', 'automatic', 'manipulation', 'of', 'natural', 'language', 'like', 'speech', 'and', 'text', 'by', 'software', '\n', 'the', 'study', 'of', 'natural', 'language', 'processing', 'have', 'be', 'around', 'for', 'more', 'than', '50', 'year', 'and', 'grow', 'out', 'of', 'the', 'field', 'of', 'linguistic', 'with', 'the', 'rise', 'of', 'computer', '\n', 'in', 'this', 'post', 'you', 'will', 'discover', 'what', 'natural', 'language', 'processing', 'be', 'and', 'why', 'it', 'be', 'so', 'important', '\n', 'after', 'read', 'this', 'post', 'you', 'will', 'know', ' ', 'what', 'natural', 'language', 'be', 'and', 'how', 'it', 'be', 'different', 'from', 'other', 'type', 'of', 'datum']
['nlp', ' \n', 'natural', 'language', 'processing', 'nlp', 'short', 'broadly', 'define', 'automatic', 'manipulation', 'natural', 'language', 'like', 'speech', 'text', 'software', '\n', 's

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
## 5. Encoding
import numpy as np
dictionary = {}

def make_frequency_dict(text):
  # Count the frequency
  for word in text:
    if word not in dictionary:
      dictionary[word] = 0
    dictionary[word] += 1

make_frequency_dict(rmv_sw_sentence)
vocab_sorted = sorted(dictionary.items(), key=lambda x:x[1], reverse = True)
print(vocab_sorted)

[('natural', 5), ('language', 5), ('processing', 3), ('\n', 3), ('nlp', 2), ('post', 2), (' \n', 1), ('short', 1), ('broadly', 1), ('define', 1), ('automatic', 1), ('manipulation', 1), ('like', 1), ('speech', 1), ('text', 1), ('software', 1), ('study', 1), ('around', 1), ('50', 1), ('year', 1), ('grow', 1), ('field', 1), ('linguistic', 1), ('rise', 1), ('computer', 1), ('discover', 1), ('important', 1), ('read', 1), ('know', 1), (' ', 1), ('different', 1), ('type', 1), ('datum', 1)]


In [6]:
word_to_index = {}
i = 0

for (word, frequency) in vocab_sorted:
  # Cleaning: remove if frequency is less than 2
  if frequency > 1:
    i += 1
    word_to_index[word] = i
print(word_to_index)

{'natural': 1, 'language': 2, 'processing': 3, '\n': 4, 'nlp': 5, 'post': 6}


In [7]:
word_to_index['OOV'] = len(word_to_index) + 1
print(word_to_index)

{'natural': 1, 'language': 2, 'processing': 3, '\n': 4, 'nlp': 5, 'post': 6, 'OOV': 7}


In [8]:
encoded = []
print(rmv_sw_sentence)
for w in rmv_sw_sentence:
  encoded.append(word_to_index.get(w, word_to_index['OOV']))
print(encoded)

['nlp', ' \n', 'natural', 'language', 'processing', 'nlp', 'short', 'broadly', 'define', 'automatic', 'manipulation', 'natural', 'language', 'like', 'speech', 'text', 'software', '\n', 'study', 'natural', 'language', 'processing', 'around', '50', 'year', 'grow', 'field', 'linguistic', 'rise', 'computer', '\n', 'post', 'discover', 'natural', 'language', 'processing', 'important', '\n', 'read', 'post', 'know', ' ', 'natural', 'language', 'different', 'type', 'datum']
[5, 7, 1, 2, 3, 5, 7, 7, 7, 7, 7, 1, 2, 7, 7, 7, 7, 4, 7, 1, 2, 3, 7, 7, 7, 7, 7, 7, 7, 7, 4, 6, 7, 1, 2, 3, 7, 4, 7, 6, 7, 7, 1, 2, 7, 7, 7]


In [9]:
# Word Embedding (rather than one-hot encoding)
!pip install gensim
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess



In [10]:
sentences = [
 "Be careful not to practice your righteousness in front of others to be seen by them If you do you will have no reward from your Father in heaven",
 "So when you give to the needy do not announce it with trumpets as the hypocrites do in the synagogues and on the streets to be honored by others Truly I tell you they have received their reward in full",
 "But when you give to the needy do not let your left hand know what your right hand is doing so that your giving may be in secret Then your Father who sees what is done in secret will reward you",
 "And when you pray do not be like the hypocrites for they love to pray standing in the synagogues and on the street corners to be seen by others Truly I tell you they have received their reward in full",
 "But when you pray go into your room close the door and pray to your Father who is unseen Then your Father who sees what is done in secret will reward you",
 "And when you pray do not keep on babbling like pagans for they think they will be heard because of their many words Do not be like them for your Father knows what you need before you ask him",
 "This then is how you should pray",
 "Our Father in heaven hallowed be your name your kingdom come your will be done on earth as it is in heaven Give us today our daily bread And forgive us our debts as we also have forgiven our debtors And lead us not into temptation but deliver us from the evil one",
 "For if you forgive other people when they sin against you your heavenly Father will also forgive you",
 "But if you do not forgive others their sins your Father will not forgive your sins",
 "When you fast do not look somber as the hypocrites do for they disfigure their faces to show others they are fasting Truly I tell you they have received their reward in full",
 "But when you fast put oil on your head and wash your face so that it will not be obvious to others that you are fasting but only to your Father who is unseen and your Father who sees what is done in secret will reward you",
 "Do not store up for yourselves treasures on earth where moths and vermin destroy and where thieves break in and steal",
 "But store up for yourselves treasures in heaven where moths and vermin do not destroy and where thieves do not break in and steal For where your treasure is there your heart will be also",
 "The eye is the lamp of the body If your eyes are healthy your whole body will be full of light",
 "But if your eyes are unhealthy your whole body will be full of darkness If then the light within you is darkness how great is that darkness",
 "No one can serve two masters Either you will hate the one and love the other or you will be devoted to the one and despise the other You cannot serve both God and money",
 "Therefore I tell you do not worry about your life what you will eat or drink or about your body what you will wear Is not life more than food and the body more than clothes",
 "Look at the birds of the air they do not sow or reap or store away in barns and yet your heavenly Father feeds them Are you not much more valuable than they",
 "Can any one of you by worrying add a single hour to your life",
 "And why do you worry about clothes See how the flowers of the field grow They do not labor or spin",
 "Yet I tell you that not even Solomon in all his splendor was dressed like one of these",
 "If that is how God clothes the grass of the field which is here today and tomorrow is thrown into the fire will he not much more clothe you—you of little faith",
 "So do not worry saying What shall we eat or What shall we drink or What shall we wear",
 "For the pagans run after all these things and your heavenly Father knows that you need them",
 "But seek first his kingdom and his righteousness and all these things will be given to you as well",
 "Therefore do not worry about tomorrow for tomorrow will worry about itself Each day has enough trouble of its own"
]

In [11]:
tokenized_sentences = [simple_preprocess(sentence) for sentence in sentences]
model = Word2Vec(sentences=tokenized_sentences, vector_size=100,
                 window=5, min_count=1, workers=4)

similar_words = model.wv.most_similar('faith', topn=5)
print(f"Words most similar to 'word2vec': {similar_words}")

similarity = model.wv.similarity('faith', 'give')
print(f"Similarity between the two ward: {similarity}")

Words most similar to 'word2vec': [('lamp', 0.28284162282943726), ('is', 0.23174726963043213), ('despise', 0.2183786928653717), ('whole', 0.2105974406003952), ('evil', 0.20408344268798828)]
Similarity between the two ward: -0.15157455205917358


In [12]:
word_vector = model.wv['faith']
print(f"Vector representation of 'word2vec': {word_vector}")

vocab = list(model.wv.index_to_key)
print(f"Vocabulary: {vocab}")

Vector representation of 'word2vec': [-0.0056755  -0.00826638 -0.00915298  0.00379124 -0.00217069  0.00963383
 -0.00820941  0.00515155  0.00953387  0.0029702  -0.00565649  0.00648765
  0.00685388 -0.00756669  0.00396194 -0.00143258  0.00219368 -0.00854774
  0.00076148 -0.0060312  -0.00681356 -0.00522335 -0.00930255 -0.00935273
 -0.00565238 -0.00412352  0.00188261 -0.00974641 -0.00310949 -0.0045515
  0.00449779  0.00462821 -0.00412186  0.00852492 -0.00706794  0.00865432
 -0.00199032  0.00070295  0.00207171  0.00690386 -0.00415363 -0.00103651
  0.00485524  0.00114699  0.00081935  0.00878539 -0.00836978 -0.00603853
 -0.00695298 -0.00761339 -0.00675022 -0.00014128 -0.00110848  0.00652218
  0.00946799 -0.00053769  0.00061736 -0.0086329   0.00348209 -0.0065323
  0.00597688  0.0002868   0.004935   -0.00662508  0.00228451 -0.00908144
 -0.00731483 -0.00247767 -0.00576591 -0.00583857 -0.0058863   0.00958526
  0.0010163  -0.0017593  -0.0034525  -0.00187125 -0.0017161  -0.00441008
 -0.00483987 -0.

In [None]:
import gensim.downloader as api
pretrained_model = api.load('word2vec-google-news-300')
model_path = '/cotent/drive/MyDrive/Colab_Notebooks/Intro2AI/word2vec-google-news-300.model'
pretrained_model.save(model_path)
print(f"Model saved to {model_path}")



In [None]:
import gensim.models.keyedvectors
pretrained_model = KeyedVectors.load(model_path)
print("Model loaded successfully")