<a href="https://colab.research.google.com/github/apoorvakarnwal/google-colab/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BAG OF WORDS MODEL**

In [1]:
class Category:
  BOOKS="BOOKS"
  CLOTHING="CLOTHING"

train_x= ["I love the book", "This is a great book", "The fit is great", "I love the shoes"]
train_y= [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

In [2]:
#unigram (tokenise 1 word)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer= CountVectorizer(binary=True)
X=vectorizer.fit_transform(train_x)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['book' 'fit' 'great' 'is' 'love' 'shoes' 'the' 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


In [3]:
from sklearn import svm
clf_svm= svm.SVC(kernel='linear')
clf_svm.fit(X, train_y)

test_x= vectorizer.transform(["I like reading the book"])
clf_svm.predict(test_x)

array(['BOOKS'], dtype='<U8')

In [4]:
#bigram (tokenise 2 word)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer= CountVectorizer(binary=True, ngram_range=(2,2))
X=vectorizer.fit_transform(train_x)
print(vectorizer.get_feature_names_out())
print(X.toarray())

from sklearn import svm
clf_svm= svm.SVC(kernel='linear')
clf_svm.fit(X, train_y)

test_x= vectorizer.transform(["I like reading the book"])
clf_svm.predict(test_x)

['fit is' 'great book' 'is great' 'love the' 'the book' 'the fit'
 'the shoes' 'this is']
[[0 0 0 1 1 0 0 0]
 [0 1 1 0 0 0 0 1]
 [1 0 1 0 0 1 0 0]
 [0 0 0 1 0 0 1 0]]


array(['BOOKS'], dtype='<U8')

# **WORD VECTORS**

In [31]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
import spacy

nlp=spacy.load("en_core_web_md")

In [7]:
print(train_x)

['I love the book', 'This is a great book', 'The fit is great', 'I love the shoes']


In [27]:
docs = [nlp(text) for text in train_x]
train_x_word_vec= [doc.vector for doc in docs]
clf_svm_word_vec= svm.SVC(kernel='linear')
clf_svm_word_vec.fit(train_x_word_vec, train_y)

In [30]:
test_x = ["I love story books"]
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors = [x.vector for x in test_docs]
clf_svm_word_vec.predict(test_x_word_vectors)

array(['BOOKS'], dtype='<U8')

# **STEMMING/LEMMATIZATION**

In [6]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer= PorterStemmer()

phrase="reading the books of stories"

words= word_tokenize(phrase)
print(words)

stemmed_words=[]
for word in words:
  stemmed_words.append(stemmer.stem(word))

print(stemmed_words)

['reading', 'the', 'books', 'of', 'stories']
['read', 'the', 'book', 'of', 'stori']


In [18]:
from nltk.stem import WordNetLemmatizer

lemmatizer= WordNetLemmatizer()

phrase="reading the books of stories"

words= word_tokenize(phrase)

print(words)

lemmatized_words=[]
for word in words:
  lemmatized_words.append(lemmatizer.lemmatize(word, pos='n'))
  #lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))

print(lemmatized_words)

['reading', 'the', 'books', 'of', 'stories']
['reading', 'the', 'book', 'of', 'story']


# **Stopwords Removal**
The a set of most common words in english: This, that, he, it

In [25]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# print(stopwords.fileids())

stop_words= stopwords.words('english')

phrase='This is an example sentence that demonstrate stopwords removal.'

words=word_tokenize(phrase)

filtered_words=[]
for w in words:
  if w not in stop_words:
    filtered_words.append(w)

print(filtered_words)

['This', 'example', 'sentence', 'demonstrate', 'stopwords', 'removal', '.']


# **Spell Correction, sentiment & POS taging**


In [29]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [35]:
from textblob import TextBlob

phrase= "helo, i am a nic persan"

tb_phrase = TextBlob(phrase)
tb_spellChecked=tb_phrase.correct()
print(tb_spellChecked)


print(tb_phrase.tags)

print(tb_spellChecked.sentiment)


held, i am a nice person
[('helo', 'NN'), ('i', 'NN'), ('am', 'VBP'), ('a', 'DT'), ('nic', 'JJ'), ('persan', 'NN')]
Sentiment(polarity=0.6, subjectivity=1.0)


# **Transformers Architecture**

In [6]:
!pip install spacy-transformers
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import spacy
import torch

nlp=spacy.load("en_core_web_md")

doc= nlp("Here is some text to encode.")


In [2]:
class Category:
  BOOKS="BOOKS"
  BANK="BANK"

train_x= ["good character and plot progression", "check out the book", "good story would recommend", "novel recommendation", "need to make a deposit", "balance inquiry", "save money"]
train_y= [Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BOOKS, Category.BANK, Category.BANK, Category.BANK]

In [3]:
from sklearn import svm

docs = [nlp(text) for text in train_x]
train_x_vectors = [doc.vector for doc in docs]

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

test_x = ["books"]
docs_test = [nlp(text) for text in test_x]
test_x_vector = [doc.vector for doc in docs_test]

prediction = clf_svm.predict(test_x_vector)
print(prediction)

['BOOKS']
