In [None]:
#9A
''' 
Aim: Implement Naive Bayes classifier.'''

import pandas as pd
import numpy as np
import re
import nltk

# Fetch the NLTK stop-word list that the text-cleaning step below relies on.
nltk.download('stopwords')

# Read the SMS spam dataset (latin-1 encoded) and rename the raw 'v1'/'v2'
# columns to 'Category' (label) and 'Message' (text) in a single chained call.
# NOTE(review): this is an absolute local path; point it at your own copy of spam.csv.
SPAM_CSV_PATH = "C:/Users/Artophilic/Datascience Bootcamp/Practical/NLP_pract/spam.csv"
sms_data = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1').rename(
    columns={'v1': 'Category', 'v2': 'Message'}
)

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemming = PorterStemmer()

# Build the stop-word set once. Re-creating it inside the comprehension
# (as before) re-read the stop-word list for every single word of every message.
stop_words = set(stopwords.words('english'))

# corpus[i] is the cleaned, stemmed, space-joined text of the i-th message.
corpus = []
# Iterate over the column values directly instead of label-indexing with
# range(len(...)), so the loop does not depend on the frame having a 0..n-1 index.
for message in sms_data['Message']:
    # Replace every non-letter with a space, lower-case, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words, then reduce each remaining word to its Porter stem.
    stems = [stemming.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stems))

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: fit the vocabulary on the cleaned corpus and count
# how often each vocabulary term occurs in each message.
countvectorizer = CountVectorizer()
term_counts = countvectorizer.fit_transform(corpus)

# x: dense document-term count matrix; y: the 'Category' label of each message.
x = term_counts.toarray()
y = sms_data['Category'].values

print(x)
print(y)

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Hold out 30% of the messages for testing. Stratifying on y keeps the class
# proportions the same in both splits; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

# Fit a Multinomial Naive Bayes classifier on the training word counts.
multinomialnb = MultinomialNB().fit(x_train, y_train)

# Predict the label of every held-out message.
y_pred = multinomialnb.predict(x_test)
print(y_pred)

# Summarise the results: per-class precision/recall/F1, then overall accuracy.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(report)
print("accuracy_score: ", accuracy)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99      1448
        spam       0.89      0.92      0.91       224

    accuracy                           0.97      1672
   macro avg       0.94      0.95      0.95      1672
weighted avg       0.98      0.97      0.98      1672

accuracy_score:  0.9748803827751196


In [None]:
#9B
''' 
Aim: Part-of-speech (POS) tagging
It assigns a grammatical label (noun, verb, adjective, ...) to each word in a sentence.
In NLTK, the pos_tag() function assigns the POS tags.'''


#1. Part-of-speech tagging using spaCy
import spacy 
import spacy.attrs #Contains integer values for linguistic features like POS,TAG


# Load the small English pipeline once. The original loaded the same model
# twice; `nlp` and `sp` are kept as two names for the single loaded pipeline.
nlp = spacy.load("en_core_web_sm")
sp = nlp

# Run the pipeline over the text to get a spaCy Doc object.
sen = sp(u"I like to play cricket. I hated it in my childhood though")
print(sen.text)  # The original text of the Doc


# sen[7] is the 8th token of the Doc ("hated").
print(sen[7].pos_)  # Coarse-grained POS tag (like VERB)
print(sen[7].tag_)  # Fine-grained POS tag (like VBD = verb, past tense)
print(spacy.explain(sen[7].tag_))  # Human-readable explanation of the fine-grained tag

# Tabulate every token of the Doc: text, coarse POS, fine-grained tag and the
# tag's explanation, padded into columns by the f-string width specifiers.
for word in sen:
    print(f'{word.text:{12}}{word.pos_:{10}}{word.tag_:{8}}{spacy.explain(word.tag_)}')

# The same surface form can receive different tags depending on context:
# "google" is looked up first in a sentence where it is used as a verb
# (token 2) and then in one where it is used as a noun (token 5).
for text, index in ((u'Can you google it?', 2), (u'Can you search it on google?', 5)):
    sen = sp(text)
    word = sen[index]
    print(f'{word.text:{12}} {word.pos_:{10}} {word.tag_:{8}} {spacy.explain(word.tag_)}')

# Count how many tokens carry each coarse-grained POS tag.
sen = sp(u"I like to play football. I hated it in my childhood though")

# count_by returns a mapping from integer POS id to its frequency in the Doc.
num_pos = sen.count_by(spacy.attrs.POS)

# Report the counts in ascending id order:
#   k                 - integer POS id
#   sen.vocab[k].text - the id converted back to its text label (like VERB, NOUN)
#   v                 - how many tokens have that POS
ordered_counts = sorted(num_pos.items())
for k, v in ordered_counts:
    print(f'{k}. {sen.vocab[k].text:{8}}: {v}')

#Visualizing parts of speech tags
from spacy import displacy

# The u prefix marks the literal as a Unicode string.
sen = sp(u"I like to play football. I hated it in my childhood though")

# displacy.render (rather than displacy.serve) draws the figure inline:
#   style='dep'  - show the dependency parse, i.e. how the words relate grammatically
#   'distance'   - spacing between words, widened here for better visibility
render_options = {'distance': 120}
displacy.render(sen, style='dep', jupyter=True, options=render_options)

I like to play cricket. I hated it in my childhood though
VERB
VBD
verb, past tense
I           PRON      PRP     pronoun, personal
like        VERB      VBP     verb, non-3rd person singular present
to          PART      TO      infinitival "to"
play        VERB      VB      verb, base form
cricket     NOUN      NN      noun, singular or mass
.           PUNCT     .       punctuation mark, sentence closer
I           PRON      PRP     pronoun, personal
hated       VERB      VBD     verb, past tense
it          PRON      PRP     pronoun, personal
in          ADP       IN      conjunction, subordinating or preposition
my          PRON      PRP$    pronoun, possessive
childhood   NOUN      NN      noun, singular or mass
though      SCONJ     IN      conjunction, subordinating or preposition
google       VERB       VB       verb, base form
google       PROPN      NNP      noun, proper singular
85. ADP     : 1
92. NOUN    : 2
94. PART    : 1
95. PRON    : 4
97. PUNCT   : 1
98. SCONJ   : 1


In [None]:
#2. Part-of-speech tagging using NLTK
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
''' 
state_union: NLTK corpus of State of the Union speeches.
PunktSentenceTokenizer: An unsupervised sentence tokenizer; passing text to its constructor trains it on that text.'''

# Fetch the NLTK resources used by this cell: the speech corpus plus the
# tokenizer and tagger data.
for resource in ('state_union', 'punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# One speech is used to train the sentence tokenizer; a different speech is
# the text that actually gets split and tagged.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Train Punkt on the 2005 speech, then split the 2006 speech into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content(sentences=None, limit=2):
    """Print the part-of-speech tags of the first few sentences.

    Args:
        sentences: Sequence of sentence strings to tag. When None (the
            default) the module-level ``tokenized`` list is used, so the
            original zero-argument call keeps working.
        limit: How many leading sentences to tag (default 2). ``None``
            tags every sentence.

    Returns:
        None. One list of ``(token, tag)`` tuples is printed per sentence.

    Any exception raised while tokenizing or tagging is reported by printing
    its message rather than being propagated (best-effort, as before).
    """
    try:
        # Resolve the global inside the try block so that a missing
        # `tokenized` is reported the same way as any other failure.
        if sentences is None:
            sentences = tokenized
        for sentence in sentences[:limit]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


# Tag the first two sentences of the sample speech.
process_content()


[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [None]:
#