In [None]:
#Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB    #for using the Multinomial Naive Bayes Classifier Model 
from sklearn.model_selection import train_test_split  

In [None]:
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import wordnet

In [None]:
#downloading necessary data from NLTK
nltk.download("wordnet")  # lexical database read by WordNetLemmatizer
nltk.download('punkt')  # tokenizer models for word_tokenize (name used by older NLTK releases)
nltk.download('averaged_perceptron_tagger')  # tagger model for nltk.pos_tag (name used by older NLTK releases)
# NOTE(review): newer NLTK releases (3.9+) look the tokenizer and tagger data up
# under the names below and raise LookupError when only the legacy packages are
# installed. Fetching both sets keeps the notebook runnable on either side of
# the rename; confirm against the NLTK version pinned for this project.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
data = pd.read_csv('bbc_text_cls.csv')

In [None]:
data.head()

In [None]:
X = data['text'] #variables
Y = data['labels'] #labels

In [None]:
Y.hist(figsize=(10,5)); #shows the data is balanced

In [None]:
train,test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.25,random_state = 0)

In [None]:
vectorizer = CountVectorizer()

In [None]:
X_train = vectorizer.fit_transform(train)
X_test = vectorizer.transform(test)

In [None]:
#now the text has been converted into numbers and vectors in the form of sparse matrix
X_train  

In [None]:
(X_train!=0).sum() #gives the total number of non zero numbers in the sparse matrix

In [None]:
X_train.shape #gives the number of rows and columns

In [None]:
np.prod(X_train.shape) #gives product ie rows*columns which is total number of elements

In [None]:
#what percentage of values are non-zero
(X_train!=0).sum()/np.prod(X_train.shape)

In [None]:
classifier = MultinomialNB()
classifier.fit(X_train,Y_train)

In [None]:
print("train score:",classifier.score(X_train,Y_train))
print("test score:",classifier.score(X_test,Y_test)) 
#this is the score we got without applying stopwords or lemmatizing or stemming

In [None]:
#with stopwords
# Rebuild the bag-of-words features using the built-in English stop-word list,
# retrain Multinomial Naive Bayes, and report the score on both splits.
vectorizer = CountVectorizer(stop_words='english')
X_train, X_test = vectorizer.fit_transform(train), vectorizer.transform(test)
classifier = MultinomialNB().fit(X_train, Y_train)  # fit() returns the fitted estimator
print("train score:", classifier.score(X_train, Y_train))
print("test score:", classifier.score(X_test, Y_test))

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    # (tag prefix, name of the attribute on `wordnet`) in the order they are checked.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, pos_attr in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, pos_attr)
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
class LemmaTokenizer:
    """Callable that tokenizes a document and lemmatizes every token.

    Instances are meant to be handed to CountVectorizer via ``tokenizer=``.
    """

    def __init__(self):
        # One lemmatizer instance is reused across all documents.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmas for ``doc``, one per token, in order."""
        # Tokenize, then POS-tag so each token can be lemmatized with its tag.
        tagged_tokens = nltk.pos_tag(word_tokenize(doc))
        lemmas = []
        for token, tag in tagged_tokens:
            lemmas.append(self.wnl.lemmatize(token, pos=get_wordnet_pos(tag)))
        return lemmas
    

In [None]:
#with Lemmatizer
# A custom tokenizer replaces CountVectorizer's regex-based tokenization, so
# the default token_pattern is never used. Passing token_pattern=None makes
# that explicit and avoids the UserWarning scikit-learn emits when a tokenizer
# is supplied while token_pattern is still set.
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None)
# Learn the vocabulary from the training texts only, then reuse it for the test texts.
X_train = vectorizer.fit_transform(train)
X_test = vectorizer.transform(test)
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
print("train score:", classifier.score(X_train, Y_train))
print("test score:", classifier.score(X_test, Y_test))