## Count Vectorizer Code
1. Create the model
model = WhateverModelYouLike()

2. Train / Fit the model
model.fit(X_train, Y_train)

3. Check the model's performance
model.score(X_train, Y_train)
model.score(X_test, Y_test)


In [1]:
# Dependencies
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet


In [2]:
# Fetch the NLTK resources used further down in this notebook.
# Packages that are already installed are reported as up-to-date.
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize relies on
nltk.download('wordnet')  # WordNet corpus; presumably what WordNetLemmatizer relies on
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably what nltk.pos_tag relies on


[nltk_data] Downloading package punkt to /home/carlosm/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/carlosm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/carlosm/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Download the BBC text classification CSV into the working directory.
# Dataset origin: https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# -nc (no-clobber): wget skips the download when the file already exists locally.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv


In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows

df = pd.read_csv('bbc_text_cls.csv')
df.head()

In [8]:
# Separate the frame into model inputs (the 'text' column)
# and targets (the 'labels' column)

inputs = df['text']
labels = df['labels']

In [None]:
# Visualize how often each label value occurs.
# The trailing semicolon keeps the returned Axes repr out of the cell output,
# so only the figure is displayed.

labels.hist(figsize=(10, 5));

In [6]:
# Split inputs and labels into training / testing subsets.
# random_state is fixed so the split is reproducible across runs; no test_size
# is passed, so scikit-learn's default proportion applies.

inputs_train, inputs_test, Y_train, Y_test = train_test_split(
    inputs, labels, random_state=123
)


In [13]:
# Create a CountVectorizer with its default settings

vectorizer = CountVectorizer()

In [14]:
# Turn the raw documents into count matrices

X_train = vectorizer.fit_transform(inputs_train) # "fit" learns the vocabulary from the training documents only
X_test = vectorizer.transform(inputs_test) # no "fit" here: reuses the vocabulary learned in training


In [None]:
# Number of non-zero entries in the training count matrix
(X_train != 0).sum()

In [17]:
# Fraction of entries that are non-zero (a sparsity metric):
# the non-zero count divided by rows * columns of the matrix

(X_train != 0).sum() / (np.prod(X_train.shape))

# A value close to zero means a sparse representation is appropriate

0.007695239935415004

In [23]:
# Baseline: fit multinomial Naive Bayes on the count features and report
# accuracy on both splits (fit returns the fitted estimator, so it can be
# chained directly onto the constructor)

model = MultinomialNB().fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))

Train score:  0.9922062350119905
Test score:  0.9712746858168761


## Optimizing with NLP Theory

In [24]:
# Same pipeline as the baseline, but the vectorizer drops English stop words

vectorizer = CountVectorizer(stop_words='english')

# Vocabulary is learned on the training documents and reused for the test ones
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)

# fit returns the fitted estimator, so it can be chained onto the constructor
model = MultinomialNB().fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))

Train score:  0.9928057553956835
Test score:  0.9766606822262118


### Lemmatization

In [30]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V' and 'R' map to adjective, verb and adverb
    respectively; every other tag (including those starting with 'N') maps
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Nouns and anything unrecognized fall back to NOUN
    return wordnet.NOUN

In [34]:
class LemmaTokenizer:
    """Callable tokenizer that returns the WordNet lemma of every token in a document."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize ``doc``, POS-tag the tokens and return their lemmas as a list."""
        tagged_tokens = nltk.pos_tag(word_tokenize(doc))
        lemmas = []
        for token, treebank_tag in tagged_tokens:
            # The lemmatizer takes a WordNet POS, so convert the Treebank tag first
            wordnet_pos = get_wordnet_pos(treebank_tag)
            lemmas.append(self.wnl.lemmatize(token, pos=wordnet_pos))
        return lemmas
    

In [45]:
# Using lemmatization
# token_pattern=None: the custom tokenizer replaces the default regex pattern,
# and leaving the pattern set makes scikit-learn warn that it will be ignored.
# NOTE(review): stop_words='english' is matched against the tokenizer's output,
# so the list may not line up with lemmatized tokens — confirm this is acceptable.

vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(), token_pattern=None, stop_words='english'
)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)
model = MultinomialNB()
model.fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))



Train score:  0.9934052757793765
Test score:  0.9694793536804309


### Stemming

In [41]:
class StemTokenizer:
    """Callable tokenizer that returns the Porter stem of every token in a document."""

    def __init__(self):
        self.porter = PorterStemmer()

    def __call__(self, doc):
        """Tokenize ``doc`` and return the list of stemmed tokens."""
        return list(map(self.porter.stem, word_tokenize(doc)))
    

In [46]:
# Using Stemming
# token_pattern=None: the custom tokenizer replaces the default regex pattern,
# and leaving the pattern set makes scikit-learn warn that it will be ignored.
# NOTE(review): stop_words='english' is matched against the tokenizer's output,
# so the list may not line up with stemmed tokens — confirm this is acceptable.

vectorizer = CountVectorizer(
    tokenizer=StemTokenizer(), token_pattern=None, stop_words='english'
)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)
model = MultinomialNB()
model.fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))



Train score:  0.9922062350119905
Test score:  0.9730700179533214


### Simple Tokenizer (Split)

In [43]:
def simple_tokenizer(s):
    """Split a document into tokens on runs of whitespace."""
    tokens = s.split()
    return tokens

In [44]:
# Using simple tokenizer
# token_pattern=None: the custom tokenizer replaces the default regex pattern,
# and leaving the pattern set makes scikit-learn warn that it will be ignored.

vectorizer = CountVectorizer(tokenizer=simple_tokenizer, token_pattern=None)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)
model = MultinomialNB()
model.fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))



Train score:  0.9952038369304557
Test score:  0.9712746858168761
