## TF-IDF Vectorizer Code
1. Create the model
model = WhateverModelYouLike()

2. Train / Fit the model
model.fit(X_train, Y_train)

3. Check the model's performance
model.score(X_train, Y_train)
model.score(X_test, Y_test)


In [6]:
# Dependencies
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet


In [2]:
# Downloads
# Recent NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older pickled 'punkt' /
# 'averaged_perceptron_tagger' packages. Fetch both generations so that
# word_tokenize() and pos_tag() work whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to /home/carlosm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/carlosm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/carlosm/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Dataset reference (Kaggle): https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv


In [3]:
# Creating dataset: load the CSV into a DataFrame and preview the first rows

df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [23]:
# Separating columns (inputs, targets)

inputs = df['text']  # document text, used as the model input
labels = df['labels']  # label of each document, used as the target

print(inputs)

0       Ad sales boost Time Warner profit\n\nQuarterly...
1       Dollar gains on Greenspan speech\n\nThe dollar...
2       Yukos unit buyer faces loan claim\n\nThe owner...
3       High fuel prices hit BA's profits\n\nBritish A...
4       Pernod takeover talk lifts Domecq\n\nShares in...
                              ...                        
2220    BT program to beat dialler scams\n\nBT is intr...
2221    Spam e-mails tempt net shoppers\n\nComputer us...
2222    Be careful how you code\n\nA new European dire...
2223    US cyber security chief resigns\n\nThe man mak...
2224    Losing yourself in online gaming\n\nOnline rol...
Name: text, Length: 2225, dtype: object


In [None]:
# Visualizing how many documents each label has.
# The labels are categorical strings, so count them and draw one bar per
# class instead of binning them with a numeric histogram.

ax = labels.value_counts().plot.bar(figsize=(10, 5), rot=0)
ax.set(title="Documents per label", xlabel="Label", ylabel="Number of documents");

In [5]:
# Getting training / testing datasets
# No test_size is passed, so scikit-learn's default split applies; the fixed
# random_state makes the shuffle reproducible across runs.

inputs_train, inputs_test, Y_train, Y_test = train_test_split(
    inputs, labels, random_state=123
)


In [7]:
# Creating the vectorizer object (default settings)

vectorizer = TfidfVectorizer()

In [9]:
# Creating X as vectors

X_train = vectorizer.fit_transform(inputs_train) # fit_transform learns the vocabulary from the training texts and vectorizes them
X_test = vectorizer.transform(inputs_test) # transform only: reuses the vocabulary learned from the training texts


In [10]:
# Number of non-zero entries in the training matrix
(X_train != 0).sum()

337411

In [12]:
# Fraction of entries that are non-zero (density of the training matrix)

(X_train != 0).sum() / (np.prod(X_train.shape))

# A value close to zero means a sparse representation is appropriate

0.007695239935415004

In [14]:
# Baseline: Multinomial Naive Bayes trained on the plain TF-IDF features.

model = MultinomialNB().fit(X_train, Y_train)  # fit() returns the fitted estimator

# Accuracy on the data the model was trained on vs. the held-out split.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)
print("Train score: ", train_score)
print("Test score: ", test_score)

Train score:  0.9802158273381295
Test score:  0.933572710951526


## Optimizing with NLP Theory

In [15]:
# Same pipeline, but with scikit-learn's built-in English stop-word list
# filtered out of the vocabulary.

vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)

model = MultinomialNB().fit(X_train, Y_train)  # fit() returns the fitted estimator

train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)
print("Train score: ", train_score)
print("Test score: ", test_score)

Train score:  0.9892086330935251
Test score:  0.9605026929982047


### Lemmatization

In [16]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V' and 'R' map to adjective, verb and adverb
    respectively. Every other tag (including those starting with 'N') maps
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Noun is both the explicit 'N' case and the fallback for unknown tags.
    return wordnet.NOUN

In [17]:
class LemmaTokenizer:
    """Callable tokenizer that returns the WordNet lemma of every token.

    Instances can be passed as the ``tokenizer`` argument of a vectorizer.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize ``doc``, POS-tag the tokens and return their lemmas as a list."""
        tagged_tokens = nltk.pos_tag(word_tokenize(doc))
        lemmas = []
        for token, treebank_tag in tagged_tokens:
            # The lemmatizer expects a WordNet POS, not a Treebank tag.
            wn_pos = get_wordnet_pos(treebank_tag)
            lemmas.append(self.wnl.lemmatize(token, pos=wn_pos))
        return lemmas
    

In [18]:
# Using lemmatization
#
# token_pattern=None: a custom tokenizer replaces the regex-based token
# pattern, so the default pattern would go unused and scikit-learn would
# emit a UserWarning about it. Passing None states that intent explicitly.
# NOTE: the stop-word list is applied to the tokenizer's output, so stop
# words whose lemma differs from the listed form are not filtered out.

vectorizer = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    token_pattern=None,
    stop_words='english',
)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)
model = MultinomialNB()
model.fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))



Train score:  0.9886091127098321
Test score:  0.9461400359066428


### Stemming

In [19]:
class StemTokenizer:
    """Callable tokenizer that returns the Porter stem of every token.

    Instances can be passed as the ``tokenizer`` argument of a vectorizer.
    """

    def __init__(self):
        self.porter = PorterStemmer()

    def __call__(self, doc):
        """Tokenize ``doc`` and return the list of stemmed tokens."""
        stem = self.porter.stem
        return list(map(stem, word_tokenize(doc)))
    

In [20]:
# Using Stemming
#
# token_pattern=None: a custom tokenizer replaces the regex-based token
# pattern, so the default pattern would go unused and scikit-learn would
# emit a UserWarning about it. Passing None states that intent explicitly.
# NOTE: the stop-word list is applied to the tokenizer's output, so stop
# words whose stem differs from the listed form are not filtered out.

vectorizer = TfidfVectorizer(
    tokenizer=StemTokenizer(),
    token_pattern=None,
    stop_words='english',
)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)
model = MultinomialNB()
model.fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))



Train score:  0.9856115107913669
Test score:  0.9461400359066428


### Simple Tokenizer (Split)

In [21]:
def simple_tokenizer(s):
    """Split ``s`` on runs of whitespace and return the resulting list of tokens."""
    tokens = s.split()
    return tokens

In [22]:
# Using simple tokenizer
#
# token_pattern=None: a custom tokenizer replaces the regex-based token
# pattern, so the default pattern would go unused and scikit-learn would
# emit a UserWarning about it. Passing None states that intent explicitly.

vectorizer = TfidfVectorizer(tokenizer=simple_tokenizer, token_pattern=None)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)
model = MultinomialNB()
model.fit(X_train, Y_train)

print("Train score: ", model.score(X_train, Y_train))
print("Test score: ", model.score(X_test, Y_test))



Train score:  0.9766187050359713
Test score:  0.9120287253141831
