<div>
<img src=https://www.institutedata.com/wp-content/uploads/2019/10/iod_h_tp_primary_c.svg width="300">
</div>

# Lab 9.7: Text Classification
INSTRUCTIONS:
- Run the cells
- Observe and understand the results
- Answer the questions

## Import libraries

In [1]:
## Import Libraries
import numpy as np
import pandas as pd

import string
import spacy

from collections import Counter

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# import warnings
# warnings.filterwarnings('ignore')

## Load data

Sample:

    __label__2 Stuning even for the non-gamer: This sound ...
    __label__2 The best soundtrack ever to anything.: I'm ...
    __label__2 Amazing!: This soundtrack is my favorite m ...
    __label__2 Excellent Soundtrack: I truly like this so ...
    __label__2 Remember, Pull Your Jaw Off The Floor Afte ...
    __label__2 an absolute masterpiece: I am quite sure a ...
    __label__1 Buyer beware: This is a self-published boo ...
    . . .
    
There are only two **labels**:
- `__label__1`
- `__label__2`

In [3]:
## Loading the data

trainDF = pd.read_fwf(
    filepath_or_buffer = 'C:/Users/carls/OneDrive/Desktop/IOD/Data/corpus.txt',
    colspecs = [(9, 10),   # label: get only the numbers 1 or 2
                (11, 9000) # text: makes the it big enought to get to the end of the line
               ], 
    header = 0,
    names = ['label', 'text'],
    lineterminator = '\n'
)

# convert label from [1, 2] to [0, 1]
trainDF['label'] = trainDF['label'] - 1

## Inspect the data

In [4]:
trainDF.head()

Unnamed: 0,label,text
0,1,The best soundtrack ever to anything.: I'm rea...
1,1,Amazing!: This soundtrack is my favorite music...
2,1,Excellent Soundtrack: I truly like this soundt...
3,1,"Remember, Pull Your Jaw Off The Floor After He..."
4,1,an absolute masterpiece: I am quite sure any o...


In [5]:
trainDF.shape

(9999, 2)

In [6]:
trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   9999 non-null   int64 
 1   text    9999 non-null   object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


## Split the data into train and test

In [10]:
## ANSWER
## split the dataset
X = trainDF['text']
y = trainDF['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

## Feature Engineering

### Count Vectors as features

In [11]:
# create a count vectorizer object
count_vect = CountVectorizer(token_pattern = r'\w{1,}')

# Learn a vocabulary dictionary of all tokens in the raw documents
count_vect.fit(trainDF['text'])

# Transform documents to document-term matrix.
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

### TF-IDF Vectors as features
- Word level
- N-Gram level
- Character level

In [12]:
%%time
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer = 'word',
                             token_pattern = r'\w{1,}',
                             max_features = 5000)
print(tfidf_vect)

tfidf_vect.fit(trainDF['text'])
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf  = tfidf_vect.transform(X_test)

TfidfVectorizer(max_features=5000, token_pattern='\\w{1,}')
Wall time: 1.82 s


In [15]:
print(tfidf_vect.get_feature_names())



In [13]:
%%time
# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer = 'word',
                                   token_pattern = r'\w{1,}',
                                   ngram_range = (2, 3),
                                   max_features = 5000)
print(tfidf_vect_ngram)

tfidf_vect_ngram.fit(trainDF['text'])
X_train_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
X_test_tfidf_ngram  = tfidf_vect_ngram.transform(X_test)

TfidfVectorizer(max_features=5000, ngram_range=(2, 3), token_pattern='\\w{1,}')
Wall time: 8.16 s


In [16]:
print(tfidf_vect_ngram.get_feature_names())

['1 2', '1 star', '10 minutes', '10 years', '100 years', '15 minutes', '1984 is', '2 0', '2 3', '2 and', '2 hours', '2 months', '2 stars', '2 weeks', '20th century', '3 months', '3 weeks', '30 minutes', '4 5', '4 stars', '4 year', '4 year old', '451 is', '5 1', '5 star', '5 stars', '5 year', '6 months', '70 s', '80 s', 'a 4', 'a 5', 'a bad', 'a beautiful', 'a better', 'a big', 'a big fan', 'a bit', 'a bit of', 'a book', 'a book about', 'a book that', 'a bunch', 'a bunch of', 'a cd', 'a chance', 'a chance to', 'a character', 'a charm', 'a cheap', 'a child', 'a christmas', 'a class', 'a classic', 'a collection', 'a collection of', 'a complete', 'a copy', 'a couple', 'a couple of', 'a day', 'a decent', 'a different', 'a disappointment', 'a dvd', 'a family', 'a fan', 'a fan of', 'a fantastic', 'a fascinating', 'a favor', 'a favor and', 'a few', 'a few days', 'a few times', 'a few years', 'a film', 'a fine', 'a first', 'a friend', 'a full', 'a fun', 'a game', 'a gift', 'a good', 'a good boo

In [14]:
%%time
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer = 'char',
                                         token_pattern = r'\w{1,}',
                                         ngram_range = (2, 3),
                                         max_features = 5000)
print(tfidf_vect_ngram_chars)

tfidf_vect_ngram_chars.fit(trainDF['text'])
X_train_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_train)
X_test_tfidf_ngram_chars  = tfidf_vect_ngram_chars.transform(X_test)

TfidfVectorizer(analyzer='char', max_features=5000, ngram_range=(2, 3),
                token_pattern='\\w{1,}')




Wall time: 14.7 s


In [17]:
print(tfidf_vect_ngram_chars.get_feature_names())

[' !', ' !!', ' "', ' "a', ' "b', ' "c', ' "d', ' "e', ' "f', ' "g', ' "h', ' "i', ' "l', ' "m', ' "n', ' "o', ' "p', ' "r', ' "s', ' "t', ' "u', ' "w', ' $', ' $1', ' $2', ' &', ' & ', " '", ' (', ' (a', ' (b', ' (c', ' (d', ' (e', ' (f', ' (h', ' (i', ' (l', ' (m', ' (n', ' (o', ' (p', ' (r', ' (s', ' (t', ' (w', ' *', ' ,', ' , ', ' -', ' - ', ' --', ' .', ' . ', ' ..', ' 1', ' 1 ', ' 1.', ' 1/', ' 10', ' 11', ' 12', ' 15', ' 18', ' 19', ' 2', ' 2 ', ' 2.', ' 20', ' 25', ' 2n', ' 3', ' 3 ', ' 30', ' 3d', ' 4', ' 4 ', ' 40', ' 45', ' 5', ' 5 ', ' 50', ' 6', ' 6 ', ' 7', ' 7 ', ' 8', ' 8 ', ' 80', ' 9', ' 9 ', ' 90', ' :', ' [', ' a', ' a ', ' ab', ' ac', ' ad', ' af', ' ag', ' ah', ' ai', ' al', ' am', ' an', ' ap', ' ar', ' as', ' at', ' au', ' av', ' aw', ' ay', ' b', ' ba', ' be', ' bi', ' bl', ' bo', ' br', ' bu', ' by', ' c', ' ca', ' cd', ' ce', ' ch', ' ci', ' cl', ' co', ' cr', ' cu', ' cy', ' d', ' da', ' de', ' di', ' do', ' dr', ' du', ' dv', ' dy', ' e', ' e-', ' ea', ' e

### Text / NLP based features

Create some other features.

Char_Count = Number of Characters in Text

Word Count = Number of Words in Text

Word Density = Average Number of Char in Words

Punctuation Count = Number of Punctuation in Text

Title Word Count = Number of Words in Title

Uppercase Word Count = Number of Upperwords in Text

In [33]:
%%time
trainDF['char_count'] = trainDF['text'].apply(len)
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count'] + 1)
trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len(''.join(_ for _ in x if _ in string.punctuation)))
trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([w for w in x.split() if w.istitle()]))
trainDF['uppercase_word_count'] = trainDF['text'].apply(lambda x: len([w for w in x.split() if w.isupper()]))

Wall time: 908 ms


In [34]:
trainDF.sample(10)

Unnamed: 0,label,text,adj_count,adv_count,noun_count,num_count,pron_count,propn_count,verb_count,char-count,word_count,char_count,word_density,punctuation_count,title_word_count,uppercase_word_count
7362,0,451 review by Jessica C.: when i read Farenhei...,0,0,0,0,0,0,0,679,135,679,4.992647,21,7,2
2959,0,"The graphics were good, that's about it.: I am...",0,0,0,0,0,0,0,482,91,482,5.23913,14,11,5
1299,0,Blurred Audio: Audio is technically flawed. Th...,0,0,0,0,0,0,0,341,50,341,6.686275,7,12,1
4489,0,I don't believe Manson had anything to do with...,0,0,0,0,0,0,0,271,52,271,5.113208,10,7,1
2734,1,Threesome with tanks: Supposed being victimize...,0,0,0,0,0,0,0,779,116,779,6.65812,26,11,1
824,0,Far too little crime and fist fights: So I hea...,0,0,0,0,0,0,0,915,173,915,5.258621,36,27,11
3796,0,weak advice: The best thing I found in this bo...,0,0,0,0,0,0,0,458,78,458,5.797468,10,6,3
4783,0,"If you want to know about New Orleans Voodoo, ...",0,0,0,0,0,0,0,442,77,442,5.666667,11,21,0
9852,0,Not Worth My Time: I was hoping to find anothe...,0,0,0,0,0,0,0,874,164,874,5.29697,20,27,11
3451,1,An excellent collection.: This CD is a wonderf...,0,0,0,0,0,0,0,277,48,277,5.653061,8,12,3


In [35]:
## load spaCy
nlp = spacy.load('en_core_web_sm')

Part of Speech in **SpaCy**

    POS   DESCRIPTION               EXAMPLES
    ----- ------------------------- ---------------------------------------------
    ADJ   adjective                 big, old, green, incomprehensible, first
    ADP   adposition                in, to, during
    ADV   adverb                    very, tomorrow, down, where, there
    AUX   auxiliary                 is, has (done), will (do), should (do)
    CONJ  conjunction               and, or, but
    CCONJ coordinating conjunction  and, or, but
    DET   determiner                a, an, the
    INTJ  interjection              psst, ouch, bravo, hello
    NOUN  noun                      girl, cat, tree, air, beauty
    NUM   numeral                   1, 2017, one, seventy-seven, IV, MMXIV
    PART  particle                  's, not,
    PRON  pronoun                   I, you, he, she, myself, themselves, somebody
    PROPN proper noun               Mary, John, London, NATO, HBO
    PUNCT punctuation               ., (, ), ?
    SCONJ subordinating conjunction if, while, that
    SYM   symbol                    $, %, §, ©, +, −, ×, ÷, =, :), 😝
    VERB  verb                      run, runs, running, eat, ate, eating
    X     other                     sfpksdpsxmsa
    SPACE space
    
Find out number of Adjective, Adverb, Noun, Numeric, Pronoun, Proposition, Verb.

    Hint:
    1. Convert text to spacy document
    2. Use pos_
    3. Use Counter 

In [36]:
# Initialise some columns for feature's counts
trainDF['adj_count'] = 0
trainDF['adv_count'] = 0
trainDF['noun_count'] = 0
trainDF['num_count'] = 0
trainDF['pron_count'] = 0
trainDF['propn_count'] = 0
trainDF['verb_count'] = 0

In [37]:
trainDF

Unnamed: 0,label,text,adj_count,adv_count,noun_count,num_count,pron_count,propn_count,verb_count,char-count,word_count,char_count,word_density,punctuation_count,title_word_count,uppercase_word_count
0,1,The best soundtrack ever to anything.: I'm rea...,0,0,0,0,0,0,0,509,97,509,5.193878,14,7,3
1,1,Amazing!: This soundtrack is my favorite music...,0,0,0,0,0,0,0,760,129,760,5.846154,40,24,4
2,1,Excellent Soundtrack: I truly like this soundt...,0,0,0,0,0,0,0,743,118,743,6.243697,33,52,4
3,1,"Remember, Pull Your Jaw Off The Floor After He...",0,0,0,0,0,0,0,481,87,481,5.465909,22,30,0
4,1,an absolute masterpiece: I am quite sure any o...,0,0,0,0,0,0,0,825,142,825,5.769231,35,14,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,A revelation of life in small town America in ...,0,0,0,0,0,0,0,867,152,867,5.666667,25,14,3
9995,1,Great biography of a very interesting journali...,0,0,0,0,0,0,0,861,141,861,6.063380,14,16,0
9996,0,Interesting Subject; Poor Presentation: You'd ...,0,0,0,0,0,0,0,650,108,650,5.963303,17,11,0
9997,0,Don't buy: The box looked used and it is obvio...,0,0,0,0,0,0,0,135,27,135,4.821429,6,2,1


In [38]:
cols = [
    'char_count', 'word_count', 'word_density',
    'punctuation_count', 'title_word_count',
    'uppercase_word_count', 'adj_count',
    'adv_count', 'noun_count', 'num_count',
    'pron_count', 'propn_count', 'verb_count']

trainDF[cols].sample(5)

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,uppercase_word_count,adj_count,adv_count,noun_count,num_count,pron_count,propn_count,verb_count
8742,655,112,5.79646,31,18,3,0,0,0,0,0,0,0
8615,679,108,6.229358,9,6,0,0,0,0,0,0,0,0
9385,286,55,5.107143,17,6,1,0,0,0,0,0,0,0
3687,565,95,5.885417,35,20,6,0,0,0,0,0,0,0
8064,281,50,5.509804,10,4,1,0,0,0,0,0,0,0


### Topic Models as features

In [39]:
%%time
# train a LDA Model
lda_model = LatentDirichletAllocation(n_components = 20, learning_method = 'online', max_iter = 20)

X_topics = lda_model.fit_transform(X_train_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

Wall time: 1min 33s


In [40]:
# view the topic models
n_top_words = 10
topic_summaries = []
print('Group Top Words')
print('-----', '-'*80)
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    top_words = ' '.join(topic_words)
    topic_summaries.append(top_words)
    print('  %3d %s' % (i, top_words))

Group Top Words
----- --------------------------------------------------------------------------------
    0 his romantic social novel tales anti history blues shakespeare figures
    1 descent apparently gifted intense meet gore hey hilarious chocolate creatures
    2 the i and a to it of this is in
    3 christmas kindle edition manson jazz missing charlie brown range fully
    4 favor ashamed crawford pins joan wicca punch gas plans preparation
    5 software answer scanner error engaging scan upgrade incorrect errors simpletech
    6 u 5 1 pros cons bd tango video ads 2
    7 bay compatible generated 2011 topper ton duran 2010 7i surround
    8 his orwell 1984 world government brother george future run musical
    9 product for battery amazon charger text ordered adapter apple price
   10 horse billy puzzles anderson granddaughter hence jay tender vibrant birds
   11 sides finishing revealed gillian behavior sean consistently legal dillon makeup
   12 bed air science fiction fit fo

## Modelling

In [41]:
## helper function

def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    return accuracy_score(predictions, y_test)

In [42]:
# Keep the results in a dataframe
results = pd.DataFrame(columns = ['Count Vectors',
                                  'WordLevel TF-IDF',
                                  'N-Gram Vectors',
                                  'CharLevel Vectors'])

### Naive Bayes Classifier

In [43]:
%%time
# Naive Bayes on Count Vectors
accuracy1 = train_model(MultinomialNB(), X_train_count, y_train, X_test_count)
print('NB, Count Vectors    : %.4f\n' % accuracy1)

NB, Count Vectors    : 0.8475

Wall time: 9.98 ms


In [44]:
%%time
# Naive Bayes on Word Level TF IDF Vectors
accuracy2 = train_model(MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf)
print('NB, WordLevel TF-IDF : %.4f\n' % accuracy2)

NB, WordLevel TF-IDF : 0.8475

Wall time: 7.98 ms


In [45]:
%%time
# Naive Bayes on Ngram Level TF IDF Vectors
accuracy3 = train_model(MultinomialNB(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('NB, N-Gram Vectors   : %.4f\n' % accuracy3)

NB, N-Gram Vectors   : 0.8450

Wall time: 6.98 ms


In [46]:
%%time
# # Naive Bayes on Character Level TF IDF Vectors
accuracy4 = train_model(MultinomialNB(), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('NB, CharLevel Vectors: %.4f\n' % accuracy4)

NB, CharLevel Vectors: 0.8305

Wall time: 36.9 ms


In [47]:
results.loc['Naïve Bayes'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

In [48]:
results

Unnamed: 0,Count Vectors,WordLevel TF-IDF,N-Gram Vectors,CharLevel Vectors
Naïve Bayes,0.8475,0.8475,0.845,0.8305


### Linear Classifier

In [49]:
%%time
# Linear Classifier on Count Vectors
accuracy1 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 350), X_train_count, y_train, X_test_count)
print('LR, Count Vectors    : %.4f\n' % accuracy1)

LR, Count Vectors    : 0.8675

Wall time: 2.47 s


In [50]:
%%time
# Linear Classifier on Word Level TF IDF Vectors
accuracy2 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 100), X_train_tfidf, y_train, X_test_tfidf)
print('LR, WordLevel TF-IDF : %.4f\n' % accuracy2)

LR, WordLevel TF-IDF : 0.8750

Wall time: 317 ms


In [51]:
%%time
# Linear Classifier on Ngram Level TF IDF Vectors
accuracy3 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 100), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('LR, N-Gram Vectors   : %.4f\n' % accuracy3)

LR, N-Gram Vectors   : 0.8430

Wall time: 102 ms


In [52]:
%%time
# Linear Classifier on Character Level TF IDF Vectors
accuracy4 = train_model(LogisticRegression(solver = 'lbfgs', max_iter = 100), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('LR, CharLevel Vectors: %.4f\n' % accuracy4)

LR, CharLevel Vectors: 0.8545

Wall time: 833 ms


In [53]:
results.loc['Logistic Regression'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

In [54]:
results

Unnamed: 0,Count Vectors,WordLevel TF-IDF,N-Gram Vectors,CharLevel Vectors
Naïve Bayes,0.8475,0.8475,0.845,0.8305
Logistic Regression,0.8675,0.875,0.843,0.8545


### Support Vector Machine

In [55]:
%%time
# Support Vector Machine on Count Vectors
accuracy1 = train_model(LinearSVC(), X_train_count, y_train, X_test_count)
print('SVM, Count Vectors    : %.4f\n' % accuracy1)

SVM, Count Vectors    : 0.8470

Wall time: 900 ms


In [56]:
%%time
# Support Vector Machine on Word Level TF IDF Vectors
accuracy2 = train_model(LinearSVC(), X_train_tfidf, y_train, X_test_tfidf)
print('SVM, WordLevel TF-IDF : %.4f\n' % accuracy2)

SVM, WordLevel TF-IDF : 0.8675

Wall time: 90.8 ms


In [57]:
%%time
# Support Vector Machine on Ngram Level TF IDF Vectors
accuracy3 = train_model(LinearSVC(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('SVM, N-Gram Vectors   : %.4f\n' % accuracy3)

SVM, N-Gram Vectors   : 0.8300

Wall time: 71.8 ms


In [58]:
%%time
# Support Vector Machine on Character Level TF IDF Vectors
accuracy4 = train_model(LinearSVC(), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('SVM, CharLevel Vectors: %.4f\n' % accuracy4)

SVM, CharLevel Vectors: 0.8520

Wall time: 542 ms


In [59]:
results.loc['Support Vector Machine'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

In [60]:
results

Unnamed: 0,Count Vectors,WordLevel TF-IDF,N-Gram Vectors,CharLevel Vectors
Naïve Bayes,0.8475,0.8475,0.845,0.8305
Logistic Regression,0.8675,0.875,0.843,0.8545
Support Vector Machine,0.847,0.8675,0.83,0.852


### Bagging Models

In [61]:
%%time
# Bagging (Random Forest) on Count Vectors
accuracy1 = train_model(RandomForestClassifier(n_estimators = 100), X_train_count, y_train, X_test_count)
print('RF, Count Vectors    : %.4f\n' % accuracy1)

RF, Count Vectors    : 0.8375

Wall time: 21 s


In [62]:
%%time
# Bagging (Random Forest) on Word Level TF IDF Vectors
accuracy2 = train_model(RandomForestClassifier(n_estimators = 100), X_train_tfidf, y_train, X_test_tfidf)
print('RF, WordLevel TF-IDF : %.4f\n' % accuracy2)

RF, WordLevel TF-IDF : 0.8395

Wall time: 9.46 s


In [63]:
%%time
# Bagging (Random Forest) on Ngram Level TF IDF Vectors
accuracy3 = train_model(RandomForestClassifier(n_estimators = 100), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('RF, N-Gram Vectors   : %.4f\n' % accuracy3)

RF, N-Gram Vectors   : 0.7905

Wall time: 9 s


In [64]:
%%time
# Bagging (Random Forest) on Character Level TF IDF Vectors
accuracy4 = train_model(RandomForestClassifier(n_estimators = 100), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('RF, CharLevel Vectors: %.4f\n' % accuracy4)

RF, CharLevel Vectors: 0.7965

Wall time: 33.2 s


In [65]:
results.loc['Random Forest'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}
results

Unnamed: 0,Count Vectors,WordLevel TF-IDF,N-Gram Vectors,CharLevel Vectors
Naïve Bayes,0.8475,0.8475,0.845,0.8305
Logistic Regression,0.8675,0.875,0.843,0.8545
Support Vector Machine,0.847,0.8675,0.83,0.852
Random Forest,0.8375,0.8395,0.7905,0.7965


### Boosting Models

In [66]:
%%time
# Gradient Boosting on Count Vectors
accuracy1 = train_model(GradientBoostingClassifier(), X_train_count, y_train, X_test_count)
print('GB, Count Vectors    : %.4f\n' % accuracy1)

GB, Count Vectors    : 0.8140

Wall time: 36.7 s


In [67]:
%%time
# Gradient Boosting on Word Level TF IDF Vectors
accuracy2 = train_model(GradientBoostingClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print('GB, WordLevel TF-IDF : %.4f\n' % accuracy2)

GB, WordLevel TF-IDF : 0.8055

Wall time: 16.5 s


In [68]:
%%time
# Gradient Boosting on Ngram Level TF IDF Vectors
accuracy3 = train_model(GradientBoostingClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print('GB, N-Gram Vectors   : %.4f\n' % accuracy3)

GB, N-Gram Vectors   : 0.7305

Wall time: 9.7 s


In [69]:
%%time
# Gradient Boosting on Character Level TF IDF Vectors
accuracy4 = train_model(GradientBoostingClassifier(), X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars)
print('GB, CharLevel Vectors: %.4f\n' % accuracy4)

GB, CharLevel Vectors: 0.8120

Wall time: 2min 35s


In [70]:
results.loc['Gradient Boosting'] = {
    'Count Vectors': accuracy1,
    'WordLevel TF-IDF': accuracy2,
    'N-Gram Vectors': accuracy3,
    'CharLevel Vectors': accuracy4}

In [71]:
results

Unnamed: 0,Count Vectors,WordLevel TF-IDF,N-Gram Vectors,CharLevel Vectors
Naïve Bayes,0.8475,0.8475,0.845,0.8305
Logistic Regression,0.8675,0.875,0.843,0.8545
Support Vector Machine,0.847,0.8675,0.83,0.852
Random Forest,0.8375,0.8395,0.7905,0.7965
Gradient Boosting,0.814,0.8055,0.7305,0.812




---



---



> > > > > > > > > © 2021 Institute of Data


---



---



