# TEXT CLASSIFICATION

In [54]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string

In [56]:
# load the dataset
# dataset https://gist.github.com/kunalj101/ad1d9c58d338e20d09ff26bcc06c4235
data = open('corpus').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels
trainDF['text']

0       Stuning even for the non-gamer: This sound tra...
1       The best soundtrack ever to anything.: I'm rea...
2       Amazing!: This soundtrack is my favorite music...
3       Excellent Soundtrack: I truly like this soundt...
4       Remember, Pull Your Jaw Off The Floor After He...
5       an absolute masterpiece: I am quite sure any o...
6       Buyer beware: This is a self-published book, a...
7       Glorious story: I loved Whisper of the wicked ...
8       A FIVE STAR BOOK: I just finished reading Whis...
9       Whispers of the Wicked Saints: This was a easy...
10      The Worst!: A complete waste of time. Typograp...
11      Great book: This was a great book,I just could...
12      Great Read: I thought this book was brilliant,...
13      Oh please: I guess you have to be a romance no...
14      Awful beyond belief!: I feel I have to write t...
15      Don't try to fool us with fake reviews.: It's ...
16      A romantic zen baseball comedy: When you hear ...
17      Fashio

In [57]:
trainDF['label']

0       __label__2
1       __label__2
2       __label__2
3       __label__2
4       __label__2
5       __label__2
6       __label__1
7       __label__2
8       __label__2
9       __label__2
10      __label__1
11      __label__2
12      __label__2
13      __label__1
14      __label__1
15      __label__1
16      __label__2
17      __label__2
18      __label__2
19      __label__1
20      __label__1
21      __label__2
22      __label__1
23      __label__2
24      __label__2
25      __label__1
26      __label__1
27      __label__1
28      __label__1
29      __label__2
           ...    
9970    __label__1
9971    __label__1
9972    __label__1
9973    __label__2
9974    __label__1
9975    __label__1
9976    __label__2
9977    __label__2
9978    __label__2
9979    __label__2
9980    __label__2
9981    __label__2
9982    __label__1
9983    __label__2
9984    __label__1
9985    __label__2
9986    __label__2
9987    __label__2
9988    __label__2
9989    __label__2
9990    __label__2
9991    __la

In [58]:
# analyse text
trainDF['char_count'] = trainDF['text'].apply(len)
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
trainDF['punctuation_count'] = trainDF['text'].apply(
    lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
trainDF['title_word_count'] = trainDF['text'].apply(
    lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
trainDF['upper_case_word_count'] = trainDF['text'].apply(
    lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [59]:
# create additional features
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger')
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/michal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/michal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [60]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [33]:
train_y[0:5]

array([0, 0, 1, 1, 0])

In [34]:
valid_x[0:5]

6909    21st century reading: It was interesting to re...
292     save your $$$: I can't believe how much this c...
3850    The Sword in The Stone: There are parts in the...
2679    incorrect sewing instructions: There are so ma...
207     If it ain't broke don't fix it: I have been us...
Name: text, dtype: object

In [35]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)
print(xtrain_count)

  (0, 26)	1
  (0, 275)	1
  (0, 710)	3
  (0, 988)	1
  (0, 1743)	1
  (0, 1938)	1
  (0, 2678)	1
  (0, 2781)	1
  (0, 3087)	1
  (0, 4400)	2
  (0, 4582)	1
  (0, 6728)	1
  (0, 8755)	1
  (0, 8762)	1
  (0, 9795)	1
  (0, 10142)	2
  (0, 10394)	1
  (0, 11024)	3
  (0, 11820)	1
  (0, 12163)	1
  (0, 12271)	1
  (0, 12493)	1
  (0, 13847)	1
  (0, 14030)	2
  (0, 14768)	1
  :	:
  (7499, 23239)	1
  (7499, 24262)	1
  (7499, 24866)	1
  (7499, 25386)	1
  (7499, 25425)	1
  (7499, 26077)	1
  (7499, 26677)	1
  (7499, 27587)	1
  (7499, 27696)	1
  (7499, 28082)	6
  (7499, 28224)	1
  (7499, 28249)	1
  (7499, 28315)	1
  (7499, 28493)	2
  (7499, 28584)	1
  (7499, 29391)	1
  (7499, 29417)	1
  (7499, 29818)	1
  (7499, 29898)	1
  (7499, 30316)	1
  (7499, 30541)	1
  (7499, 30918)	1
  (7499, 31113)	1
  (7499, 31483)	1
  (7499, 31506)	1


In [36]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [37]:
print(xtrain_tfidf[:2])

  (0, 4941)	0.2580167796356361
  (0, 4849)	0.09855559618861329
  (0, 4813)	0.13851312956962655
  (0, 4506)	0.11394979183069295
  (0, 4458)	0.07552900494794133
  (0, 4428)	0.16442666014463553
  (0, 4426)	0.05119439601073296
  (0, 4422)	0.09145759631964163
  (0, 4338)	0.12030476428473885
  (0, 4173)	0.12778800405491927
  (0, 4085)	0.08971814786408948
  (0, 3918)	0.18136725492570407
  (0, 3874)	0.2085474918644314
  (0, 3806)	0.14571350441939013
  (0, 3777)	0.060359280470109924
  (0, 3303)	0.12321495138839567
  (0, 3281)	0.1689598782332187
  (0, 3110)	0.12934244294148828
  (0, 3081)	0.06657922623620655
  (0, 3061)	0.16312225005526362
  (0, 3025)	0.05571602519796274
  (0, 2929)	0.12692377167775876
  (0, 2905)	0.08252339558366005
  (0, 2786)	0.13338358666727274
  (0, 2458)	0.07694827445993291
  :	:
  (1, 2684)	0.15133283109451914
  (1, 2390)	0.0627880607252985
  (1, 2275)	0.13294007924025758
  (1, 2267)	0.03768492157317002
  (1, 2235)	0.056159662183382544
  (1, 2223)	0.12209667335983694
  (1

In [39]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger')
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/michal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/michal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [40]:
trainDF

Unnamed: 0,text,label,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,Stuning even for the non-gamer: This sound tra...,__label__2,426,80,5.259259,11,10,3,20,15,6,6,11
1,The best soundtrack ever to anything.: I'm rea...,__label__2,509,97,5.193878,14,7,3,20,23,9,3,10
2,Amazing!: This soundtrack is my favorite music...,__label__2,760,129,5.846154,40,24,4,39,18,13,10,11
3,Excellent Soundtrack: I truly like this soundt...,__label__2,743,118,6.243697,33,52,4,52,12,9,2,7
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2,481,87,5.465909,22,30,0,31,13,7,2,9
5,an absolute masterpiece: I am quite sure any o...,__label__2,825,142,5.769231,35,14,3,30,17,19,17,9
6,"Buyer beware: This is a self-published book, a...",__label__1,738,139,5.271429,33,16,4,35,22,8,15,13
7,Glorious story: I loved Whisper of the wicked ...,__label__2,522,105,4.924528,13,13,6,20,23,8,10,15
8,A FIVE STAR BOOK: I just finished reading Whis...,__label__2,524,103,5.038462,11,15,13,22,20,6,11,14
9,Whispers of the Wicked Saints: This was a easy...,__label__2,301,63,4.703125,8,8,2,8,19,4,4,6


METHOD TO FIT AND CHECK ACCURACY OF SELECTED MODEL

In [41]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

CLASSIFIER NAIVE BAYES

In [42]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.8348
NB, WordLevel TF-IDF:  0.8468
NB, N-Gram Vectors:  0.8372
NB, CharLevel Vectors:  0.8064


CLASSIFIER LINEAR

In [43]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LR, Count Vectors:  0.8668
LR, WordLevel TF-IDF:  0.8716
LR, N-Gram Vectors:  0.8368
LR, CharLevel Vectors:  0.8432


CLASSIFIER SVM

In [44]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.834


CLASSIFIER RANDOM FORREST

In [45]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.8408
RF, WordLevel TF-IDF:  0.8396


CLASSIFIER EXTREME GRADIENT BOOSTING

In [46]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.8048
Xgb, WordLevel TF-IDF:  0.8096
Xgb, CharLevel Vectors:  0.812


In [52]:
#using NLP in app
import pickle

classifier = naive_bayes.MultinomialNB()
classifier.fit(xtrain_count, train_y)

#save vectorizer.vocabulary_
pickle.dump(count_vect.vocabulary_,open("dumped_vocabulary.pkl","wb"))
#save model with open('dumped_model.pkl', 'wb') as picklefile:
pickle.dump(classifier, open("dumped_model.pkl","wb"))

#import vocabulary
cVWithLoadedVocabulary = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("dumped_vocabulary.pkl", "rb")))

with open('dumped_model.pkl', 'rb') as training_model:
    model = pickle.load(open("dumped_model.pkl","rb"))
    
xtrain_count =  cVWithLoadedVocabulary.transform(train_x)
model.predict(xtrain_count)


#tfidf = transformer.fit_transform(cVWithLoadedVocabulary.fit_transform(np.array(data))) p = model.predict(tfidf)

array([0, 0, 1, ..., 1, 1, 0])