### from here: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

pre-trained word embeddings: https://fasttext.cc/docs/en/english-vectors.html

In [18]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import classification_report

import pandas as pd, numpy as np, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [6]:
# Read the raw CSV, then keep only the text column and its target label.
raw_data = pd.read_csv('datasets/synthatic dataset.csv')
data = raw_data.loc[:, ['Response', 'Label']]

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,Response,Label
0,Here is our forecast,0
1,Traveling to have a business meeting takes th...,0
2,test successful. way to go!!!,0
3,"Randy, Can you send me a schedule of the sal...",0
4,Let's shoot for Tuesday at 11:45.,0


In [8]:
# Hold out 30% of the rows for validation. A fixed random_state keeps the split
# (and therefore every metric reported below) reproducible across kernel restarts.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    data['Response'], data['Label'], test_size=0.3, random_state=42
)


### Feature engineering:

2.1 Count Vectors as features

2.2 TF-IDF Vectors as features
- Word level
- N-Gram level
- Character level

2.3 Word Embeddings as features

2.4 Text / NLP based features

2.5 Topic Models as features

### 2.1 Count Vectors as features

In [10]:
# Build the bag-of-words vocabulary; tokens are runs of one or more word characters.
# NOTE(review): the vocabulary is learned from the full corpus (train + validation
# rows), so validation-only tokens are visible to the vectorizer — confirm this is intended.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}').fit(data['Response'])

# Encode each split as a document-term count matrix using the fitted vocabulary.
xtrain_count, xvalid_count = (
    count_vect.transform(split) for split in (train_x, valid_x)
)

### 2.2 TF-IDF Vectors as features
The TF-IDF score represents the relative importance of a term in a document and in the entire corpus. It is composed of two terms: the first is the normalized Term Frequency (TF); the second is the Inverse Document Frequency (IDF), computed as the logarithm of the number of documents in the corpus divided by the number of documents in which the specific term appears.

* TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
* IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

TF-IDF Vectors can be generated at different levels of input tokens (words, characters, n-grams)

- a. Word Level TF-IDF : Matrix representing tf-idf scores of every term in different documents
- b. N-gram Level TF-IDF : N-grams are combinations of N consecutive terms. This matrix represents the tf-idf scores of N-grams
- c. Character Level TF-IDF : Matrix representing tf-idf scores of character level n-grams in the corpus

In [11]:
# Three TF-IDF representations of the same text, each capped at the 5000 most
# frequent features. Every vectorizer is fitted on the full 'Response' column and
# then applied to the train / validation splits.
# NOTE(review): fitting on the full corpus lets validation rows influence the
# vocabulary and IDF weights — confirm this is intended.

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(data['Response'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# ngram level tf-idf (word bigrams and trigrams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(data['Response'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf (character bigrams and trigrams)
# token_pattern is only honoured when analyzer='word'; passing it with
# analyzer='char' has no effect and triggers an "unused parameter" warning,
# so it is omitted here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(data['Response'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)



### 2.3 Word Embeddings as features


In [None]:
# Width of each pre-trained vector and the fixed length every token sequence is
# padded / truncated to.
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 70

# load the pre-trained word-embedding vectors
# Each data row of the .vec file is "<token> <v1> ... <v300>".
embeddings_index = {}
with open('data/wiki-news-300d-1M.vec', encoding='utf-8') as vec_file:
    for line in vec_file:
        values = line.rstrip().split(' ')
        # Skip the leading "<vocab size> <dim>" header and any malformed row;
        # otherwise a short vector would be silently broadcast across a whole
        # embedding-matrix row further down.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer
token = text.Tokenizer()
token.fit_on_texts(data['Response'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=MAX_SEQUENCE_LENGTH)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=MAX_SEQUENCE_LENGTH)

# create token-embedding mapping
# Row i holds the vector of the word whose tokenizer index is i; the extra row
# leaves index 0 unassigned, and words without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Model building

In [19]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit a classifier, print a validation report and return validation accuracy.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train : training features passed to ``classifier.fit``.
    label : training labels passed to ``classifier.fit``.
    feature_vector_valid : validation features passed to ``classifier.predict``.
    is_neural_net : when True, the raw predictions are reduced to class indices
        with ``argmax`` over the last axis before scoring.
    valid_label : true labels for ``feature_vector_valid``. When omitted, the
        notebook-level ``valid_y`` is used, which keeps existing calls working.

    Returns
    -------
    The value returned by ``metrics.accuracy_score`` for the validation set.
    """
    # Fall back to the global split so existing five-argument calls are unchanged.
    if valid_label is None:
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        # NOTE(review): assumes one output column per class; with a single
        # output unit argmax would always be 0 — confirm against the model used.
        predictions = predictions.argmax(axis=-1)
    target_names = ['Clean text', 'Dirty text']

    print(classification_report(valid_label, predictions, target_names=target_names))

    # scikit-learn metric signature is (y_true, y_pred).
    return metrics.accuracy_score(valid_label, predictions)

## Naive Bayes

In [22]:
# Evaluate a fresh Multinomial Naive Bayes model on each feature representation.
# Each entry is (result label, training matrix, validation matrix).
nb_feature_sets = [
    ("NB, Count Vectors: ", xtrain_count, xvalid_count),            # Count Vectors
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),         # Word Level TF IDF Vectors
    ("NB, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),  # Ngram Level TF IDF Vectors
    # Character Level TF IDF Vectors are currently disabled:
    # ("NB, CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
]

for nb_label, nb_train, nb_valid in nb_feature_sets:
    print('Naive Bayes')
    accuracy = train_model(naive_bayes.MultinomialNB(), nb_train, train_y, nb_valid)
    print(nb_label, accuracy*100, '%')

Naive Bayes
              precision    recall  f1-score   support

  Clean text       1.00      1.00      1.00       582
  Dirty text       1.00      1.00      1.00       618

    accuracy                           1.00      1200
   macro avg       1.00      1.00      1.00      1200
weighted avg       1.00      1.00      1.00      1200

NB, Count Vectors:  100.0 %
Naive Bayes
              precision    recall  f1-score   support

  Clean text       0.81      1.00      0.90       582
  Dirty text       1.00      0.78      0.88       618

    accuracy                           0.89      1200
   macro avg       0.91      0.89      0.89      1200
weighted avg       0.91      0.89      0.89      1200

NB, WordLevel TF-IDF:  88.75 %
Naive Bayes
              precision    recall  f1-score   support

  Clean text       0.66      1.00      0.80       582
  Dirty text       0.99      0.52      0.68       618

    accuracy                           0.75      1200
   macro avg       0.83      0.76

## Linear Classifier

In [23]:
# Logistic regression (default settings) scored on each feature representation.
# Maps the printed result label to its (train, validation) feature matrices;
# dict insertion order fixes the evaluation order.
lr_experiments = {
    "LR, Count Vectors: ": (xtrain_count, xvalid_count),
    "LR, WordLevel TF-IDF: ": (xtrain_tfidf, xvalid_tfidf),
    "LR, N-Gram Vectors: ": (xtrain_tfidf_ngram, xvalid_tfidf_ngram),
    # Character Level TF IDF Vectors are currently disabled:
    # "LR, CharLevel Vectors: ": (xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
}

for lr_label, (lr_train, lr_valid) in lr_experiments.items():
    print('Linear Classifier')
    lr_classifier = linear_model.LogisticRegression()
    accuracy = train_model(lr_classifier, lr_train, train_y, lr_valid)
    print(lr_label, accuracy*100, '%')

Linear Classifier
              precision    recall  f1-score   support

  Clean text       1.00      1.00      1.00       582
  Dirty text       1.00      1.00      1.00       618

    accuracy                           1.00      1200
   macro avg       1.00      1.00      1.00      1200
weighted avg       1.00      1.00      1.00      1200

LR, Count Vectors:  100.0 %
Linear Classifier
              precision    recall  f1-score   support

  Clean text       1.00      0.99      1.00       582
  Dirty text       0.99      1.00      1.00       618

    accuracy                           1.00      1200
   macro avg       1.00      1.00      1.00      1200
weighted avg       1.00      1.00      1.00      1200

LR, WordLevel TF-IDF:  99.66666666666667 %
Linear Classifier
              precision    recall  f1-score   support

  Clean text       1.00      0.94      0.97       582
  Dirty text       0.95      1.00      0.97       618

    accuracy                           0.97      1200
   

In [24]:
# Support-vector classifier (default settings) scored on the word n-gram TF-IDF features.
svm_title = 'SVM on Ngram Level TF IDF Vectors'
print(svm_title)
svm_classifier = svm.SVC()
accuracy = train_model(svm_classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy*100, '%')

SVM on Ngram Level TF IDF Vectors
              precision    recall  f1-score   support

  Clean text       0.99      0.99      0.99       582
  Dirty text       0.99      0.99      0.99       618

    accuracy                           0.99      1200
   macro avg       0.99      0.99      0.99      1200
weighted avg       0.99      0.99      0.99      1200

SVM, N-Gram Vectors:  98.91666666666666 %
