In [17]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import classification_report

import pandas as pd, numpy as np, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [18]:
data = pd.read_csv('datasets/synthatic dataset.csv')
data = data[['Response', 'Label']]

data.head()

Unnamed: 0,Response,Label
0,Here is our forecast,0
1,Traveling to have a business meeting takes th...,0
2,test successful. way to go!!!,0
3,"Randy, Can you send me a schedule of the sal...",0
4,Let's shoot for Tuesday at 11:45.,0


In [3]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(data['Response'], data['Label'],test_size=0.3)


In [4]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(data['Response'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(data['Response'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(data['Response'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



## word level tf-idf


In [8]:
# def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):

svcword = svm.SVC()
# fit the training dataset on the classifier
svcword.fit(xtrain_tfidf, train_y)

# predict the labels on validation dataset
predictions = svcword.predict(xvalid_tfidf)

target_names = ['Clean text', 'Dirty text']

print('SVM on Ngram Level TF IDF Vectors')
print(classification_report(valid_y, predictions, target_names=target_names))
print ("SVM, N-Gram Vectors: ", metrics.accuracy_score(predictions, valid_y)*100,'%')

# predict the labels on validation dataset
predictions = svcword.predict(xtrain_tfidf)

target_names = ['Clean text', 'Dirty text']

print('SVM on Ngram Level TF IDF Vectors')
print(classification_report(train_y, predictions, target_names=target_names))
print ("SVM, N-Gram Vectors: ", metrics.accuracy_score(predictions, train_y)*100,'%')

SVM on Ngram Level TF IDF Vectors
              precision    recall  f1-score   support

  Clean text       1.00      1.00      1.00       592
  Dirty text       1.00      1.00      1.00       608

    accuracy                           1.00      1200
   macro avg       1.00      1.00      1.00      1200
weighted avg       1.00      1.00      1.00      1200

SVM, N-Gram Vectors:  100.0 %
SVM on Ngram Level TF IDF Vectors
              precision    recall  f1-score   support

  Clean text       1.00      1.00      1.00      1408
  Dirty text       1.00      1.00      1.00      1392

    accuracy                           1.00      2800
   macro avg       1.00      1.00      1.00      2800
weighted avg       1.00      1.00      1.00      2800

SVM, N-Gram Vectors:  100.0 %


## ngram level tf-idf 

In [6]:
# def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):

svcngram = svm.SVC()
# fit the training dataset on the classifier
svcngram.fit(xtrain_tfidf_ngram, train_y)

# predict the labels on validation dataset
predictions = svcngram.predict(xvalid_tfidf_ngram)

target_names = ['Clean text', 'Dirty text']

print('SVM on Ngram Level TF IDF Vectors')
print(classification_report(valid_y, predictions, target_names=target_names))
print ("SVM, N-Gram Vectors: ", metrics.accuracy_score(predictions, valid_y)*100,'%')

SVM on Ngram Level TF IDF Vectors
              precision    recall  f1-score   support

  Clean text       0.99      0.99      0.99       592
  Dirty text       0.99      0.99      0.99       608

    accuracy                           0.99      1200
   macro avg       0.99      0.99      0.99      1200
weighted avg       0.99      0.99      0.99      1200

SVM, N-Gram Vectors:  99.16666666666667 %


In [7]:
# predict the labels on validation dataset
predictions = svcngram.predict(xtrain_tfidf_ngram)

target_names = ['Clean text', 'Dirty text']

print('SVM on Ngram Level TF IDF Vectors')
print(classification_report(train_y, predictions, target_names=target_names))
print ("SVM, N-Gram Vectors: ", metrics.accuracy_score(predictions, train_y)*100,'%')

SVM on Ngram Level TF IDF Vectors
              precision    recall  f1-score   support

  Clean text       1.00      0.98      0.99      1408
  Dirty text       0.98      1.00      0.99      1392

    accuracy                           0.99      2800
   macro avg       0.99      0.99      0.99      2800
weighted avg       0.99      0.99      0.99      2800

SVM, N-Gram Vectors:  99.10714285714286 %


In [19]:
target_names[svcngram.predict( tfidf_vect_ngram.transform(['I want to study postgraduate in the united kingdom']))[0]]

'Clean text'

In [20]:
target_names[svcngram.predict( tfidf_vect_ngram.transform(['I fuydsjgfhfsd gfdiogjiper']))[0]]

'Dirty text'

## export and load artifacts test

In [21]:
from joblib import dump, load
dump(svcngram, 'svcNgram-v0.1.joblib')
dump(tfidf_vect_ngram, 'tfidf_vec_gram-v0.1.joblib')

['tfidf_vec_gram-v0.1.joblib']

In [2]:
from joblib import dump, load

clf = load('svcNgram-v0.1.joblib')
tifidf_ngram = load('tfidf_vec_gram-v0.1.joblib')

In [4]:
target_names = ['Clean text', 'Dirty text']


In [5]:
target_names[clf.predict( tifidf_ngram.transform(['I fuydsjgfhfsd gfdiogjiper']))[0]]

'Dirty text'

In [8]:
import time
start = time.process_time()
clf = load('svcNgram-v0.1.joblib')
print(time.process_time() - start, 'seconds')


0.00717300000000165 seconds


In [7]:
start_vectorizer_load_time = time.process_time()
tifidf_ngram = load('tfidf_vec_gram-v0.1.joblib')
print('Vectorizer loaded in',time.process_time() - start_vectorizer_load_time)

Vectorizer loaded in 8.748728999999999


In [13]:
if '' not

AttributeError: 'str' object has no attribute 'empty'

In [24]:
mylistdict = list()
for i in data.columns:
    mylistdict.append({'label': i, 'value': i})
    
print(mylistdict)

[{'label': 'Response', 'value': 'Response'}, {'label': 'Label', 'value': 'Label'}]
