Reference: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [2]:
# cite 
# https://stackoverflow.com/questions/17390326/getting-rid-of-stop-words-and-document-tokenization-using-nltk

import string
import nltk
stopwords = nltk.corpus.stopwords.words('english') + list(string.punctuation)

def tokenize(text: str) -> list:
    tokens = nltk.word_tokenize(text.lower())
    return [t for t in tokens if t not in stopwords]

In [3]:
data_df = pd.read_csv('/data/khodadaa/stack_results/stack_feat/18_ml_in.csv', 
                      usecols=['Query', 'Y', '18', '100', 'TestViewCount', 'ql_t', 'ql_t.1'])
X = data_df['Query']
y = data_df['Y']

In [4]:
%time data_df['tokens'] = data_df.Query.apply(tokenize)

CPU times: user 2min 59s, sys: 994 ms, total: 3min
Wall time: 2min 58s


In [5]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y, test_size=0.33)

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [6]:
count_vec = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
%time count_vec.fit(data_df['Query'])

%time xtrain_count = count_vec.transform(train_X)
%time xtest_count = count_vec.transform(test_X)

CPU times: user 11.6 s, sys: 362 ms, total: 12 s
Wall time: 12 s
CPU times: user 8.29 s, sys: 102 ms, total: 8.39 s
Wall time: 8.39 s
CPU times: user 3.99 s, sys: 30 ms, total: 4.02 s
Wall time: 4.02 s


In [None]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vec.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [7]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
%time tfidf_vect.fit(data_df['Query'])
%time xtrain_tfidf =  tfidf_vect.transform(train_X)
%time xtest_tfidf =  tfidf_vect.transform(test_X)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
%time tfidf_vect_ngram.fit(data_df['Query'])
%time xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_X)
%time xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_X)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
%time tfidf_vect_ngram_chars.fit(data_df['Query'])
%time xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_X) 
%time xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_X) 

CPU times: user 11.8 s, sys: 399 ms, total: 12.2 s
Wall time: 12.2 s
CPU times: user 8 s, sys: 129 ms, total: 8.12 s
Wall time: 8.12 s
CPU times: user 3.86 s, sys: 23 ms, total: 3.89 s
Wall time: 3.89 s
CPU times: user 1min 18s, sys: 2.43 s, total: 1min 21s
Wall time: 1min 15s
CPU times: user 16.3 s, sys: 8.97 ms, total: 16.4 s
Wall time: 16.4 s
CPU times: user 8.02 s, sys: 3.96 ms, total: 8.03 s
Wall time: 8.03 s
CPU times: user 1min 17s, sys: 3.48 s, total: 1min 21s
Wall time: 1min 21s
CPU times: user 57.5 s, sys: 1.64 s, total: 59.1 s
Wall time: 59.1 s
CPU times: user 28.3 s, sys: 729 ms, total: 29 s
Wall time: 29 s


In [8]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_test)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, test_y), metrics.precision_score(predictions, test_y), np.sum(predictions)/predictions.shape[0]

In [9]:
# DB features as a baseline
cols = ['covered_t_18', 'mean_df_t_18', 'min_df_t_18', 'mean_mean_pop_t_18', 'mean_min_pop_t_18', 'min_mean_pop_t_18', 'min_min_pop_t_18', 'ql_t_18', 'qll_t_18', 'covered_t_bi_18', 'mean_df_t_bi_18', 'min_df_t_bi_18', 'mean_mean_pop_t_bi_18', 'mean_min_pop_t_bi_18', 'min_mean_pop_t_bi_18', 'min_min_pop_t_bi_18', 'ql_t_bi_18', 'qll_t_bi_18', 'covered_t_c18', 'mean_df_t_c18', 'min_df_t_c18', 'mean_mean_pop_t_c18', 'mean_min_pop_t_c18', 'min_mean_pop_t_c18', 'min_min_pop_t_c18', 'ql_t_c18', 'qll_t_c18', 'covered_t_bi_c18', 'mean_df_t_bi_c18', 'min_df_t_bi_c18', 'mean_mean_pop_t_bi_c18', 'mean_min_pop_t_bi_c18', 'min_mean_pop_t_bi_c18', 'min_min_pop_t_bi_c18', 'ql_t_bi_c18', 'qll_t_bi_c18', 'ql_t', 'ql_t.1']
db_df = pd.read_csv("/data/khodadaa/stack_results/stack_feat/18_ml_in_ghadakcv.csv", usecols=cols)
xtrain_db, xtest_db = db_df.loc[train_X.index], db_df.loc[test_X.index]
sc = preprocessing.MinMaxScaler().fit(db_df)
xtrain_db = sc.transform(xtrain_db)
xtest_db = sc.transform(xtest_db)

In [14]:
# Linear Classifier on DB extracted features
scores = train_model(linear_model.LogisticRegression(class_weight='balanced'), xtrain_db, train_y, xtest_db)
print ("LR, DB features: ", scores)

# Linear Classifier on Count Vectors
scores = train_model(linear_model.LogisticRegression(class_weight='balanced'), xtrain_count, train_y, xtest_count)
print ("LR, Count Vectors: ", scores)

# Linear Classifier on Word Level TF IDF Vectors
scores = train_model(linear_model.LogisticRegression(class_weight='balanced'), xtrain_tfidf, train_y, xtest_tfidf)
print ("LR, WordLevel TF-IDF: ", scores)

# Linear Classifier on Ngram Level TF IDF Vectors
scores = train_model(linear_model.LogisticRegression(class_weight='balanced'), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print ("LR, N-Gram Vectors: ", scores)

# Linear Classifier on Character Level TF IDF Vectors
scores = train_model(linear_model.LogisticRegression(class_weight='balanced'), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", scores)

LR, DB features:  (0.5902645978312745, 0.6814455779977756, 0.48306862451303567)
LR, Count Vectors:  (0.6163470182798921, 0.5810872042730231, 0.416425265541238)
LR, WordLevel TF-IDF:  (0.6056254786401625, 0.5873347155666012, 0.42967180545844014)
LR, N-Gram Vectors:  (0.6215995738021509, 0.4594735613560532, 0.3620212211012331)
LR, CharLevel Vectors:  (0.6061277039700773, 0.5824053605019979, 0.42717732716234363)


In [19]:
# SVM

%time scores = train_model(svm.SVC(cache_size=20000, max_iter=1000, class_weight='balanced'), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", scores)

%time scores = train_model(svm.SVC(cache_size=20000, max_iter=1000, class_weight='balanced'), xtrain_db, train_y, xtest_db)
print ("SVM, DB features: ", scores)



CPU times: user 1min 7s, sys: 1.87 s, total: 1min 9s
Wall time: 58.8 s
SVM, N-Gram Vectors:  (0.20225807167671117, 1.0, 1.0)




CPU times: user 3min 36s, sys: 4.32 s, total: 3min 40s
Wall time: 3min 24s
SVM, DB features:  (0.20225807167671117, 1.0, 1.0)


In [20]:
ql_y = data_df.loc[test_X.index, ['ql_t', 'ql_t.1']]
ql_y['pred'] = 0
ql_y[ql_y['ql_t'] >= ql_y['ql_t.1']] = 1
metrics.accuracy_score(ql_y['pred'].as_matrix(), test_y)

0.5422757189313977

In [13]:
1- np.sum(test_y)/test_y.shape[0]

0.7979195107603858