### Classification of bing search queries into geographic and non geographic entities. A sample of the data is prelabelled for training classifier

### The notebook contains two parts:
* ML classfiers using Logisitic regression, Naive Bayes, SVM and XGBoost
* Deep learning Classifer using LSTM

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words_geo = stopwords.words('english')

In [3]:
#Filtering geographical stopwords from the NLTK Corpus
for i in ['as','at','by','between','to','from','in', 'off', 'there','where']:
    stop_words_geo.remove(i)

test_df_1 = pd.read_csv('./labelled_500.csv', usecols=['Query','geo'], nrows=499)
test_df_2 = pd.read_csv('./labelled_1000_correct.csv', usecols=['Query','geo'], nrows=1500)
test_df = test_df_1.append(test_df_2[499:1500])

df_nongeo_sampled = test_df[test_df['geo'] == 0].sample(n=319, frac=None, \
                                replace=False, weights=None, random_state=None, axis=None)

test_df_sampled = df_nongeo_sampled.append(test_df[test_df.geo == 1])

In [7]:
#Splitting data into train - validation split)
xtrain, xvalid, ytrain, yvalid = train_test_split(test_df_sampled.Query.values, test_df_sampled.geo.values, 
                                                  stratify=test_df_sampled.geo.values, 
                                                  random_state=42, 
                                                  test_size=0.2)

### Classifying search queries using logistic regression, naivebayes, SVM and Xgboost

#### Logistic regression using TfidfVectorizer

In [8]:
#TfidfVectorizer 
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = stop_words_geo)

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

In [10]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print metrics.accuracy_score(yvalid,np.argmax(predictions, axis=1))

0.765625


In [12]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.83
Average recall score: 0.67


#### Logistic Regression using CountVectorizer

In [13]:
#CountVectorizer
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = stop_words_geo)

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

In [14]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
print metrics.accuracy_score(yvalid,np.argmax(predictions, axis=1))

0.7265625


In [15]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.80
Average recall score: 0.61


#### Simple Naive Bayes using Tfidf

In [16]:
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print metrics.accuracy_score(yvalid,np.argmax(predictions, axis=1))

0.7421875


In [17]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.81
Average recall score: 0.62


#### Naive Bayes using CountVectorizer

In [18]:
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
print metrics.accuracy_score(yvalid,np.argmax(predictions, axis=1))

0.71875


In [19]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.79
Average recall score: 0.73


#### SVM using singular-value decompositionto preprocess the data 

In [20]:
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [21]:
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
print metrics.accuracy_score(yvalid,np.argmax(predictions, axis=1))

0.734375


In [22]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.80
Average recall score: 0.80


####  XGBoost for classification

In [23]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print metrics.accuracy_score(yvalid,np.argmax(predictions, axis=1))

0.7265625


In [24]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.80
Average recall score: 0.75


In [25]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)
print metrics.accuracy_score(yvalid,np.argmax(predictions, axis=1))

0.75


In [26]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.81
Average recall score: 0.75


In [27]:
mll_scorer = metrics.make_scorer(np.mean(yvalid == np.argmax(predictions, axis=1)), \
                                 greater_is_better=False, needs_proba=True)

In [28]:
# load the GloVe vectors in a dictionary:
embeddings_index = {}
f = open('glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [02:52, 12740.97it/s]

Found 2196016 word vectors.





In [29]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words_geo]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [30]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]

100%|██████████| 510/510 [00:00<00:00, 1983.09it/s]
100%|██████████| 128/128 [00:00<00:00, 2876.21it/s]


In [31]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [32]:
# Fitting a xgboost on glove features
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)
np.mean(yvalid == np.argmax(predictions, axis=1))

0.8515625

In [33]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.89
Average recall score: 0.83


In [34]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)
np.mean(yvalid == np.argmax(predictions, axis=1))

0.8515625

In [35]:
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.89
Average recall score: 0.84


In [614]:
our_test = np.array(['cafe near google mountain view'])
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xtest_glove = [sent2vec(x) for x in tqdm(our_test)]

100%|██████████| 800/800 [00:00<00:00, 3372.81it/s]
100%|██████████| 1/1 [00:00<00:00, 758.74it/s]


In [615]:
predictions = clf.predict_proba(xtest_glove)

In [616]:
np.argmax(predictions, axis=1)

array([0])

### Deep Learning using LSTM

In [37]:
# scale the data before any neural net:
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [38]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [39]:
# create a simple 3 layer sequential neural net
model = Sequential()

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='softmax'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(2))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [40]:
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=5, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc))

Train on 510 samples, validate on 128 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13bd23d50>

In [44]:
#Using Keras tokenizer
token = text.Tokenizer(num_words=None)
max_len = 70

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [45]:
# create an embedding matrix for the words we have in the dataset
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|██████████| 1507/1507 [00:00<00:00, 136501.02it/s]


In [47]:
# A simple LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

# Fit the model with early stopping callback
earlystop = EarlyStopping(min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Train on 510 samples, validate on 128 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


<keras.callbacks.History at 0x1240d9210>

### Accuracy is 89 percent

In [48]:
predictions = model.predict(xvalid_pad)
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.91
Average recall score: 0.95


In [49]:
# A simple bidirectional LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Train on 510 samples, validate on 128 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.callbacks.History at 0x13e8fe650>

In [50]:
predictions = model.predict(xvalid_pad)
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.88
Average recall score: 0.94


In [51]:
# GRU with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(2))
model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam')
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='acc', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, callbacks=[earlystop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


<keras.callbacks.History at 0x13d45aa10>

In [52]:
predictions = model.predict(xvalid_pad)
from sklearn.metrics import average_precision_score, recall_score
average_precision = average_precision_score(yvalid, np.argmax(predictions, axis=1))
average_recall_score = recall_score(yvalid, np.argmax(predictions, axis=1))

print('Average precision score: {0:0.2f}'.format(
      average_precision))
print('Average recall score: {0:0.2f}'.format(
      average_recall_score))

Average precision score: 0.90
Average recall score: 0.94


### As it can be seen, LSTM is providing the highest precision and recall combination on the particular dataset. If we focus on non neural net, boosting using xgboost provides the best precision, recall combination.

### Additionally, the model will train better if trained on a larger labelled dataset hence after labelling the data the performance of the classifier will improve.