**Spam Filter for Quora Questions**

In [1]:
%tensorflow_version 2.x

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [2]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,  Conv1D
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.python.keras.layers import CuDNNGRU

In [3]:
train_df = pd.read_csv("/content/train.csv")
print("Train shape : ",train_df.shape)

Train shape :  (1306122, 3)


In [4]:
target_types = train_df.groupby('target').agg('count')
target_types

Unnamed: 0_level_0,qid,question_text
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1225312,1225312
1,80810,80810


In [5]:
target_labels = train_df.target.sort_values().index
target_counts = train_df.target.sort_values()

In [6]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
eng_stopwords = stopwords.words('english')
eng_stopwords.remove('not') #remove not from the words as it is negative
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def data_preprocessing(questions):
    #data cleaning
    questions = re.sub(re.compile('<.*?>'),'',questions)
    questions = re.sub('[^A-Za-z0-9]+',' ',questions)
    
    #Lowercase
    questions = questions.lower()

    #tokenization
    tokens = nltk.word_tokenize(questions)

    #stop words removal
    questions = [word for word in tokens if word not in eng_stopwords] #remove stop wprds

    #lemmatization
    questions = [lemmatizer.lemmatize(word) for word in questions]

    #join words in preprocessed questions
    questions = ' '.join(questions)

    return questions

In [10]:
train_df['preprocessed_question_text']=train_df["question_text"].apply(lambda question_text: data_preprocessing(question_text))
train_df.head()

Unnamed: 0,qid,question_text,target,preprocessed_question_text
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,quebec nationalist see province nation 1960s
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,adopted dog would encourage people adopt not shop
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,velocity affect time velocity affect space geo...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,otto von guericke used magdeburg hemisphere
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,convert montra helicon mountain bike changing ...


In [11]:
## split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=2018)

In [12]:
## fill up the missing values in the question_text with "_na_"
train_df["question_text"] = train_df["question_text"].fillna("_na_").values
val_df["question_text"] = val_df["question_text"].fillna("_na_").values

In [13]:
## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vect= CountVectorizer(dtype=np.float32,strip_accents='unicode',
                      analyzer='word',token_pattern=r'\w{1,}',
                      ngram_range=(1,3), min_df = 3)
X_train = vect.fit_transform(list(train_df['preprocessed_question_text'].values))
X_val = vect.transform(val_df['preprocessed_question_text'].values)

In [16]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.metrics import accuracy_score,f1_score

In [17]:
clf=MultinomialNB()
clf.fit(X_train,train_y)

MultinomialNB()

In [18]:
y_val = clf.predict(X_val)
print("Validation accuracy: ",accuracy_score(val_y,y_val))
print("Validation f1_score: ",f1_score(val_y,y_val))

Validation accuracy:  0.9270640598003762
Validation f1_score:  0.5412606943931684


In [19]:
del X_train,vect,X_val
import gc; gc.collect()
time.sleep(10)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tfidfvec= TfidfVectorizer(dtype=np.float32,strip_accents='unicode',
                      analyzer='word',token_pattern=r'\w{1,}',
                      ngram_range=(1,3), min_df = 3,
                      max_features=None,use_idf=1,smooth_idf=1,sublinear_tf=1,stop_words='english')
X_train_tfidf = tfidfvec.fit_transform(list(train_df['preprocessed_question_text'].values) )
X_val_tfidf = tfidfvec.transform(val_df['preprocessed_question_text'].values)

In [22]:
clf=BernoulliNB()
clf.fit(X_train_tfidf,train_y)

BernoulliNB()

In [23]:
y_val = clf.predict(X_val_tfidf)
print("Validation accuracy: ",accuracy_score(val_y,y_val))
print("Validation f1_score: ",f1_score(val_y,y_val))

Validation accuracy:  0.9379461357656372
Validation f1_score:  0.5113643214565623


In [24]:
del X_train_tfidf,tfidfvec,X_val_tfidf
import gc; gc.collect()
time.sleep(10)

In [25]:
from sklearn.feature_extraction.text import HashingVectorizer

In [26]:
hashvec= HashingVectorizer(dtype=np.float32,strip_accents='unicode',
                      analyzer='word',token_pattern=r'\w{1,}',
                      ngram_range=(1,3),n_features = 2**10)
X_train_hashvec = hashvec.fit_transform(list(train_df['preprocessed_question_text'].values))
X_val_hashvec = hashvec.transform(val_df['preprocessed_question_text'].values)

In [27]:
clf=GaussianNB()
clf.fit(X_train_hashvec.toarray(),train_y)

GaussianNB()

In [28]:
y_val = clf.predict(X_val_hashvec.toarray())
print("Validation accuracy: ",accuracy_score(val_y,y_val))
print("Validation f1_score: ",f1_score(val_y,y_val))

Validation accuracy:  0.7236019058945429
Validation f1_score:  0.22872647253615908


In [29]:
clf=BernoulliNB()
clf.fit(X_train_hashvec,train_y)

BernoulliNB()

In [30]:
y_val = clf.predict(X_val_hashvec)
print("Validation accuracy: ",accuracy_score(val_y,y_val))
print("Validation f1_score: ",f1_score(val_y,y_val))

Validation accuracy:  0.8969163197962418
Validation f1_score:  0.26971614536250227


In [31]:
del X_train_hashvec,hashvec,X_val_hashvec
import gc; gc.collect()
time.sleep(10)

In [32]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_df["question_text"]))
train_X = tokenizer.texts_to_sequences(train_df["question_text"])
val_X = tokenizer.texts_to_sequences(val_df["question_text"])

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)

In [36]:
import tensorflow as tf

In [46]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 100)]             0         
                                                                 
 embedding_11 (Embedding)    (None, 100, 300)          15000000  
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         186880    
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 16)                2064      
                                                                 
 dropout (Dropout)           (None, 16)                0     

In [47]:
## Train the model 
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb6c20bc7f0>

In [48]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.5757985325757158
F1 score at threshold 0.11 is 0.5839391330124359
F1 score at threshold 0.12 is 0.5915996425379804
F1 score at threshold 0.13 is 0.5975710168793742
F1 score at threshold 0.14 is 0.6033352294841473
F1 score at threshold 0.15 is 0.6083885209713024
F1 score at threshold 0.16 is 0.6131122693598527
F1 score at threshold 0.17 is 0.6171556002261448
F1 score at threshold 0.18 is 0.6204918163418172
F1 score at threshold 0.19 is 0.6239513795723083
F1 score at threshold 0.2 is 0.6270892049551026
F1 score at threshold 0.21 is 0.6303925636982323
F1 score at threshold 0.22 is 0.6330881981724247
F1 score at threshold 0.23 is 0.6339850341759422
F1 score at threshold 0.24 is 0.6356139806093226
F1 score at threshold 0.25 is 0.6365603406156208
F1 score at threshold 0.26 is 0.6382166955170099
F1 score at threshold 0.27 is 0.6399699527829447
F1 score at threshold 0.28 is 0.6414316702819957
F1 score at threshold 0.29 is 0.6427239147130893
F1 score at threshold 

In [49]:
del model, inp, x
import gc; gc.collect()
time.sleep(10)

**Using Glove embeddings to rebuild GRU model**

In [50]:
!wget 'https://nlp.stanford.edu/data/glove.840B.300d.zip'

--2023-02-23 10:25:42--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2023-02-23 10:25:43--  https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2023-02-23 10:32:33 (5.06 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]



In [51]:
!unzip glove.840B.300d.zip

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [52]:
!rm glove.840B.300d.zip

In [53]:
EMBEDDING_FILE = 'glove.840B.300d.txt'
def get_coefs(word,*arr): 
  return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

In [54]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

  if (await self.run_code(code, result,  async_=asy)):


In [55]:
del all_embs

In [56]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [58]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 100)]             0         
                                                                 
 embedding_13 (Embedding)    (None, 100, 300)          15000000  
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 128)         186880    
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 16)                2064      
                                                                 
 dropout_1 (Dropout)         (None, 16)                0   

In [59]:
model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))



<keras.callbacks.History at 0x7fb6c7b01100>

In [60]:
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.5865264487431184
F1 score at threshold 0.11 is 0.5954688005400313
F1 score at threshold 0.12 is 0.6031107728101797
F1 score at threshold 0.13 is 0.6096690175880133
F1 score at threshold 0.14 is 0.6160973647752834
F1 score at threshold 0.15 is 0.6214168838252228
F1 score at threshold 0.16 is 0.6265731253269331
F1 score at threshold 0.17 is 0.6308969995941178
F1 score at threshold 0.18 is 0.6352061823018941
F1 score at threshold 0.19 is 0.6388447653429603
F1 score at threshold 0.2 is 0.6424821623027288
F1 score at threshold 0.21 is 0.6451039747301922
F1 score at threshold 0.22 is 0.6477846046256056
F1 score at threshold 0.23 is 0.6501481281982225
F1 score at threshold 0.24 is 0.6521887010645896
F1 score at threshold 0.25 is 0.6538507832247193
F1 score at threshold 0.26 is 0.6546945013720519
F1 score at threshold 0.27 is 0.6564748832800926
F1 score at threshold 0.28 is 0.6578033042615047
F1 score at threshold 0.29 is 0.659257668492072
F1 score at threshold 0

In [61]:
del word_index, embeddings_index, embedding_matrix, model, inp, x
import gc; gc.collect()
time.sleep(10)

Using FastText embeddings trained on Wiki News corpus in place of Glove embeddings and rebuilding the model.

In [62]:
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip'

--2023-02-23 12:11:08--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2023-02-23 12:11:23 (45.2 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]



In [63]:
!unzip wiki-news-300d-1M.vec.zip

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [64]:
!rm wiki-news-300d-1M.vec.zip

In [65]:
EMBEDDING_FILE2 = 'wiki-news-300d-1M.vec'
def get_coefs(word,*arr): 
  return word, np.asarray(arr, dtype='float32')
embeddings_index2 = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE2) if len(o)>100)

In [66]:
all_embs2 = np.stack(embeddings_index2.values())
emb_mean2,emb_std2 = all_embs2.mean(), all_embs2.std()
embed_size2 = all_embs2.shape[1]

  if (await self.run_code(code, result,  async_=asy)):


In [67]:
del all_embs2

In [68]:
word_index2 = tokenizer.word_index
nb_words2 = min(max_features, len(word_index2))
embedding_matrix2 = np.random.normal(emb_mean2, emb_std2, (nb_words2, embed_size2))
for word, i in word_index2.items():
    if i >= max_features: continue
    embedding_vector2 = embeddings_index2.get(word)
    if embedding_vector2 is not None: embedding_matrix2[i] = embedding_vector2

In [69]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size2, weights=[embedding_matrix2])(inp)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [70]:
model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))



<keras.callbacks.History at 0x7fb6c24e1910>

In [71]:
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_fasttext_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.59587444978279
F1 score at threshold 0.11 is 0.603775812951127
F1 score at threshold 0.12 is 0.6108686789375934
F1 score at threshold 0.13 is 0.617299181149475
F1 score at threshold 0.14 is 0.622688388476538
F1 score at threshold 0.15 is 0.6276231057572166
F1 score at threshold 0.16 is 0.6319718355771424
F1 score at threshold 0.17 is 0.6363982580762672
F1 score at threshold 0.18 is 0.6401358485490433
F1 score at threshold 0.19 is 0.6441673783091375
F1 score at threshold 0.2 is 0.646633740577073
F1 score at threshold 0.21 is 0.6486287179127015
F1 score at threshold 0.22 is 0.6502250929731846
F1 score at threshold 0.23 is 0.6514067371987815
F1 score at threshold 0.24 is 0.6526611666788348
F1 score at threshold 0.25 is 0.653877400295421
F1 score at threshold 0.26 is 0.6546045261035177
F1 score at threshold 0.27 is 0.6546622579121398
F1 score at threshold 0.28 is 0.6547352721849368
F1 score at threshold 0.29 is 0.6546065982489708
F1 score at threshold 0.3 is 

In [72]:
del word_index2, embeddings_index2,  embedding_matrix2, model, inp, x
import gc; gc.collect()
time.sleep(10)

**Observations:**



* Overall pretrained embeddings seem to give better results comapred to non-pretrained model.

* The performance of the different pretrained embeddings are almost similar.



In [73]:
pred_val_y = 0.70*pred_glove_val_y + 0.30*pred_fasttext_val_y 
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))))

F1 score at threshold 0.1 is 0.5896088087201904
F1 score at threshold 0.11 is 0.5987329752405788
F1 score at threshold 0.12 is 0.6069245255685933
F1 score at threshold 0.13 is 0.6144754531889431
F1 score at threshold 0.14 is 0.6201228323699421
F1 score at threshold 0.15 is 0.6255799353840971
F1 score at threshold 0.16 is 0.6310821755653854
F1 score at threshold 0.17 is 0.6355270231807976
F1 score at threshold 0.18 is 0.6396636389896332
F1 score at threshold 0.19 is 0.6425342309818844
F1 score at threshold 0.2 is 0.6459393999307993
F1 score at threshold 0.21 is 0.649735696776668
F1 score at threshold 0.22 is 0.652322242864381
F1 score at threshold 0.23 is 0.6545448331254166
F1 score at threshold 0.24 is 0.6553088552915767
F1 score at threshold 0.25 is 0.6572896281800391
F1 score at threshold 0.26 is 0.6585633016501183
F1 score at threshold 0.27 is 0.6601393603716277
F1 score at threshold 0.28 is 0.6621043318105486
F1 score at threshold 0.29 is 0.6626393882430237
F1 score at threshold 0.