In [None]:
import os
import re
import spacy
import nltk
import numpy as np
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from scipy.sparse import csr_matrix
from sklearn import metrics

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Mount Google Drive at /gdrive so the project folder used below is reachable from this Colab session.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
# Work from the project folder so the relative 'dataset_mod/...' paths used later resolve.
os.chdir('/gdrive/My Drive/drivebuddy_text_classification')

In [None]:
import glob

In [None]:
def filter_query(query):
    """Normalise a raw caption into a lowercase, space-separated token string.

    Steps, in order: lowercase; drop ``@mentions``; collapse spelling variants
    of "pav bhaji" into the single token ``pavbhaji``; replace punctuation and
    every other non-alphanumeric character with a space; collapse whitespace;
    remove NLTK English stopwords; rewrite a stand-alone ``nt`` token to ``not``.

    Args:
        query: Raw caption text (``str``).

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    query = query.lower()
    query = re.sub(r'[@][^\s]+', '', query)
    query = re.sub(r'pav.{0,3}bhaji', ' pavbhaji ', query)
    query = re.sub(r'[\!-\/\:-\@]+', ' ', query)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    query = re.sub(r'[^A-Za-z0-9\s]+', ' ', query)
    query = re.sub(r'[\t\n\r\f ]+', ' ', re.sub(r'\.', '. ', query))

    # Build the stopword set once per call; the lookup used to rebuild the
    # whole NLTK list for every single token.
    english_stopwords = set(stopwords.words('english'))
    query = ' '.join(w for w in query.split() if w not in english_stopwords)

    # NOTE(review): apostrophes were already replaced by spaces above, so the
    # "n't" and "'ll" patterns cannot match at this point; only a bare "nt"
    # token is rewritten. Kept unchanged to preserve the existing features.
    filt_q = re.sub(r'\b(n\'t|nt)\b', 'not', query)
    filt_q = re.sub(r'\'ll\b', 'will', filt_q)
    return filt_q

In [None]:
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
import json

In [None]:
# Load the JSON records for the pavbhaji dataset. The encoding is stated explicitly so that
# non-ASCII caption text is decoded the same way regardless of the platform default.
with open('dataset_mod/pavbhaji.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Index captions by image filename (the last path segment of each record's display URL).
# Records without any caption edge are left out.
indexed_data = {}
for record in data:
    if record['edge_media_to_caption']['edges']:
        file_key = record['display_url'].split('/')[-1]
        indexed_data[file_key] = record['edge_media_to_caption']['edges'][0]['node']['text']

In [None]:
# Basenames of the images found in each label folder (0 and 1).
# os.path.basename is used instead of split('/') so the result does not depend on the path separator.
file_names0_set = {os.path.basename(path) for path in glob.glob('dataset_mod/images/0/*.jpg')}
file_names1_set = {os.path.basename(path) for path in glob.glob('dataset_mod/images/1/*.jpg')}

In [None]:
# dataframe with columns (filename, text, label)
# - images whose record has no caption are absent from indexed_data; they are skipped
#   instead of raising KeyError
# - names are sorted so the row order does not depend on set iteration order
# - a name present in folder 1 is labelled 1, everything else 0
data_with_labels = pd.DataFrame(
    [
        {'name': name, 'text': indexed_data[name], 'label': 1 if name in file_names1_set else 0}
        for name in sorted(file_names0_set | file_names1_set)
        if name in indexed_data
    ],
    columns=['name', 'text', 'label'],
)

In [None]:
# Clean every caption with filter_query, then build a frame that carries the cleaned text
# alongside the original name and label columns.
processed_text = data_with_labels['text'].map(filter_query)
df = data_with_labels[['name', 'text', 'label']].assign(text=processed_text)

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the accuracies quoted in later
# cells can be reproduced, and the split is stratified so both parts keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

In [None]:
# creating BOW model with ngrams lengths ranging from 1 to 5 tokens
# token_pattern=None: the regex is not used when a custom tokenizer is supplied; passing None
# makes that explicit and avoids scikit-learn's unused-parameter warning.
bow_vector = CountVectorizer(tokenizer=word_tokenize, token_pattern=None, ngram_range=(1,5))

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings.
classifier = LogisticRegression()

In [None]:
# Chain the n-gram count vectorizer and the classifier so raw text can be passed to fit/predict.
pipe = Pipeline([('vectorizer', bow_vector), ('classifier', classifier)])

In [None]:
# Fit the vectorizer vocabulary and the classifier on the training split only.
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 5), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function word_tokenize at 0x7f5cbb5d47a0>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
           

In [None]:
predicted = pipe.predict(X_test)

# Model Accuracy of 72% achieved
# Report accuracy, precision and recall on the held-out split, one line per metric.
for label, score_fn in (
    ("Logistic Regression Accuracy:", metrics.accuracy_score),
    ("Logistic Regression Precision:", metrics.precision_score),
    ("Logistic Regression Recall:", metrics.recall_score),
):
    print(label, score_fn(y_test, predicted))

Logistic Regression Accuracy: 0.7252747252747253
Logistic Regression Precision: 0.6326530612244898
Logistic Regression Recall: 0.8157894736842105


In [None]:
# using multi layer perceptron classifier model
# random_state is fixed so the weight initialisation, and hence the reported score, is reproducible.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(16, 8), random_state=42)

In [None]:
# Same vectorizer as before, with the MLP as the final step.
# NOTE(review): bow_vector is passed as the same object that `pipe` holds, so fitting this
# pipeline presumably refits that shared vectorizer as well; confirm this is intended.
pipe1 = Pipeline([('vectorizer', bow_vector), ('classifier', clf)])

In [None]:
# Train the MLP pipeline on the training split.
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 5), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function word...
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(16, 8),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
    

In [None]:
# Predict on the held-out split and report plain accuracy.
predicted = pipe1.predict(X_test)

# Model Accuracy of 69% achieved
print("MLP Accuracy:",metrics.accuracy_score(y_test, predicted))

MLP Accuracy: 0.6923076923076923


In [None]:
# Wider network (two hidden layers of 64 units) with slightly stronger L2 regularisation.
# random_state is fixed so the reported score is reproducible.
clf = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(64, 64), random_state=42)
# Alternative configuration kept for reference:
# clf = MLPClassifier(solver='adam', 
#                       hidden_layer_sizes=(64, 64), 
#                       learning_rate='adaptive', 
#                       max_iter=1000, 
#                       early_stopping=True, 
#                       n_iter_no_change=20)

In [None]:
# Rebuild the pipeline around the current `clf`; this rebinds the name pipe1.
pipe1 = Pipeline([('vectorizer', bow_vector), ('classifier', clf)])

In [None]:
# Train the rebuilt pipeline on the training split.
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 5), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function word...
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(64, 64),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
   

In [None]:
predicted = pipe1.predict(X_test)

# Model Accuracy of 72% achieved
# Printed as "MLP Accuracy" like the other MLP runs; the model is a classifier, not a regression.
print("MLP Accuracy:",metrics.accuracy_score(y_test, predicted))

MLP Regression Accuracy: 0.7252747252747253


In [None]:
# Larger network (128 and 64 hidden units); random_state is fixed so the score is reproducible.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 64), random_state=42)

In [None]:
# Rebuild the pipeline around the current `clf`; this rebinds the name pipe1.
pipe1 = Pipeline([('vectorizer', bow_vector), ('classifier', clf)])

In [None]:
# Train the rebuilt pipeline on the training split.
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 5), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function word...
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(128, 64),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
  

In [None]:
# Predict on the held-out split and report plain accuracy.
predicted = pipe1.predict(X_test)

# Model Accuracy of 71% achieved
print("MLP Accuracy:",metrics.accuracy_score(y_test, predicted))

MLP Accuracy: 0.7142857142857143


In [None]:
# BOW vectorizer with ngrams range of 1 to 3
# token_pattern=None: the regex is not used when a custom tokenizer is supplied; passing None
# makes that explicit and avoids scikit-learn's unused-parameter warning.
bow_vector = CountVectorizer(tokenizer=word_tokenize, token_pattern=None, ngram_range=(1,3))

In [None]:
# Pair the 1-3 gram vectorizer with whatever `clf` currently holds
# (the most recently defined MLP, assuming the cells are run in order).
pipe1 = Pipeline([('vectorizer', bow_vector), ('classifier', clf)])

In [None]:
# Train the 1-3 gram pipeline on the training split.
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function word...
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(128, 64),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
  

In [None]:
# Predict on the held-out split and report plain accuracy.
predicted = pipe1.predict(X_test)

# Model Accuracy of 73% achieved
print("MLP Accuracy:", metrics.accuracy_score(y_test, predicted))

MLP Accuracy: 0.7362637362637363


In [None]:
# trying out a simpler model
# random_state is fixed because the tree breaks ties between equally good splits at random,
# which would otherwise make the reported score vary from run to run.
clf_tree = DecisionTreeClassifier(random_state=42)

pipe_tree = Pipeline([('vectorizer', bow_vector), ('classifier', clf_tree)])

pipe_tree.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function word...0>,
                                 vocabulary=None)),
                ('classifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                      

In [None]:
# Predict on the held-out split with the decision-tree pipeline and report plain accuracy.
predicted = pipe_tree.predict(X_test)

# Model Accuracy of 65% achieved
print("Decision Tree Accuracy:",metrics.accuracy_score(y_test, predicted))

Decision Tree Accuracy: 0.6593406593406593


In [None]:
# BOW vectorizer with ngrams ranging from 1 to 5
# token_pattern=None: the regex is not used when a custom tokenizer is supplied; passing None
# makes that explicit and avoids scikit-learn's unused-parameter warning.
bow_vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None, ngram_range=(1,5))

In [None]:
# Learn the n-gram vocabulary that the positional features are indexed by.
# NOTE(review): this is fitted on the full corpus (train and test rows alike), so test-set
# n-grams are known at fit time. Fitting on X_train alone would avoid that leakage, but first
# confirm that create_ngram_with_position tolerates n-grams missing from the vocabulary.
bow_vectorizer.fit(df['text'])



CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 5), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function word_tokenize at 0x7f5cbb5d47a0>,
                vocabulary=None)

In [None]:
from scipy.sparse import csr_matrix

def create_ngram_with_position(text, n):
    """Map each known n-gram of ``text`` to the offset of its first occurrence.

    The text is tokenised with ``word_tokenize`` and every contiguous n-gram of
    length 1..``n`` is looked up in the fitted module-level ``bow_vectorizer``.

    Args:
        text: Pre-processed text to tokenise.
        n: Maximum n-gram length, in tokens.

    Returns:
        dict mapping vocabulary index -> zero-based token offset at which that
        n-gram first starts. N-grams that are not in the vocabulary are skipped
        (the same way ``CountVectorizer.transform`` ignores unseen terms)
        rather than raising ``KeyError``.
    """
    tokens = word_tokenize(text)
    vocabulary = bow_vectorizer.vocabulary_  # loop-invariant, looked up once
    position_dict = {}  # vocab_ind: position
    for size in range(1, n + 1):
        # range() is empty when the text has fewer than `size` tokens
        for start in range(len(tokens) - size + 1):
            vocab_ind = vocabulary.get(' '.join(tokens[start:start + size]))
            if vocab_ind is not None:
                # setdefault keeps the earliest offset when an n-gram repeats
                position_dict.setdefault(vocab_ind, start)
    return position_dict

# function to vectorize text with the position of the element in the vocabulary index
# this will help to capture relative positional information to try and classify the data points
def transform_position_vector(text_series, ngram_size, position_offset=0):
    """Build a sparse (documents x vocabulary) matrix of first-occurrence positions.

    Each row corresponds to one text; each column to one entry of the fitted
    module-level ``bow_vectorizer`` vocabulary. A cell holds the token offset at
    which that n-gram first appears in the text, plus ``position_offset``.

    Args:
        text_series: Sized iterable of pre-processed texts.
        ngram_size: Maximum n-gram length passed to ``create_ngram_with_position``.
        position_offset: Constant added to every stored position. With the
            default of 0 an n-gram starting at the first token is stored as an
            explicit 0, which cannot be told apart from an absent n-gram once
            the matrix is densified; pass 1 to keep the two cases distinct.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(len(text_series), vocabulary size)``.
    """
    rows, cols, values = [], [], []
    for row_idx, text in enumerate(text_series):
        for vocab_ind, pos in create_ngram_with_position(text, ngram_size).items():
            rows.append(row_idx)
            cols.append(vocab_ind)
            values.append(pos + position_offset)
    shape = (len(text_series), len(bow_vectorizer.vocabulary_))
    return csr_matrix((values, (rows, cols)), shape=shape)

In [None]:
# Sanity check: show the stored (row, vocabulary index) -> position entries for the first text,
# using n-grams of up to 5 tokens.
print (transform_position_vector(df['text'][0:3], 5)[0])

  (0, 3598)	4
  (0, 3599)	10
  (0, 3600)	10
  (0, 3601)	10
  (0, 3602)	10
  (0, 3603)	4
  (0, 3604)	4
  (0, 3605)	4
  (0, 3606)	4
  (0, 7568)	8
  (0, 7569)	8
  (0, 7570)	8
  (0, 7571)	8
  (0, 7572)	8
  (0, 7623)	21
  (0, 7636)	21
  (0, 7637)	21
  (0, 7638)	21
  (0, 7639)	21
  (0, 8154)	11
  (0, 8155)	11
  (0, 8156)	11
  (0, 8157)	11
  (0, 8158)	11
  (0, 8298)	15
  :	:
  (0, 31443)	26
  (0, 31565)	9
  (0, 31566)	9
  (0, 31567)	9
  (0, 31568)	9
  (0, 31569)	9
  (0, 35111)	1
  (0, 35112)	1
  (0, 35113)	1
  (0, 35114)	1
  (0, 35115)	1
  (0, 36592)	2
  (0, 37581)	6
  (0, 37582)	6
  (0, 37583)	6
  (0, 37584)	6
  (0, 37827)	2
  (0, 37831)	2
  (0, 37832)	2
  (0, 37833)	2
  (0, 47967)	0
  (0, 48005)	0
  (0, 48006)	0
  (0, 48007)	0
  (0, 48008)	0


In [None]:
# Positional feature matrix for the training texts (n-grams of up to 5 tokens).
X_train_mat = transform_position_vector(X_train, 5)

In [None]:
# MLP for the positional features. learning_rate, early_stopping and n_iter_no_change only
# apply to the 'sgd'/'adam' solvers and are ignored by 'lbfgs', so only the options that take
# effect are passed. random_state is fixed so the reported score is reproducible.
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(64, 64), max_iter=1000, random_state=42)

In [None]:
# Train on the positional features, converted from sparse to a dense array first.
clf.fit(X_train_mat.toarray(), y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(64, 64), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=1000,
              momentum=0.9, n_iter_no_change=20, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [None]:
# Positional feature matrix for the held-out texts, built with the same n-gram limit as training.
X_test_mat = transform_position_vector(X_test, 5)

In [None]:
# Predict on the densified held-out features and report plain accuracy.
predicted = clf.predict(X_test_mat.toarray())

# Model Accuracy of 69% achieved
print("Accuracy:",metrics.accuracy_score(y_test, predicted))

Accuracy: 0.6923076923076923


In [None]:
# Decision tree on the positional features. random_state is fixed because ties between equally
# good splits are broken at random, which would otherwise make the score vary between runs.
clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(X_train_mat.toarray(), y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Predict on the densified held-out features with the tree and report plain accuracy.
predicted = clf_tree.predict(X_test_mat.toarray())

# Model Accuracy of 68% achieved
print("Accuracy:",metrics.accuracy_score(y_test, predicted))

Accuracy: 0.6813186813186813


In [None]:
import tensorflow as tf
# import tensorflow_hub as hub
import numpy as np
import os
import pandas as pd
import re
import json

from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda, Layer, concatenate, Concatenate, Reshape, Conv2D, Conv1D, Masking
from tensorflow.keras.models import Model, load_model, model_from_json
import numpy as np
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda, Bidirectional
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub


In [None]:
# implementing a dense layer for positional features using keras

# One input feature per vocabulary entry, followed by two ReLU layers and a sigmoid output
# unit for the binary label.
input_vect = Input(shape=(len(bow_vectorizer.vocabulary_),), dtype=tf.int64)
dense1 = Dense(128, activation='relu')(input_vect)
dense2 = Dense(64, activation='relu')(dense1)
out = Dense(1, activation='sigmoid')(dense2)
model = Model(inputs=[input_vect], outputs=out)

LEARNING_RATE = 0.001

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=LEARNING_RATE)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:

class LearningRateTracker(Callback):
    """Keras callback that prints the optimizer's current learning rate after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the learning rate held by ``self.model.optimizer``; ``epoch`` and ``logs`` are unused."""
        optimizer = self.model.optimizer
        # Prefer the canonical `learning_rate` attribute and fall back to the legacy `lr`
        # alias only when it is missing.
        current_lr = getattr(optimizer, 'learning_rate', None)
        if current_lr is None:
            current_lr = optimizer.lr
        print(" - lr: {}".format(K.eval(current_lr)))

# Number of epochs without val_loss improvement before the learning rate is reduced.
LR_PATIENCE = 10
# Multiply the learning rate by 0.2 after LR_PATIENCE stagnant epochs, never going below 1e-8.
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=LR_PATIENCE, min_lr=1e-8, verbose=1, mode="min")
# Stop training once val_loss has not improved for 50 epochs.
es_callback = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
# Prints the current learning rate at the end of each epoch.
lr_tracker = LearningRateTracker()

In [None]:
# Train the Keras model on the densified positional features.
# NOTE(review): the held-out test split is also passed as validation_data, and both
# EarlyStopping and ReduceLROnPlateau monitor val_loss, so the accuracy quoted below is
# tuned on the test set; a separate validation split would give an unbiased figure.
history = model.fit(X_train_mat.toarray(), 
          y_train,
          validation_data=(X_test_mat.toarray(), y_test),
          epochs=1000,
          batch_size=256,
          callbacks=[es_callback, lr_tracker, reduce_lr])

# achieved accuracy of 74.7%

Epoch 1/1000
 - lr: 0.0010000000474974513
Epoch 2/1000
 - lr: 0.0010000000474974513
Epoch 3/1000
 - lr: 0.0010000000474974513
Epoch 4/1000
 - lr: 0.0010000000474974513
Epoch 5/1000
 - lr: 0.0010000000474974513
Epoch 6/1000
 - lr: 0.0010000000474974513
Epoch 7/1000
 - lr: 0.0010000000474974513
Epoch 8/1000
 - lr: 0.0010000000474974513
Epoch 9/1000
 - lr: 0.0010000000474974513
Epoch 10/1000
 - lr: 0.0010000000474974513
Epoch 11/1000
 - lr: 0.0010000000474974513

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 12/1000
 - lr: 0.00020000000949949026
Epoch 13/1000
 - lr: 0.00020000000949949026
Epoch 14/1000
 - lr: 0.00020000000949949026
Epoch 15/1000
 - lr: 0.00020000000949949026
Epoch 16/1000
 - lr: 0.00020000000949949026
Epoch 17/1000
 - lr: 0.00020000000949949026
Epoch 18/1000
 - lr: 0.00020000000949949026
Epoch 19/1000
 - lr: 0.00020000000949949026
Epoch 20/1000
 - lr: 0.00020000000949949026
Epoch 21/1000
 - lr: 0.00020000000949949026

Epoch 00021: 