In [None]:
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,precision_score,f1_score,recall_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM,GlobalMaxPooling1D,GRU,SpatialDropout1D
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from nltk.tokenize import word_tokenize
# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Fetch the NLTK resources this script relies on: the WordNet corpus (queried
# through `wordnet.synsets`) and the punkt models (presumably what
# `word_tokenize` needs - confirm against the installed NLTK version).
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet
# Notebook leftover: the returned synsets are discarded when run as a script.
# NOTE(review): may have been kept to force the corpus to load early - confirm
# before removing.
wordnet.synsets("subscribe")

from collections import OrderedDict



# Stop-word corpus read by `stopwords.words("english")` further down.
nltk.download('stopwords')

# DATASET
DATASET_COLUMNS = ["ThreadID", "rcontent", "Class"]
DATASET_ENCODING = "ISO-8859-1"
# Fraction of the rows kept for training; the remainder is the test split.
TRAIN_SIZE = 0.7

# TEXT CLEANING
# Raw string so that `\S` reaches the regex engine instead of being parsed as
# an (invalid) string escape. The pattern itself is unchanged: it matches
# @mentions, http/https links and any run of non-alphanumeric characters.
# NOTE(review): the third alternative (`http?:\S`) consumes a single character
# only and looks like a typo for `\S+` - confirm before changing it.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 100       # dimensionality of the word vectors
W2V_WINDOW = 7       # context window passed to Word2Vec
W2V_EPOCH = 32       # Word2Vec training epochs
W2V_MIN_COUNT = 10   # min_count passed to Word2Vec

# KERAS
SEQUENCE_LENGTH = 100  # every text is padded/truncated to this many tokens
EPOCHS = 5
BATCH_SIZE = 64


# EXPORT

WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

# Load the whole dataset from the Excel workbook in the working directory.
# NOTE(review): later code reads the `rcontent` and `Class` columns, so the
# workbook must provide both.
df=pd.read_excel('NYC2.xlsx')

print("Dataset size:", len(df))

# Notebook-style preview; as a plain script statement the returned frame is
# not displayed.
df.head(5)



# Held as a set because membership is tested once per token of every text;
# a list would make each of those lookups a linear scan.
stop_words = set(stopwords.words("english"))
# Porter stemmer shared by every preprocess() call.
stemmer = PorterStemmer()

def preprocess(text, stem=True):
    """Normalise one text into a space-separated string of kept tokens.

    The input is converted with ``str()``, lower-cased, and every match of
    ``TEXT_CLEANING_RE`` (links, @users, special characters) is replaced by a
    blank. Tokens found in ``stop_words`` are dropped.

    Args:
        text: Value to clean; anything ``str()`` accepts.
        stem: When true, each kept token is passed through ``stemmer.stem``.

    Returns:
        The kept (optionally stemmed) tokens joined by single spaces.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        return " ".join(stemmer.stem(word) for word in kept)
    return " ".join(kept)

# AUGMENTATION ####
def find_synonyms(word):
    """Return the WordNet lemma names of *word* without duplicates.

    Lemma names are gathered from every synset ``wordnet.synsets(word)``
    yields, in the order they are encountered; only the first occurrence of
    each name is kept (the original author notes that the closest synonyms
    come first in that order).

    Args:
        word: The word to look up.

    Returns:
        A list of lemma-name strings; multi-word lemmas still use ``_`` as
        the separator.
    """
    lemma_names = (
        lemma
        for synset in wordnet.synsets(word)
        for lemma in synset.lemma_names()
    )
    # OrderedDict keys give an order-preserving de-duplication.
    return list(OrderedDict.fromkeys(lemma_names))

def create_set_of_new_sentences(sentence, max_syn_per_word = 3):
    """Build variants of *sentence* by swapping single words for synonyms.

    Every token longer than three characters is looked up with
    ``find_synonyms``; for each of its first ``max_syn_per_word`` synonyms a
    copy of the sentence is produced in which that token is replaced.

    The token is replaced only where it stands as a whole word, so replacing
    "rest" does not rewrite "restaurant". Variants that come out identical to
    the input (the synonym is the word itself, or the token does not occur
    verbatim in the sentence) are skipped, since they add no new data.

    Args:
        sentence: The text to augment.
        max_syn_per_word: Upper bound on synonyms tried per token.

    Returns:
        A list of new sentences; empty when nothing could be replaced.
    """
    new_sentences = []
    for word in word_tokenize(sentence):
        if len(word) <= 3:
            continue
        # Lookarounds rather than \b so tokens that start or end with a
        # non-word character are still matched correctly.
        whole_word = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)')
        for synonym in find_synonyms(word)[0:max_syn_per_word]:
            synonym = synonym.replace('_', ' ')  # restore space character
            # A callable replacement keeps backslashes in the synonym literal.
            new_sentence = whole_word.sub(lambda _match: synonym, sentence)
            if new_sentence != sentence:
                new_sentences.append(new_sentence)
    return new_sentences

def data_augment_synonym_replacement(data, column='subject'):
    """Return *data* extended with synonym-replaced copies of its rows.

    For every row, ``create_set_of_new_sentences`` is applied to the text in
    ``column``; each generated sentence yields a copy of the row with that
    column overwritten. Duplicate generated rows are dropped, then the
    generated rows are placed after the original rows.

    Args:
        data: Source DataFrame; it is not modified.
        column: Name of the text column to augment.

    Returns:
        A new DataFrame (original rows first, generated rows after) with a
        fresh 0..n-1 index.
    """
    generated_rows = []
    for index in data.index:
        text_to_augment = data[column][index]
        for generated_sentence in create_set_of_new_sentences(text_to_augment):
            # Explicit copy: writing into a bare .loc slice raises
            # SettingWithCopyWarning and may not stick.
            new_entry = data.loc[[index]].copy()
            new_entry[column] = generated_sentence
            generated_rows.append(new_entry)

    # Concatenate once instead of DataFrame.append per row, which was
    # quadratic and no longer exists in pandas 2.x.
    if generated_rows:
        generated_data = pd.concat(generated_rows).drop_duplicates()
    else:
        generated_data = pd.DataFrame([], columns=data.columns)
    return pd.concat([data, generated_data], ignore_index=True)
# AUGMENTATION #####


# Clean every text in place before splitting.
df.rcontent = df.rcontent.apply(preprocess)

df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=30)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

# Augment the training split only. Augmenting the full `df` would copy the
# test rows (and their synonym variants) into the training data and inflate
# every test metric. The helper already returns the original rows followed by
# the generated ones, so its result is the complete training frame.
# NOTE(review): the texts are already stemmed at this point, so few tokens
# are likely to be found in WordNet - consider augmenting before stemming.
df_train1 = data_augment_synonym_replacement(df_train, column='rcontent')

# Keras takes `validation_split` from the tail of the arrays without
# shuffling; shuffle so the validation slice is not made of generated rows only.
df_train1 = df_train1.sample(frac=1, random_state=30).reset_index(drop=True)

# Word2Vec
# One token list per training text; the texts are whitespace-separated.
documents = list(map(str.split, df_train1.rcontent))

# Hyper-parameters come from the W2V_* constants defined at the top.
w2v_params = dict(size=W2V_SIZE,
                  window=W2V_WINDOW,
                  min_count=W2V_MIN_COUNT,
                  workers=8)
w2v_model = gensim.models.word2vec.Word2Vec(**w2v_params)

# The vocabulary has to be built before training can start.
w2v_model.build_vocab(documents)

words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

w2v_model.train(documents,
                total_examples=len(documents),
                epochs=W2V_EPOCH)
# Fit the tokenizer on the (augmented) training texts. Fitting on `df` leaves
# the synonyms introduced by augmentation out of the vocabulary, so
# texts_to_sequences silently drops them; it also lets test vocabulary leak
# into the model's input space.
text = df_train1['rcontent'].values
# Tokenize Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

# +1 because Keras word indices start at 1; index 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

X_train = pad_sequences(tokenizer.texts_to_sequences(df_train1.rcontent), maxlen=SEQUENCE_LENGTH)
X_test = pad_sequences(tokenizer.texts_to_sequences(df_test.rcontent), maxlen=SEQUENCE_LENGTH)

# One-hot encode both splits against the same label columns, taken from the
# full dataset. Encoding each split on its own yields differently shaped or
# differently ordered columns whenever a class is missing from one split.
class_labels = pd.get_dummies(df.Class).columns
Y_train = pd.get_dummies(df_train1.Class).reindex(columns=class_labels, fill_value=0).values.astype("float32")
Y_test = pd.get_dummies(df_test.Class).reindex(columns=class_labels, fill_value=0).values.astype("float32")



print("x_train", X_train.shape)
print("y_train", Y_train.shape)
print()
print("x_test", X_test.shape)
print("y_test", Y_test.shape)



# Embedding layer
# Row r of the matrix carries the Word2Vec vector of the tokenizer word whose
# index is r; words the Word2Vec model does not know keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
known_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token not in known_vectors:
        continue
    embedding_matrix[row] = known_vectors[token]
print(embedding_matrix.shape)

# Frozen layer: the pretrained vectors are not updated while training.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=W2V_SIZE,
    weights=[embedding_matrix],
    input_length=X_train.shape[1],
    trainable=False,
)

# --- LSTM classifier on top of the frozen Word2Vec embedding ---
model = Sequential()
model.add(embedding_layer)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.25, recurrent_dropout=0.2))
# One softmax unit per one-hot label column rather than a hard-coded 3.
model.add(Dense(Y_train.shape[1], activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

# Accuracy lies in [0, 1], so a min_delta of 1 can never be met and early
# stopping would fire after `patience` epochs regardless of progress; use a
# small fraction instead.
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=2),
             EarlyStopping(monitor='val_accuracy', mode='max', min_delta=1e-3, patience=2)]
# The callbacks only take effect when handed to fit().
history = model.fit(X_train, Y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.2,
                    callbacks=callbacks,
                    verbose=1)



# evaluate() returns [loss, accuracy] for the metrics compiled above.
score = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])

losst, accuracyt = model.evaluate(X_train, Y_train, verbose=False)
# Apply the format spec; passing the value as a second print() argument
# printed the literal "{:.4f}" placeholder.
print("Training Accuracy: {:.4f}".format(accuracyt))

# Collapse the softmax outputs and the one-hot targets to class indices.
y_pred = np.argmax(model.predict(X_test),axis=1)

y_test_arg=np.argmax(Y_test,axis=1)
precision = precision_score(y_test_arg,y_pred,average='macro')

print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_arg,y_pred,average='macro')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_arg,y_pred,average='macro')
print('F1 score: %f' % f1)




# Learning curves: accuracy in the first figure, loss in the second.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

# --- 1D-CNN classifier sharing the frozen embedding layer ---
# Layers are listed in the order they are applied.
model1 = Sequential([
    embedding_layer,
    Dropout(0.25),
    Conv1D(100, 5, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Flatten(),
    Dense(400),
    Dropout(0.25),
    Activation('relu'),
    Dense(3),
    Activation('softmax'),
])

model1.compile(loss='categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])
model1.summary()

# The last 20% of the training arrays is held out for validation.
history1 = model1.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=1,
)

# evaluate() returns [loss, accuracy] for the metrics compiled above.
score1 = model1.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score1[1])
print("LOSS:",score1[0])

# Class indices for the predictions and for the one-hot targets.
y_pred1 = np.argmax(model1.predict(X_test), axis=1)
y_test_arg1 = np.argmax(Y_test, axis=1)

# Macro-averaged scores, i.e. the unweighted mean over the classes.
precision = precision_score(y_test_arg1, y_pred1, average='macro')
print('Precision: %f' % precision)
recall = recall_score(y_test_arg1, y_pred1, average='macro')
print('Recall: %f' % recall)
f1 = f1_score(y_test_arg1, y_pred1, average='macro')
print('F1 score: %f' % f1)

# Per-epoch curves recorded by fit().
acc = history1.history['accuracy']
val_acc = history1.history['val_accuracy']
loss = history1.history['loss']
val_loss = history1.history['val_loss']

epochs = range(len(acc))

# (training curve, validation curve, training label, validation label, title)
curves = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, title) in enumerate(curves):
    if position:
        # Every chart after the first gets its own figure.
        plt.figure()
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()



# --- GRU classifier sharing the frozen embedding layer ---
model2 = Sequential()
model2.add(embedding_layer)
model2.add(GRU(100, dropout=0.25, recurrent_dropout=0.2))
# One softmax unit per one-hot label column rather than a hard-coded 3.
model2.add(Dense(Y_train.shape[1], activation='softmax'))

model2.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

# Accuracy lies in [0, 1], so a min_delta of 1 can never be met and early
# stopping would fire after `patience` epochs regardless of progress; use a
# small fraction instead.
callbacks1 = [ReduceLROnPlateau(monitor='val_loss', patience=3),
              EarlyStopping(monitor='val_accuracy', mode='max', min_delta=1e-3, patience=3)]


model2.summary()

# The callbacks only take effect when handed to fit().
history2 = model2.fit(X_train, Y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.2,
                    callbacks=callbacks1,
                    verbose=1
                    )

# evaluate() returns [loss, accuracy] for the metrics compiled above.
score2 = model2.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score2[1])
print("LOSS:",score2[0])

acc = history2.history['accuracy']
val_acc = history2.history['val_accuracy']
loss = history2.history['loss']
val_loss = history2.history['val_loss']

# Collapse the softmax outputs and the one-hot targets to class indices.
y_pred2 = np.argmax(model2.predict(X_test),axis=1)

y_test_arg2=np.argmax(Y_test,axis=1)
precision = precision_score(y_test_arg2,y_pred2,average='macro')

print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test_arg2,y_pred2,average='macro')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test_arg2,y_pred2,average='macro')
print('F1 score: %f' % f1)

# Learning curves: accuracy in the first figure, loss in the second.
epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()