In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from matplotlib import pyplot 
import pickle

In [None]:
import tensorflow as tf
from tensorflow.keras import backend
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Input,GRU,LSTM,Embedding, Dropout, Activation 
from keras.activations import relu
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential ,Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.optimizers import Adam 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,log_loss,classification_report,confusion_matrix,multilabel_confusion_matrix 

## Checkpoint 0: load the cleansed data

In [None]:
# Read the pre-cleaned CSV into a DataFrame; later cells use its
# 'annotation' and 'tidy_text' columns.
new_data = pd.read_csv('../input/cleansed_data.csv')

In [None]:
# Show how many rows carry each distinct 'annotation' value.
new_data['annotation'].value_counts()

In [None]:
# Keep only whitespace-separated tokens longer than three characters.
# pd.read_csv parses empty cells as NaN, and calling .split() on a float
# raises AttributeError, so missing texts are normalised to '' first.
new_data['tidy_text'] = (
    new_data['tidy_text']
    .fillna('')
    .astype(str)
    .apply(lambda text: ' '.join(w for w in text.split() if len(w) > 3))
)

In [None]:
# Preview the first rows after the short-token filter.
new_data.head()

In [None]:
# Lower-case every text so later token matching is case-insensitive.
new_data['tidy_text']= new_data['tidy_text'].str.lower()

In [None]:
# Preview the first rows after lower-casing.
new_data.head()

In [None]:
# Any character repeated three or more times in a row, e.g. "ooo" in "sooo".
# Compiled once at cell level instead of on every call.
_LENGTHENING_RE = re.compile(r"(.)\1{2,}")


def reduce_lengthening(text):
    """Lower-case ``text`` and squeeze character runs of 3+ down to 2.

    Parameters
    ----------
    text : object
        The input text. Non-string values are converted with ``str()``
        before any string method is called (the previous version called
        ``.lower()`` first and therefore raised on non-strings).

    Returns
    -------
    str
        The lower-cased text in which every run of three or more identical
        characters is replaced by exactly two of that character.
    """
    return _LENGTHENING_RE.sub(r"\1\1", str(text).lower())

# Squeeze elongated character runs in every text; the function is passed
# directly, no wrapping lambda is needed.
new_data['tidy_text'] = new_data['tidy_text'].map(reduce_lengthening)

## Checkpoint 2: snapshot of the cleaned `new_data`

In [None]:
# Work on a copy so the cleaned `new_data` frame stays untouched by the
# label decoding and stop-word filtering below.
new_dataset = new_data.copy()

## Checkpoint 3: decode labels, filter stop words, tokenize

In [None]:
# Decode the class label: each raw annotation is a string whose second
# character (index 1) is the class digit.
# Vectorised replacement for the former row-by-row `.at` loop: it does not
# rely on the frame having a default 0..n-1 index, and it leaves the column
# with a real integer dtype instead of `object`.
new_dataset['annotation'] = new_dataset['annotation'].str[1].astype(int)

# Preview only the first rows rather than rendering the whole frame.
new_dataset.head()

In [None]:
# Count how often each whitespace-separated token occurs across all texts
# (before stop-word filtering).
new_dataset.tidy_text.str.split(expand=True).stack().value_counts()

In [None]:
# Load NLTK's Russian stop-word list. The corpus reader is imported under an
# alias so the name `stopwords` refers only to the resulting list of words.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = _stopwords_corpus.words('russian')
print(stopwords)

In [None]:
def stop_words(sen, stop_list=None):
    """Remove stop words from a whitespace-tokenised sentence.

    The previous implementation had the membership test inverted: it kept
    only the stop words and threw every other token away.

    Parameters
    ----------
    sen : str
        Sentence to filter; tokens are obtained with ``str.split()``.
    stop_list : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` list
        is used, so existing one-argument calls keep working.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, in original order.
    """
    # A set gives constant-time membership checks for each token.
    blocked = set(stopwords if stop_list is None else stop_list)
    return " ".join(w for w in sen.split() if w not in blocked)
# Run the stop-word filter over every text; the function is passed directly
# instead of being wrapped in a lambda.
new_dataset['tidy_text'] = new_dataset['tidy_text'].map(stop_words)

In [None]:
# Preview the first rows after the stop-word step.
new_dataset.head()

In [None]:
# Re-count token frequencies now that the stop-word step has run, for
# comparison with the earlier count.
new_dataset.tidy_text.str.split(expand=True).stack().value_counts()

In [None]:
# Build the word index from the cleaned texts. `filters` lists the
# punctuation/whitespace characters the tokenizer strips before splitting.
tokenizer = Tokenizer(
    num_words=100000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(new_dataset["tidy_text"])

In [None]:
# Persist the fitted tokenizer so the same word index can be reused later.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Convert every text into its list of integer word indices.
X = tokenizer.texts_to_sequences(new_dataset['tidy_text'])

In [None]:
# One extra slot on top of the learned words: index 0 is the padding value
# and is never assigned to a word.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

In [None]:
# Fixed sequence length fed to the network.
maxlen = 100

# Zero-pad at the end of each sequence up to `maxlen`. pad_sequences already
# returns a NumPy array, so no extra np.asarray conversion is needed.
X = pad_sequences(X, padding='post', maxlen=maxlen)

# Hand Keras a plain numeric array: an object-dtype pandas column of Python
# ints cannot be reliably converted to a tensor during fit().
y = new_dataset['annotation'].astype('int32').values

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: dimensions and container type of the training inputs.
print(X_train.shape)
print(type(X_train))

In [None]:
# Print the padded training sequences together with their labels.
print(X_train,y_train)

In [None]:
# Size of the learned word vectors.
embedding_dim = 64

model = Sequential()

# Index 0 is the padding value; mask_zero=True lets downstream recurrent
# layers skip the padded time steps.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=maxlen, mask_zero=True))

# `return_sequences` is a recurrent-layer argument: the former
# Dense(256, ..., return_sequences=True) raised a TypeError. A GRU (already
# imported) is used so the full sequence is passed on to the next layer.
# NOTE(review): GRU vs. LSTM is a judgement call here - confirm which was intended.
model.add(GRU(256, activation='tanh', kernel_initializer='glorot_uniform',
              bias_initializer='glorot_uniform', return_sequences=True))

# The second recurrent layer returns only its last state, collapsing the time
# axis so the network emits one prediction per sample rather than per token.
model.add(GRU(32, activation='tanh', kernel_initializer='glorot_uniform',
              bias_initializer='glorot_uniform'))

model.add(Dense(32, activation='relu', kernel_initializer='he_uniform',
                bias_initializer='he_uniform'))

# Single sigmoid unit for the 0/1 label.
model.add(Dense(1, activation='sigmoid', kernel_initializer='he_uniform',
                bias_initializer='he_uniform'))

# A single sigmoid output needs binary cross-entropy; categorical
# cross-entropy expects one-hot targets and a multi-unit output.
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()


In [None]:
# First training run; the returned History object (`troll`) feeds the
# loss/accuracy plots further down. The model is saved once training ends.
troll = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=500,
    verbose=2,
)
model.save('troll_new01.h5')


In [None]:
# `reduce_lr` was referenced here but never defined anywhere in the notebook,
# so this cell raised NameError on a fresh kernel. Define the callback
# explicitly: lower the learning rate when validation loss stops improving.
# NOTE(review): factor/patience are assumed values - tune or confirm.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

# Continue training for 20 more epochs with the learning-rate schedule.
model.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size=64,epochs=20,verbose=2,callbacks=[reduce_lr])

In [None]:
# Save the model after the second training run.
model.save('troll02.h5')

In [None]:
# Loss and accuracy on the training split.
model.evaluate(X_train,y_train)

In [None]:
# Loss and accuracy on the held-out validation split.
model.evaluate(X_val,y_val)

In [None]:

# Plot the curves recorded by the first training run (`troll` is its History
# object). The plotting code used to be pasted twice and drew the same figure
# two times; it now runs once, on explicit axes.
fig, (loss_ax, acc_ax) = pyplot.subplots(2, 1)

# plot loss during training
loss_ax.set_title('Loss')
loss_ax.plot(troll.history['loss'], label='train')
loss_ax.plot(troll.history['val_loss'], label='test')
loss_ax.legend()

# plot accuracy during training
acc_ax.set_title('Accuracy')
acc_ax.plot(troll.history['accuracy'], label='train')
acc_ax.plot(troll.history['val_accuracy'], label='test')
acc_ax.legend()

# Keep the lower panel's title from overlapping the upper panel.
fig.tight_layout()
pyplot.show()

In [None]:
# Save only the weights (no architecture or optimizer state) to their own file.
model.save_weights('troll_new02_weights.h5')

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib is now imported as its own package. The old location is
# kept as a fallback for environments that only ship the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): pickling a Keras model through joblib is not a supported
# serialisation route and may fail depending on the Keras version; the
# model.save(...) files written earlier are the reliable artefacts.
joblib.dump(model, 'trollpickle.pkl')

In [None]:
from keras.models import load_model

In [None]:
# Reload the model written after the second training run. The previous path
# ('troll_new02.h5') is never produced by this notebook; the files it saves
# are 'troll_new01.h5' and 'troll02.h5'.
# Note: this rebinds `troll`, which earlier held the training History object.
troll = load_model('troll02.h5')

In [None]:
# Load the weights file this notebook actually writes with save_weights().
# The previous path ('troll02_weights.h5') is never created here.
troll.load_weights('troll_new02_weights.h5')

In [None]:
# Score every padded sequence in X (training and validation rows alike)
# with the reloaded model.
troll.predict(X)