In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

%matplotlib inline
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

import gensim
from gensim.models.doc2vec import TaggedDocument

import scikitplot.plotters as skplt

import nltk

from xgboost import XGBClassifier

import os

import re
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dropout, GlobalAveragePooling1D, SpatialDropout1D, Reshape
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Input

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition data directory; the trailing slash lets file names be appended directly.
path = '/kaggle/input/nlp-getting-started/'

# Read the training set, the test set and the sample submission in one pass.
train, test, sub = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Patterns are compiled once instead of being re-parsed for every row.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_NON_LETTER_RUN_RE = re.compile(r'[^a-zA-Z]+')


def clean_text(text):
    """Reduce one tweet to lowercase ASCII letters separated by single spaces.

    Args:
        text: the raw tweet string.

    Returns:
        The lowercased text with URLs removed and every run of non-letter
        characters (digits, punctuation, markup, whitespace) replaced by a
        single space.
    """
    # URLs go first, while their punctuation is still there to be matched.
    without_urls = _URL_RE.sub(' ', text.lower())
    # One substitution both strips non-letters and collapses the gaps.
    return _NON_LETTER_RUN_RE.sub(' ', without_urls)


# The previous version also ran a tag-stripping regex and a digit/special-char
# regex *after* all non-letters had been turned into spaces; the first could
# never match and the second only collapsed spaces, so both are folded into
# clean_text. Mapping over the column also avoids looking rows up by a
# positional counter, which fails when the index is not 0..n-1.
clean_txt = train['text'].map(clean_text).tolist()

train['clean'] = clean_txt
train.head()

In [None]:
# Spot-check one cleaned tweet (position 38 of the cleaned list).
clean_txt[38]

In [None]:
# Split every cleaned tweet on single spaces to get one token list per tweet.
corpus = [cleaned.split(" ") for cleaned in train.clean]

# Peek at the first token list.
corpus[:1]

In [None]:
# Hold out 10% of the rows, stratified on the target column, with a fixed seed.
Train, Test = train_test_split(
    train,
    test_size=0.1,
    random_state=42,
    stratify=train['target'].values,
)

import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    """Return the lowercased word tokens of ``text`` that are at least two characters long.

    The text is split into sentences and each sentence into words using the
    NLTK tokenizers; one-character tokens are dropped.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]

def _tag_row(row):
    """Wrap one row's cleaned text as a TaggedDocument tagged with the row's target."""
    return TaggedDocument(words=tokenize_text(row['clean']), tags=[row.target])


# Apply the same row-wise conversion to both splits.
Train_tagged = Train.apply(_tag_row, axis=1)
Test_tagged = Test.apply(_tag_row, axis=1)

In [None]:
# Inspect a single tagged training document (position 13).
Train_tagged.values[13]

In [None]:
# Doc2Vec in distributed-bag-of-words mode (dm=0), 42-dimensional vectors,
# 5 negative samples, keeping every word that occurs at least once.
model = gensim.models.Doc2Vec(dm=0, min_count=1, vector_size=42, negative=5)

# Materialise the documents once so the same list feeds both steps below.
train_docs = list(tqdm(Train_tagged.values))
model.build_vocab(train_docs)

# build_vocab() only registers the vocabulary. Without an explicit train() call
# the weights stay at their initial values and infer_vector() has nothing
# learned to work with.
model.train(train_docs, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
def vec_for_learning(model, tagged_docs):
    """Pair every tagged document's first tag with a vector inferred for its words.

    Args:
        model: object exposing ``infer_vector(words)``.
        tagged_docs: object whose ``.values`` yields documents that have
            ``tags`` and ``words`` attributes.

    Returns:
        ``(targets, regressors)``: two tuples of equal length holding, in
        document order, the first tag and the inferred vector of each
        document. Both tuples are empty when there are no documents (the
        previous ``zip(*...)`` unpacking raised ValueError in that case).
    """
    targets, regressors = [], []
    for doc in tagged_docs.values:
        targets.append(doc.tags[0])
        regressors.append(model.infer_vector(doc.words))
    return tuple(targets), tuple(regressors)

In [None]:
# Infer a vector for every document in each split; labels come back first, vectors second.
y_train, X_train = vec_for_learning(model, Train_tagged)
y_test, X_test = vec_for_learning(model, Test_tagged)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Fit a logistic regression on the inferred document vectors and predict the held-out split.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Report accuracy and weighted F1 on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy %s' % test_accuracy)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing F1 score: {}'.format(test_f1))

In [None]:
# Layer sizes for the embedding + single-LSTM classifier below.
embed_dim = 128
lstm_out = 196

# Checkpoint callback that saves to 'keras_model' only when val_loss improves.
# NOTE(review): this callback is not passed to any fit() call in the cells visible here.
ckpt_callback = ModelCheckpoint(
    'keras_model',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# NOTE(review): `num_words` and `X` are not assigned in any cell visible here.
# Confirm they are defined before this cell, otherwise it raises NameError
# on a fresh kernel.
model = Sequential([
    Embedding(num_words, embed_dim, input_length=X.shape[1]),
    LSTM(lstm_out, recurrent_dropout=0.2, dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['categorical_crossentropy'],
)
print(model.summary())

In [None]:
# The inputs are the dense Doc2Vec vectors, so they are fed to the network as
# floats. The earlier version pushed them through an Embedding layer, which
# interprets its input as integer token indices and therefore threw the
# vector values away.
X_train_arr = np.asarray(X_train, dtype='float32')
y_train_arr = np.asarray(y_train)
X_test_arr = np.asarray(X_test, dtype='float32')
y_test_arr = np.asarray(y_test)

# Take the feature width from the data instead of hard-coding it.
vector_dim = X_train_arr.shape[1]

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device_name = '/gpu:0' if tf.config.list_physical_devices('GPU') else '/cpu:0'

with tf.device(device_name):
    # NOTE: this rebinds `model`; the Doc2Vec model held under that name
    # earlier is no longer reachable after this cell runs.
    model = Sequential()

    ## Input: one document vector, presented to the recurrent layers as a
    ## sequence of length 1 with `vector_dim` features.
    model.add(Input(shape=(vector_dim,)))
    model.add(Reshape((1, vector_dim)))

    ## Hidden layers, first recurrent stage
    model.add(Dropout(0.3))
    model.add(LSTM(256, return_sequences=True))  # keeps the time axis for the next LSTM

    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(256, return_sequences=False)))  # collapses to a 2D tensor

    model.add(Dropout(0.3))
    model.add(Dense(256, activation='relu'))

    # Restore a time axis of length 1 so the second recurrent stage accepts the Dense output.
    model.add(Reshape((1, 256)))

    ## Hidden layers, second recurrent stage
    model.add(Dropout(0.3))
    model.add(LSTM(128, return_sequences=True))

    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(128, return_sequences=False)))

    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))

    model.add(Dropout(0.3))
    ## Output layer: a single sigmoid unit for the binary target
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=1e-5)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Validation uses the held-out split, passed as arrays rather than lists.
    history = model.fit(X_train_arr,
                        y_train_arr,
                        batch_size=4,
                        epochs=14,
                        validation_data=(X_test_arr, y_test_arr),
                        verbose=2)

In [None]:
# Display the training labels as a NumPy array.
np.array(y_train)