In [None]:
#importing the libraries 
import numpy as np
import re
import string
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [None]:
def clean(doc, stop_words=None):
    """
    Normalise a raw text document before it is fed to Doc2Vec.

    Every character that is not an ASCII letter or digit is replaced by a
    space, the text is lower-cased and split on whitespace, and stopwords
    are discarded.

    Parameters
    ----------
    doc : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, NLTK's English stopword
        list (``stopwords.words("english")``) is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if
        nothing survives).
    """
    if stop_words is None:
        # Read the corpus once per document instead of once per token.
        stop_words = stopwords.words("english")
    # Set membership is O(1); the original scanned a list for every word.
    stop_words = frozenset(stop_words)

    # The character class previously carried a stray literal '^', which let
    # carets through even though all punctuation is meant to be removed.
    doc = re.sub(r"[^A-Za-z0-9]", " ", doc)
    return " ".join(word for word in doc.lower().split() if word not in stop_words)

def labels(data):
    """
    Wrap every document of `data` in a `LabeledSentence` for Doc2Vec.

    Each item is split on whitespace to obtain its word list and is tagged
    with a single label of the form 'doc_<n>', where <n> is the item's
    zero-based position in `data`.

    Returns a list with one `LabeledSentence` per input item, in input order.
    """
    # NOTE(review): `LabeledSentence` appears to be a legacy gensim name
    # (newer releases expose `TaggedDocument`) — confirm against the
    # installed gensim version.
    return [
        LabeledSentence(text.split(), ['doc_%d'% position])
        for position, text in enumerate(data)
    ]


In [None]:
def pre_process(path, size=300):
    """
    Load a labelled text CSV, embed each document with Doc2Vec and split
    the embeddings into train and test sets.

    The CSV is expected to contain the columns `text`, `label` and `id`
    (all three are referenced below).

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to `pandas.read_csv`.
    size : int, default 300
        Dimensionality of the Doc2Vec document vectors.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Document vectors and their labels, split 80/20 with
        `random_state=0` and stratified on the label.
    """
    data = pd.read_csv(path)
    # Discard rows without text, then drop the old index and the `id` column.
    data = data.dropna(subset=['text']).reset_index().drop(['index', 'id'], axis=1)

    data['text'] = data['text'].apply(clean)

    X = labels(data['text'])
    y = data['label'].values

    model_t = Doc2Vec(min_count=1, vector_size=size, window=5, sample=1e-4, negative=5,
                      workers=10, epochs=10, seed=0)
    model_t.build_vocab(X)
    # Read the epoch count from `epochs`, the attribute that matches the
    # constructor keyword used above, rather than the legacy `iter` alias.
    model_t.train(X, total_examples=model_t.corpus_count, epochs=model_t.epochs)

    X_train, X_test, y_train, y_test = train_test_split(model_t.docvecs, y, test_size=0.2,
                                                        random_state=0, stratify=y)

    # The original spread this tuple over two lines without parentheses,
    # which does not parse; returning the converted arrays directly avoids
    # the continuation altogether.
    return np.array(X_train), np.array(X_test), np.array(y_train), np.array(y_test)

In [None]:
import matplotlib.pyplot as plt
import keras
from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding, Input, RepeatVector
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import scikitplot.plotters as skplt
import os

# Build the Doc2Vec features from 'train.csv' and unpack the four arrays
# (train/test features and labels) returned by pre_process.
X_train, X_test, y_train, y_test = pre_process('train.csv')

In [None]:
def build_model(input_dim=300):
    '''
    Create and compile the feed-forward binary classifier.

    Architecture: Dense(256, relu) -> Dropout(0.3) -> Dense(256, relu)
    -> Dropout(0.5) -> Dense(80, relu) -> Dense(1, sigmoid), all with the
    'normal' kernel initializer.

    Parameters
    ----------
    input_dim : int, default 300
        Length of each input feature vector; the default matches the
        default `size` of `pre_process`.

    Returns
    -------
    A compiled `Sequential` model using binary cross-entropy loss, the Adam
    optimizer (lr=0.01) and the accuracy metric.
    '''
    network = Sequential()
    network.add(Dense(256, input_dim=input_dim, activation='relu', kernel_initializer='normal'))
    network.add(Dropout(0.3))
    network.add(Dense(256, activation='relu', kernel_initializer='normal'))
    network.add(Dropout(0.5))
    network.add(Dense(80, activation='relu', kernel_initializer='normal'))
    network.add(Dense(1, activation="sigmoid", kernel_initializer='normal'))

    # `metrics` is given as a list: the bare string 'accuracy' is not the
    # documented form and is rejected by some Keras versions.
    network.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
    return network

# Instantiate the network. The factory has its own name so that this
# assignment no longer overwrites it and the cell can be re-run safely.
model = build_model()

In [None]:
# Train the network on the training split: 20 epochs, batches of 64 samples.
# NOTE(review): `estimator` receives whatever `fit` returns — presumably the
# Keras training-history object rather than a fitted estimator; confirm
# before relying on the name.
estimator = model.fit(X_train, y_train, epochs=20, batch_size=64, verbose=0)
print("Model Trained!")

In [None]:
# Score the trained network on both splits. Element [1] of each result is
# read as the accuracy, i.e. the metric requested when the model was compiled.
train = model.evaluate(X_train, y_train)
test = model.evaluate(X_test, y_test)

# Both percentages are rounded to two decimals (the test figure previously
# used three, making the two numbers inconsistent).
print("Train Set Accuracy: {}\nTest Set Accuracy: {} ".format(str(round(train[1]*100, 2)),
                                                              str(round(test[1]*100, 2))))

In [None]:
# Turn the predicted scores into hard class labels: anything below 0.5
# becomes 0, everything else becomes 1. The predictions are flattened first
# so that each test sample yields exactly one label.
y_pred = np.where(model.predict(X_test).ravel() < 0.5, 0, 1).tolist()

# Plot predicted labels against the true test labels.
skplt.plot_confusion_matrix(y_test, y_pred)