In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**Exploratory Data Analysis**

In [None]:
import pandas as pd
import numpy as np

In [None]:
data_train = pd.read_csv('../input/yelp-train/train.csv',header=None)

In [None]:
data_train.columns = ['deceptive','text']

In [None]:
data_train.head(5)

In [None]:
data_test = pd.read_csv('../input/yelptest/test.csv',header=None)

In [None]:
data_test.columns = ['deceptive','text']

In [None]:
data_test.head(5)

In [None]:
data_train.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Bar chart of how many training reviews carry each label.
# The Series is passed by keyword: newer seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=data_train['deceptive'])
plt.xlabel('Deceptive')
# Trailing semicolon suppresses the Text(...) repr under the figure.
plt.title('Number of Deceptive and Non Deceptive reviews (Deceptive=1 & NonDeceptive=2)');

In [None]:
#dataset description
data_train.groupby('deceptive').describe()

In [None]:
#word count
data_train['word_count'] = data_train['text'].apply(lambda x: len(str(x).split(" ")))
data_train[['text','word_count']].head()

In [None]:
#character count including spaces
data_train['char_count'] = data_train['text'].str.len() ## this also includes spaces
data_train[['text','char_count']].head()

In [None]:
#average word length
def avg_word(sentence):
  """Return the mean number of characters per whitespace-separated word.

  Args:
    sentence: Text to measure; it is split with ``str.split()``.

  Returns:
    Total characters over all words divided by the number of words, or
    ``0.0`` when the text contains no words (empty or whitespace-only).
  """
  words = sentence.split()
  if not words:
    # Nothing to average; dividing by len(words) would raise ZeroDivisionError.
    return 0.0
  return sum(len(word) for word in words) / len(words)

# Average word length of every review, followed by a preview of the new column
data_train['avg_word'] = data_train['text'].apply(avg_word)
data_train.loc[:, ['text', 'avg_word']].head()

In [None]:
# Number of English stopwords in each review
from nltk.corpus import stopwords
stop = stopwords.words('english')
# A set gives constant-time membership checks; `stop` itself is kept as a list.
stop_lookup = set(stop)

data_train['stopwords'] = data_train['text'].apply(
    lambda text: sum(1 for word in text.split() if word in stop_lookup)
)
data_train[['text','stopwords']].head()

In [None]:
#no of special characters
data_train['spchar'] = data_train['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
data_train[['text','spchar']].head()

In [None]:
#no of numerics
data_train['numerics'] = data_train['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data_train[['text','numerics']].head()

In [None]:
#no of uppercase characters
data_train['upper'] = data_train['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data_train[['text','upper']].head()

**Preprocessing**

In [None]:
# Lower-case every token; split/join also collapses runs of whitespace to one space
data_train['text'] = data_train['text'].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)
data_train['text'].head()

In [None]:
# Remove punctuation: drop every character that is neither a word character
# nor whitespace. regex=True is explicit because pandas >= 2.0 treats the
# pattern as a literal string by default; the raw string avoids the
# invalid-escape warning for \w and \s.
data_train['text'] = data_train['text'].str.replace(r'[^\w\s]', '', regex=True)
data_train['text'].head()

In [None]:
# Remove English stopwords from every review
from nltk.corpus import stopwords
stop = stopwords.words('english')
# A set gives constant-time membership checks; `stop` itself is kept as a list.
stop_lookup = set(stop)
data_train['text'] = data_train['text'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
data_train['text'].head()

In [None]:
#removing common word
freq = pd.Series(' '.join(data_train['text']).split()).value_counts()[:10]
freq

In [None]:
#removing common word
freq = list(freq.index)
data_train['text'] = data_train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
data_train['text'].head()

In [None]:
#remvoing rare words
freq = pd.Series(' '.join(data_train['text']).split()).value_counts()[-10:]
freq

In [None]:
#removing rare words
freq = list(freq.index)
data_train['text'] = data_train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
data_train['text'].head()

In [None]:
#spelling correction
from textblob import TextBlob
data_train['text'][:5].apply(lambda x: str(TextBlob(x).correct()))

In [None]:
#tokenization
TextBlob(data_train['text'][1]).words

In [None]:
#stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()
data_train['text'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

In [None]:
#lemmetization
from textblob import Word
data_train['text'] = data_train['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data_train['text'].head()

**Advanced Text Processing**

In [None]:
#N-grams
TextBlob(data_train['text'][0]).ngrams(2)

In [None]:
#Term frequency
tf1 = (data_train['text'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

In [None]:
#inverse document frequency
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(data_train.shape[0]/(len(data_train[data_train['text'].str.contains(word)])))

tf1

In [None]:
#term freq - inverse document freq
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

In [None]:
#sparse matrix tf-idf freq
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(data_train['text'])

train_vect

In [None]:
#Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(data_train['text'])
train_bow

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
data_train.head(10)

In [None]:
x.head()

In [None]:
y.head()

In [None]:
x = data_train['text'].astype(str)
y = data_train['deceptive']

In [None]:
# Stratified 75/25 split. random_state is fixed so the same rows land in each
# split on every run, which keeps the results below reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    stratify=y,
                                                    test_size=0.25,
                                                    random_state=42)

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
x_test.head()

In [None]:
y_test.head()

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [None]:
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(x_train)
sequences = tok.texts_to_sequences(x_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [None]:
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(x_test)
sequences_test = tok.texts_to_sequences(x_test)
sequences_matrix_test = sequence.pad_sequences(sequences_test,maxlen=max_len)

In [None]:
tokenizer = Tokenizer(num_words=None,lower=True,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',split=' ',char_level=False)
tokenizer.fit_on_texts(x_train)

In [None]:
tokenizer.fit_on_texts(x_test)

In [None]:
x_train1 = tokenizer.texts_to_sequences(x_train)

In [None]:
x_test1=tokenizer.texts_to_sequences(x_test)

In [None]:
word_index = tokenizer.word_index

In [None]:
vocab_size = len(word_index)
print('Vocab size: {}'.format(vocab_size))
longest = max(len(seq) for seq in x_train)
print("Longest comment size: {}".format(longest))
average = np.mean([len(seq) for seq in x_train])
print("Average comment size: {}".format(average))
stdev = np.std([len(seq) for seq in x_train])
print("Stdev of comment size: {}".format(stdev))
max_len = int(average + stdev * 3)
print('Max comment size: {}'.format(max_len))

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad (or cut) both splits at the end so every row has exactly max_len entries
processed_x_train = pad_sequences(
    x_train1,
    maxlen=max_len,
    padding='post',
    truncating='post',
)
processed_x_test = pad_sequences(
    x_test1,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

In [None]:
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop,Nadam
from keras.callbacks import EarlyStopping

In [None]:
def RNN(input_dim=None, seq_len=None):
    """Build the (uncompiled) Embedding -> LSTM -> Dense binary classifier.

    Args:
        input_dim: Number of rows in the embedding table; must be larger than
            the largest token id fed to the model. Defaults to
            ``vocab_size + 1``, which matches sequences produced by the
            full-vocabulary ``tokenizer``.
        seq_len: Length of each padded input sequence. Defaults to the
            notebook-level ``max_len``.

    Returns:
        A Keras ``Model`` mapping a ``(seq_len,)`` integer sequence to a single
        sigmoid output.
    """
    # The embedding was previously sized with max_words, but the model is
    # trained on sequences from the uncapped tokenizer, whose ids go up to
    # vocab_size; those ids fell outside the embedding table.
    if input_dim is None:
        input_dim = vocab_size + 1
    if seq_len is None:
        seq_len = max_len
    inputs = Input(name='inputs',shape=[seq_len])
    layer = Embedding(input_dim,50,input_length=seq_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [None]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [None]:
model.fit(processed_x_train,y_train,batch_size=128,epochs=10,
          validation_data=(processed_x_test,y_test),callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

In [None]:
import keras.backend
from keras.models import Sequential, load_model
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dense
from keras.layers import Flatten

In [None]:
from keras.layers import CuDNNGRU, Dense, Conv1D, MaxPooling1D
from keras.layers import Dropout, GlobalMaxPooling1D, BatchNormalization, LSTM
from keras.layers import Bidirectional

Embeddings ------- GloVe 100D ------

In [None]:
# Parse the GloVe text file into {word: vector}. Each line is read as the word
# followed by its coefficients, separated by whitespace.
embeddings_index = {}
glove_path = os.path.join('../input/glove-global-vectors-for-word-representation', 'glove.6B.100d.txt')
# `with` closes the file even if parsing fails part-way; the explicit encoding
# avoids decode errors on platforms whose default encoding is not UTF-8.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
embedding_dim = 100
k = 0  # counts vocabulary words that have a GloVe vector
# Row i receives the vector of the word whose tokenizer id is i; rows of words
# missing from the embedding index keep their initial zeros.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    k += 1
    embedding_matrix[i] = embedding_vector

In [None]:
## create model
model_glove = Sequential()
model_glove.add(Embedding(vocab_size + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
#model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model_glove.summary()
model_glove.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
model_glove.fit(processed_x_train,y_train,batch_size=128,epochs=10,
          validation_data=(processed_x_test,y_test),callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

LSTM Embeddings

In [None]:
# Stacked-LSTM + convolution model on top of the embedding_matrix weights
model3 = Sequential([
    # Embedding layer, initialised from embedding_matrix and left trainable
    Embedding(vocab_size + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True),

    # Recurrent layers; both return the full sequence so Conv1D can slide over it
    LSTM(60, return_sequences=True, name='lstm_layer'),
    LSTM(30, return_sequences=True, name='lstm_layer2'),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    MaxPooling1D(3),
    GlobalMaxPooling1D(),
    BatchNormalization(),

    # Fully connected head ending in a single sigmoid unit
    Dense(50, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

# Summarize the model
model3.summary()

**CNN GloVe Model 2**

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') <br>
embedded_sequences = embedding_layer(sequence_input)<br>
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)<br>
l_pool1 = MaxPooling1D(5)(l_cov1)<br>
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)<br>
l_pool2 = MaxPooling1D(5)(l_cov2)<br>
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)<br>
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling<br>
l_flat = Flatten()(l_pool3)<br>
l_dense = Dense(128, activation='relu')(l_flat)<br>
preds = Dense(2, activation='softmax')(l_dense)<br>