In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
df = pd.read_csv('corpus_lemm.csv')
df.head()

Unnamed: 0,text,fraudulent
0,marketing intern u ny new york food created gr...,0.0
1,customer service cloud video production nz auc...,0.0
2,commissioning machinery assistant cma u ia wev...,0.0
3,account executive washington dc u dc washingto...,0.0
4,bill review manager u fl fort worth spotsource...,0.0


In [3]:
df.duplicated().sum()

419

In [4]:
df.isnull().any()

text          False
fraudulent     True
dtype: bool

In [5]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df.head()
df.reset_index(inplace=True, drop=True)

In [6]:
import gensim

# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit=500000)



In [7]:
max_len = 0
idx = 0
for i,text in enumerate(df['text']):
    if(max_len<len(text)):
        max_len = len(text)
        idx = i

In [8]:
idx

4578

In [9]:
max_len_sent = len(df['text'][idx].split())

In [10]:
max_len_sent

1428

In [11]:
vocab = model.index_to_key
key_to_idx = model.key_to_index
vocab_size = len(vocab) + 1
print("Vocabulary size is: ", vocab_size)

Vocabulary size is:  500001


In [12]:
import numpy as np
import tensorflow
np.random.seed(0)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform
np.random.seed(1)

In [13]:
def sentences_to_indices(X, model, max_len):

    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = X[i].split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if w in key_to_idx:
                X_indices[i, j] = key_to_idx[w]
                # Increment j to j + 1
                j += 1
            else:
                j+=1
            
    ### END CODE HERE ###
    
    return X_indices

In [14]:
def pretrained_embedding_layer(model, key_to_idx):
    vocab = model.index_to_key
    vocab_len = len(vocab) + 1  # adding 1 to fit Keras embedding (requirement)
                
    emb_dim = model["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)

    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in key_to_idx.items():
        emb_matrix[index, :] = model[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [15]:
def LSTM_model(input_shape, model, key_to_idx):

    
    ### START CODE HERE ###
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(input_shape, dtype='int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(model, key_to_idx)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(0.1)(X)
    # Propagate X trough another LSTM layer with 64-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128, return_sequences=False)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.1)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
    X = Dense(1)(X)
    # Add a softmax activation
    X = Activation('tanh')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=sentence_indices, outputs=X)
    
    ### END CODE HERE ###
    
    return model

In [16]:
Model = LSTM_model((max_len_sent,), model, key_to_idx)
Model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1428)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1428, 300)         150000300 
_________________________________________________________________
lstm (LSTM)                  (None, 1428, 128)         219648    
_________________________________________________________________
dropout (Dropout)            (None, 1428, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129   

In [17]:
from tensorflow.keras.optimizers import Adam

opt = Adam(clipvalue=1, learning_rate=0.01)

Model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [18]:
#from sklearn.utils import shuffle
#df = shuffle(df, random_state=51)
#df.reset_index(drop=True, inplace=True)

In [19]:
#part = int(df.shape[0]*0.75)

#train_df = df.loc[0:part]
#train_df.reset_index(drop=True, inplace=True)
#train_df.to_csv('corpus_lemm_train.csv', index=False)
#ytrain = df.loc[0:part, 'fraudulent']
#test_df = df.loc[part:]
#test_df.reset_index(drop=True, inplace=True)
#test_df.to_csv('corpus_lemm_test.csv', index=False)
#ytest = df.loc[part: , 'fraudulent']

In [20]:
train_df = pd.read_csv('corpus_lemm_train.csv')
xtrain = train_df['text']
ytrain = train_df['fraudulent']

test_df = pd.read_csv('corpus_lemm_test.csv')
xtest = test_df['text']
ytest = test_df['fraudulent']

In [21]:
X_train_indices = sentences_to_indices(xtrain, key_to_idx, max_len_sent)
Y_train_oh = ytrain

In [22]:
Model.fit(X_train_indices, Y_train_oh, epochs = 2, batch_size = 128, shuffle=True)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x2b1bd64db50>

In [23]:
Model.save('./MyModel_lem_long',save_format='tf')



INFO:tensorflow:Assets written to: ./MyModel_lem_long\assets


INFO:tensorflow:Assets written to: ./MyModel_lem_long\assets


In [14]:
import tensorflow as tf

loaded_model = tf.keras.models.load_model('MyModel_tf1')

In [24]:
xtest.reset_index(drop=True, inplace=True)
X_test_indices = sentences_to_indices(xtest, key_to_idx, max_len = max_len_sent)
loss, acc = Model.evaluate(X_test_indices, ytest, batch_size=128)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.9710773825645447
