In [8]:
from numpy import random
# Sizes used for the random arrays below.
N, d, C = 500, 300, 5

# One (C, d) matrix of uniform samples from [0, 1).
W = random.rand(C, d)

# N separate column vectors of shape (d, 1), drawn one after another.
W_list = [
    random.rand(d, 1)
    for sample_number in range(N)
]
W_list

[array([[  3.81215752e-01],
        [  5.06927594e-01],
        [  6.64306701e-02],
        [  6.86955932e-01],
        [  3.08407895e-01],
        [  8.91315670e-01],
        [  6.77163172e-01],
        [  3.85422475e-01],
        [  6.60168204e-01],
        [  1.75715463e-01],
        [  6.56663468e-01],
        [  3.17420490e-01],
        [  1.15640412e-01],
        [  2.93983164e-02],
        [  1.93488046e-02],
        [  1.82568112e-01],
        [  6.26052783e-02],
        [  3.87794274e-01],
        [  2.08543104e-01],
        [  6.04409300e-01],
        [  4.03450622e-01],
        [  5.77214024e-01],
        [  9.22606050e-01],
        [  5.66865749e-01],
        [  8.43554869e-01],
        [  1.68170965e-01],
        [  6.67174371e-02],
        [  1.75829379e-01],
        [  3.10137165e-01],
        [  2.63028179e-01],
        [  6.10043868e-01],
        [  3.73876638e-01],
        [  9.12263099e-01],
        [  5.59796397e-01],
        [  7.16248097e-01],
        [  1.0657535

In [11]:
# Sanity check: W_list is built with one entry per value of range(N), so this should equal N.
len(W_list)

500

In [1]:
from __future__ import unicode_literals
import numpy as np 
import pandas as pd
from sklearn.model_selection  import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation, RepeatVector

def preprocess(filename, window=10):
    """Load a token-level CSV and turn it into sliding-window model inputs.

    The CSV must contain a ``label`` column. The columns ``word``,
    ``word_pos`` and ``word_lemma`` are dropped when present; every remaining
    non-label column is used as a feature, with categorical columns one-hot
    encoded by ``pd.get_dummies``.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    window : int, optional
        Number of consecutive rows per input sequence (default 10, the value
        that was previously hard-coded).

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(n_rows - window, window, n_features)``; entry ``i``
        holds feature rows ``i .. i + window - 1``. When the file has
        ``window`` rows or fewer, the first dimension is 0.
    Y : pandas.Series
        Labels encoded as the strings ``'0'`` (other), ``'1'`` (start),
        ``'2'`` (middle) and ``'3'`` (end). ``Y`` has the same length as
        ``X``: entry ``i`` is the label of row ``i + window // 2``. The
        original row index of the CSV is preserved.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    KeyError
        If the CSV has no ``label`` column.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')

    df = pd.read_csv(filename)

    # Raw token columns are not used as features. `axis` is passed by keyword
    # because recent pandas releases no longer accept it positionally.
    df = df.drop(['word', 'word_pos', 'word_lemma'], axis=1, errors='ignore')

    label = df['label']
    # uint8 was the historical get_dummies default; pin it so the feature
    # dtype does not silently become bool on newer pandas.
    features = pd.get_dummies(df.drop('label', axis=1), dtype=np.uint8)
    values = np.asarray(features)

    # Reshape X into a 3-dimensional (window, step, feature) array.
    # The last possible window (starting at n_rows - window) is deliberately
    # left out, as before, so that X and Y end up with the same length.
    n_windows = max(len(values) - window, 0)
    if n_windows:
        X = np.stack([values[first:first + window] for first in range(n_windows)])
    else:
        X = np.empty((0, window, values.shape[1]), dtype=values.dtype)

    # Reshape Y: normalise the two known trailing-space variants and encode
    # every tag as its class id (kept as strings, as before). A single
    # non-inplace mapping avoids mutating a column taken from `df`.
    label = label.replace({
        'end ': '3',
        'start ': '1',
        'other': '0',
        'start': '1',
        'middle': '2',
        'end': '3',
    })
    half = window // 2
    # For window=10 this is exactly label[5:-5].
    Y = label.iloc[half:half + n_windows]
    return (X, Y)

def _precision_recall_f1(n_correct, n_predicted, n_actual):
    """Return (precision, recall, F1); any ratio with a zero denominator is 0.0."""
    precision = float(n_correct) / n_predicted if n_predicted else 0.0
    recall = float(n_correct) / n_actual if n_actual else 0.0
    total = precision + recall
    f1 = 2 * precision * recall / total if total else 0.0
    return precision, recall, f1


def training(X, Y, dictionary=None, epochs=50, batch_size=512):
    """Train a stacked bidirectional LSTM classifier and print its test metrics.

    The data is split 70/30 (``random_state=1``), a two-layer bidirectional
    LSTM with a 4-way softmax output is fitted on the training part, and
    precision / recall / F1 are printed for all non-zero classes together and
    for classes 1 (start), 2 (middle) and 3 (end) separately.

    Parameters
    ----------
    X : numpy.ndarray
        Input sequences; ``X.shape[1:]`` is used as the model's input shape.
    Y : pandas.Series or array-like
        Class ids in ``0..3`` (ints or digit strings), one per entry of ``X``.
    dictionary : pandas.DataFrame, optional
        Frame with a ``word`` column, indexed like ``Y`` (for example the raw
        CSV read with ``pd.read_csv``). When given, words following every
        correctly predicted class-1 row are printed. The previous code read an
        undefined global here and failed with ``NameError``; when the argument
        is omitted this step is now skipped.
    epochs : int, optional
        Number of training epochs (default 50).
    batch_size : int, optional
        Mini-batch size (default 512).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a label is outside ``0..3``.
    """
    num_classes = 4

    # Split dataset into training set and testing set
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.30, random_state=1)

    train_ids = np.asarray(y_train).astype(int)
    actual = np.asarray(y_test).astype(int)
    for ids in (train_ids, actual):
        if ids.size and (ids.min() < 0 or ids.max() >= num_classes):
            raise ValueError('labels must be class ids in the range 0..3')

    # One-hot encode by class id rather than with get_dummies: the target
    # always has 4 columns (matching Dense(4)) and column k is class k, even
    # when a class happens to be absent from the training split.
    y_train_np = np.eye(num_classes, dtype='float32')[train_ids]

    # Train the Bi-directional neural network
    model = Sequential()
    model.add(Bidirectional(LSTM(320, return_sequences=True),
                            input_shape=(X_train.shape[1:])))
    model.add(Bidirectional(LSTM(320)))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model.fit(X_train, y_train_np, batch_size=batch_size, epochs=epochs)

    # Validation and evaluation
    prediction = model.predict(X_test)
    predicted = np.argmax(prediction, axis=-1)
    correct = predicted == actual

    # Precision / recall / F1 over every non-"other" class taken together.
    SME = int(np.count_nonzero(actual != 0))
    SME_pred = int(np.count_nonzero(predicted != 0))
    SME_pred_acc = int(np.count_nonzero(correct & (predicted != 0)))
    precision, recall, f1 = _precision_recall_f1(SME_pred_acc, SME_pred, SME)

    print('Precision: ', SME_pred, SME_pred_acc, precision)
    print('Recall: ', SME, SME_pred_acc, recall)
    print('F1 Score: ', f1)

    # The same three metrics for each tagged class on its own.
    for title, class_id in (('Start: ', 1), ('Middle: ', 2), ('End: ', 3)):
        n_actual = int(np.count_nonzero(actual == class_id))
        n_pred = int(np.count_nonzero(predicted == class_id))
        n_correct = int(np.count_nonzero(correct & (predicted == class_id)))
        p, r, f = _precision_recall_f1(n_correct, n_pred, n_actual)
        print(title, n_actual, n_pred, n_correct, 'P: ', p, 'R: ', r, 'F1: ', f)

    # Display Itemname
    if dictionary is None:
        return

    words = dictionary['word']
    # pd.Series keeps the index of y_test when it already is a Series.
    test_index = pd.Series(y_test).index
    for index in test_index[correct & (predicted == 1)]:
        # NOTE(review): when Y comes from preprocess() its index already holds
        # the CSV row numbers, so this +5 may be a double offset - confirm
        # which rows are meant to be shown. The offsets are kept as they were.
        row_id = index + 5
        # Rows past the end of the frame are shown as empty strings instead
        # of raising KeyError.
        following = [words.get(row_id + step, '') for step in range(1, 5)]
        print(*(following + [row_id]))
            
# Entry point: build the model inputs from the CSV. X and Y are assigned at
# module level here and are read by the inspection cells further down.
if __name__ == '__main__':
    filename = 'dataset_RNN.csv'
    X,Y = preprocess(filename)
    # Training is currently disabled; uncomment the call below to fit and evaluate the model.
    #training(X,Y)
    

Using TensorFlow backend.


In [5]:
# Per-sample shape (rows per window, feature columns); training() passes this as the LSTM input_shape.
X.shape[1:]

(10, 25)

In [4]:
# Number of labels returned by preprocess(); it should equal the number of windows in X.
Y.shape

(109998,)