In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [50]:
raw_data = pd.read_excel('data/compiled_Data.xlsx')
raw_data

Unnamed: 0,SID,CATEGORY,DELAY,IP,IP VALUE,CUES,STUDY
0,UB2MD201P,EFT,30,IP30,789.06,in about 1 month i am at the movies with my co...,14
1,UB2MD201P,EFT,180,IP180,554.70,in about 6 months my sister carolyn is home fr...,14
2,UB2MD201P,EFT,365,IP365,632.80,In about 1 year i am planning for christmas. ...,14
3,UB2MD207,EFT,30,IP30,351.56,In about one month I am kayaking on Lake Erie....,14
4,UB2MD207,EFT,180,IP180,117.20,In about 6 months I am at the WHO concert in C...,14
...,...,...,...,...,...,...,...
3023,159,EFT,180,IP180,58.60,,Sara
3024,159,EFT,730,IP730,35.16,,Sara
3025,160,EFT,30,IP30,46.10,,Sara
3026,160,EFT,180,IP180,24.22,,Sara


In [51]:
'''
Dropping rows with empty cues and nan IP values
'''
import math
raw_data.dropna(subset=['CUES', 'IP VALUE'], inplace=True)

In [52]:
'''
Normalizing the IP values to the nearest 10th multiple
'''
temp_IP = []
for item in raw_data['IP VALUE']:
    temp_val = 0
    if item > 100:
        temp_val = math.ceil(item/100)*10
    else:
        temp_val = math.ceil(item/10)*10
    temp_IP.append(temp_val)
raw_data['IP VALUE'] = temp_IP

In [53]:
# raw_data = raw_data.loc[raw_data['IP VALUE'] < 100]
raw_data

Unnamed: 0,SID,CATEGORY,DELAY,IP,IP VALUE,CUES,STUDY
0,UB2MD201P,EFT,30,IP30,80,in about 1 month i am at the movies with my co...,14
1,UB2MD201P,EFT,180,IP180,60,in about 6 months my sister carolyn is home fr...,14
2,UB2MD201P,EFT,365,IP365,70,In about 1 year i am planning for christmas. ...,14
3,UB2MD207,EFT,30,IP30,40,In about one month I am kayaking on Lake Erie....,14
4,UB2MD207,EFT,180,IP180,20,In about 6 months I am at the WHO concert in C...,14
...,...,...,...,...,...,...,...
2783,79,EFT,180,IP180,30,"In about 6 months, I am enjoying New Year's we...",Sara
2784,79,EFT,730,IP730,10,"In about 2 years, I am trying to figure out wh...",Sara
2785,80,EFT,30,IP30,100,"In about 1 month, I am helping coach my son's ...",Sara
2786,80,EFT,180,IP180,90,"In about 6 months, I am enjoying a business tr...",Sara


In [54]:
'''
Splitting the data into EFT and ERT 
- Creating two new dataframes with the respective data.
'''

eft_raw_data = raw_data[raw_data['CATEGORY']=='EFT']
ert_raw_data = raw_data[raw_data['CATEGORY']=='ERT']

In [55]:
eft_raw_data

Unnamed: 0,SID,CATEGORY,DELAY,IP,IP VALUE,CUES,STUDY
0,UB2MD201P,EFT,30,IP30,80,in about 1 month i am at the movies with my co...,14
1,UB2MD201P,EFT,180,IP180,60,in about 6 months my sister carolyn is home fr...,14
2,UB2MD201P,EFT,365,IP365,70,In about 1 year i am planning for christmas. ...,14
3,UB2MD207,EFT,30,IP30,40,In about one month I am kayaking on Lake Erie....,14
4,UB2MD207,EFT,180,IP180,20,In about 6 months I am at the WHO concert in C...,14
...,...,...,...,...,...,...,...
2783,79,EFT,180,IP180,30,"In about 6 months, I am enjoying New Year's we...",Sara
2784,79,EFT,730,IP730,10,"In about 2 years, I am trying to figure out wh...",Sara
2785,80,EFT,30,IP30,100,"In about 1 month, I am helping coach my son's ...",Sara
2786,80,EFT,180,IP180,90,"In about 6 months, I am enjoying a business tr...",Sara


In [56]:
ert_raw_data

Unnamed: 0,SID,CATEGORY,DELAY,IP,IP VALUE,CUES,STUDY
123,UB2MD202P,ERT,1,IP1,0,As I was on the 5 Niagara bus headed towards U...,14
124,UB2MD202P,ERT,6,IP6,0,"It was a decent day outside, and ever since I ...",14
125,UB2MD202P,ERT,12,IP12,0,"I was at home that day, and I had no prior eng...",14
126,UB2MD205,ERT,1,IP1,0,About 24 hours ago I was conversing with frien...,14
127,UB2MD205,ERT,6,IP6,0,About 144 hours ago I was at a Sabres game wit...,14
...,...,...,...,...,...,...,...
2543,16970,ERT,180,IP180,10,About 2 days ago I was invited to a snowmachin...,20
2544,16970,ERT,180,IP180,10,About 3 days ago I was invited to a whisky pla...,20
2545,13301,ERT,180,IP180,100,"About 24 hours ago, I was at home with my frie...",20
2546,13301,ERT,180,IP180,100,"About 2 days ago, I was on the plan home from ...",20


In [57]:
'''
Rename the IP VALUE column, to replace the space with a _
'''
eft_raw_data = eft_raw_data.rename(columns = {'IP VALUE':'IP_VALUE'})
ert_raw_data = ert_raw_data.rename(columns = {'IP VALUE':'IP_VALUE'})

In [58]:
'''
Sort the dataframes by the DELAY periods. 
'''
eft_raw_data = eft_raw_data.sort_values(by=['DELAY'])
eft_raw_data = eft_raw_data.reset_index(drop=True)

ert_raw_data = ert_raw_data.sort_values(by=['DELAY'])
ert_raw_data = ert_raw_data.reset_index(drop=True)

In [59]:
'''
Remove the rows with the IP_VALUES = 0, as they hold no value
'''
ert_raw_data = ert_raw_data[ert_raw_data.IP_VALUE != 0]

In [60]:
ert_raw_data

Unnamed: 0,SID,CATEGORY,DELAY,IP,IP_VALUE,CUES,STUDY
105,120 MS1120,ERT,30,IP30,90,"About 24 hours ago, I was watching the latest ...",16
106,119 MS1119,ERT,30,IP30,100,About 24 hours ago I was at one of my favorite...,16
107,2526-084,ERT,30,IP30,100,"About 24 hours ago, I was celebrating my frien...",19
108,2526-86,ERT,30,IP30,100,About 24 hours ago I was watching Riverdale. I...,19
109,118 MS1118,ERT,30,IP30,90,About 24 hours ago I was at a restaurant celeb...,16
...,...,...,...,...,...,...,...
763,136 MS1136,ERT,365,IP365,60,"About 288 hours ago, I was at my friend's hous...",16
764,137 MS1137,ERT,365,IP365,70,About 288 hours ago I was at Katie's house. I ...,16
765,138 MS1138,ERT,365,IP365,30,About 288 hours ago I was at work. We have an ...,16
766,124 MS1124,ERT,365,IP365,10,About 288 hours ago I walked the dog. I walked...,16


In [61]:
eft_raw_data

Unnamed: 0,SID,CATEGORY,DELAY,IP,IP_VALUE,CUES,STUDY
0,UB2MD201P,EFT,30,IP30,80,in about 1 month i am at the movies with my co...,14
1,66 MS1066,EFT,30,IP30,100,In about 1 month I am finished with my two spe...,16
2,65 MS1065,EFT,30,IP30,90,In about 1 month I am going to be celebrating ...,16
3,64 MS1064,EFT,30,IP30,90,In about 1 month I am out to dinner with my fr...,16
4,63 MS1063,EFT,30,IP30,90,Within the next month I am moving into a diffe...,16
...,...,...,...,...,...,...,...
1753,16,EFT,9125,IP9125,10,"In about 25 years, I am at Berglund Chevy buyi...",15
1754,61,EFT,9125,IP9125,30,"In about 25 years, I am retired and relaxing. ...",15
1755,17,EFT,9125,IP9125,10,"In about 25 years, I am at Las Vegas with my h...",15
1756,11,EFT,9125,IP9125,10,"In about 25 years, I am happy that I own Ganse...",15


In [62]:
'''
Get the unique delay periods for both EFT and ERT
'''
eft_delays = eft_raw_data.DELAY.unique()
ert_delays = ert_raw_data.DELAY.unique()

In [63]:
print('Unique delay periods for EFT: ',eft_delays, '\nUnique delay periods for ERT: ', ert_delays)

Unique delay periods for EFT:  [  30   90  180  365  730 1825 9125] 
Unique delay periods for ERT:  [ 30 180 365]


In [64]:
'''
Make two new dictionaries for EFT and ERT - stores the dataframes with 
IP values and cues by dropping the other columns saved in drop_columns
for every delay period, which is the key of the dictionary, 
and reset the index of each dataframe.
'''
ert_ip_cue_dict = {}
eft_ip_cue_dict = {}
drop_columns = ['SID', 'CATEGORY', 'DELAY', 'IP', 'STUDY']

In [65]:
for delay in eft_delays:
    eft_ip_cue_dict[str(delay)] = eft_raw_data.loc[eft_raw_data['DELAY'] == delay]
    eft_ip_cue_dict[str(delay)] = eft_ip_cue_dict[str(delay)].drop(drop_columns, axis=1)
    eft_ip_cue_dict[str(delay)] = eft_ip_cue_dict[str(delay)].reset_index(drop=True)
    
for delay in ert_delays:
    ert_ip_cue_dict[str(delay)] = ert_raw_data.loc[ert_raw_data['DELAY'] == delay]
    ert_ip_cue_dict[str(delay)] = ert_ip_cue_dict[str(delay)].drop(drop_columns, axis=1)
    ert_ip_cue_dict[str(delay)] = ert_ip_cue_dict[str(delay)].reset_index(drop=True)

## Kaggle - LSTM code

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


In [67]:
sample = eft_ip_cue_dict['30']
sample

Unnamed: 0,IP_VALUE,CUES
0,80,in about 1 month i am at the movies with my co...
1,100,In about 1 month I am finished with my two spe...
2,90,In about 1 month I am going to be celebrating ...
3,90,In about 1 month I am out to dinner with my fr...
4,90,Within the next month I am moving into a diffe...
...,...,...
336,90,in about 1 month it is my birthday and I'm at ...
337,90,"In about 1 month, I am visiting my sister in c..."
338,100,In 1 month I am at Cedar Point. I am going wi...
339,100,In about a mouth I will welcoming my oldest so...


In [68]:
sample['CUES'] = sample['CUES'].apply(lambda x: x.lower())
sample['CUES'] = sample['CUES'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [70]:
for idx, row in sample.iterrows():
    row[1] = row[1].replace('rt', ' ')


In [82]:
max_features = 2000
tokenizer = Tokenizer(num_words = max_features, split = ' ')
tokenizer.fit_on_texts(sample['CUES'].values)
X = tokenizer.texts_to_sequences(sample['CUES'].values)
X = pad_sequences(X)

In [84]:
for i in range(0,101,10):
    print(i,' : ', sample[sample['IP_VALUE'] == i].size)

0  :  0
10  :  10
20  :  10
30  :  32
40  :  24
50  :  84
60  :  8
70  :  24
80  :  52
90  :  102
100  :  336


In [86]:
'''
Defining the LSTM model
'''

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 249, 128)          256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 249, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
Y = pd.get_dummies(sample['IP_VALUE']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

### Trying LSTM analysis - Medium code

In [18]:
sample = eft_ip_cue_dict['30']

sample['CUES'] = [item.lower() for item in sample['CUES']]
sample

Unnamed: 0,IP_VALUE,CUES
0,790,in about 1 month i am at the movies with my co...
1,100,in about 1 month i am finished with my two spe...
2,90,in about 1 month i am going to be celebrating ...
3,90,in about 1 month i am out to dinner with my fr...
4,90,within the next month i am moving into a diffe...
...,...,...
336,810,in about 1 month it is my birthday and i'm at ...
337,90,"in about 1 month, i am visiting my sister in c..."
338,100,in 1 month i am at cedar point. i am going wi...
339,980,in about a mouth i will welcoming my oldest so...


In [19]:
'''
Remove punctuations from the text
Do it by keeping a temporary cues buffer, changing it and reassigning
the cues to the sample dataframe
'''
temp_cues = sample['CUES']
all_cues = []
from string import punctuation
for item in temp_cues:
    item = re.sub(r'[^\w\s]','',item)
    all_cues.append(item)
sample['CUES'] = temp_cues

In [20]:
'''
Tokenize - Create Vocab to Int mapping directory
'''
from collections import Counter
all_text = ' '.join(all_cues)

# Create a list of words
words = all_text.split()

# Count all the words using Counter Method
count_words = Counter(words)
total_words = len(words)
sorted_words = count_words.most_common(total_words)

In [21]:
print(sorted_words)

[('i', 1020), ('the', 972), ('am', 808), ('and', 777), ('in', 618), ('my', 567), ('to', 469), ('we', 408), ('are', 395), ('a', 392), ('with', 386), ('about', 367), ('at', 353), ('month', 258), ('is', 225), ('of', 225), ('1', 194), ('for', 186), ('on', 164), ('be', 112), ('will', 105), ('it', 102), ('going', 95), ('family', 94), ('feeling', 90), ('excited', 89), ('our', 87), ('friends', 82), ('having', 82), ('enjoying', 82), ('have', 80), ('all', 79), ('that', 75), ('time', 74), ('her', 71), ('happy', 70), ('months', 65), ('husband', 61), ('as', 61), ('so', 55), ('out', 53), ('one', 52), ('this', 52), ('new', 51), ('see', 50), ('day', 50), ('dinner', 49), ('very', 49), ('watching', 49), ('me', 49), ('beach', 49), ('there', 47), ('from', 44), ('go', 44), ('birthday', 43), ('get', 42), ('up', 38), ('great', 38), ('by', 38), ('because', 38), ('good', 37), ('sitting', 37), ('fun', 37), ('enjoy', 35), ('favorite', 35), ('house', 35), ('eating', 34), ('together', 34), ('his', 34), ('park', 34

In [22]:
'''
Creating a vocab to int mapping dictionary
'''
vocab_to_int = {w: i+1 for i,(w,c) in enumerate(sorted_words)}

In [23]:
'''
Tokenizing - Encoding the words
'''
cues_int = []
for cue in all_cues:
    r = [vocab_to_int[w] for w in cue.split()]
    cues_int.append(r)

In [24]:
'''
Padding and Truncating the remaining data
'''
def pad_features(cues_int, seq_len):

    # Return features of cues_int, where each cue is padded with 0's or 
    # truncated to the input seq_len.
    features = np.zeros((len(cues_int), seq_len), dtype = int)
    
    for i, cue in enumerate(cues_int):
        cue_len = len(cue)
        
        if cue_len <= seq_length:
            zeroes = list(np.zeros(seq_len - cue_len))
            new = zeroes + cue
        elif cue_len > seq_len:
            new = cue[0 : seq_len]
        
        features[i,:] = np.array(new)
    
    return features

In [31]:
'''
Get features and labels for the data
'''
seq_length = 200
features = pad_features(cues_int, seq_length)
labels = sample['IP_VALUE']

In [36]:
'''
Make the training, testing and validation data sets
'''
train_size = 0.2
test_size = 0.1
val_size = 0.1

X_train = np.array( features[0:int(train_size*len(features))] )
y_train = np.array( labels[0:int(train_size*len(labels))] )

X_test = np.array( features[0:int(test_size*len(features))] )
y_test = np.array( labels[0:int(test_size*len(labels))] )

X_val = np.array( features[0:int(val_size*len(features))] )
y_val = np.array( labels[0:int(val_size*len(labels))] )

In [37]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [38]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   0,    0,    0,  ..., 1540,  295,  873],
        [   0,    0,    0,  ...,   49,    2,  341],
        [   0,    0,    0,  ...,  456,   16,   35],
        ...,
        [   0,    0,    0,  ...,   11,   69, 1505],
        [   0,    0,    0,  ...,  367,   18,  221],
        [   0,    0,    0,  ...,  815,    4,  208]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([ 80,  90,  90,  50, 100, 100, 100,  90,  90,  90,  50,  50, 100,  90,
         40,  90,  80, 100, 100,  60,  40, 100, 790,  60,  80,  50, 100, 100,
        100,  90, 100,  30,  90,  80, 100, 100,  70, 100, 100,  90,  90,  90,
         90,  90, 100,  50,  90, 100, 100, 100])


In [42]:
'''
Defining the LSTM network:
0. Tokenize : This is not a layer for LSTM network but a mandatory step of converting our words into tokens (integers)
1. Embedding Layer: that converts our word tokens (integers) into embedding of specific size
2. LSTM Layer: defined by hidden state dims and number of layers
3. Fully Connected Layer: that maps output of LSTM layer to a desired output size
4. Sigmoid Activation Layer: that turns all output values in a value between 0 and 1
5. Output: Sigmoid output from the last timestep is considered as the final output of this network
'''

from torch import nn
vocab_size = len(cues_int) + 1
embedding_dim = 30
embeds = nn.Embedding(vocab_size, embedding_dim)

print('Embedding layer is ', embeds)
print('Embedding layer weights ', embeds.weight.shape)

Embedding layer is  Embedding(342, 30)
Embedding layer weights  torch.Size([342, 30])


In [44]:
sample_x = sample_x.type(torch.LongTensor)
embeds_output = embeds(sample_x)

RuntimeError: index out of range at c:\n\pytorch_1559129895673\work\aten\src\th\generic/THTensorEvenMoreMath.cpp:191