In [1]:
import numpy as np
import os
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk import word_tokenize, ngrams
from nltk.classify import SklearnClassifier
from wordcloud import WordCloud,STOPWORDS

In [2]:
# Seed NumPy's global random number generator before the Keras imports below.
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, merge, LSTM, Lambda, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import backend as K
import codecs

Using Theano backend.


In [3]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
# Integer code for each target label.
mapping_target = {'happy': 0, 'not happy': 1}

# Integer code for each browser name; several spellings share one code.
mapping_browser = {
    'Firefox': 0, 'Mozilla': 0, 'Mozilla Firefox': 0,
    'Edge': 1, 'Internet Explorer': 1, 'InternetExplorer': 1, 'IE': 1,
    'Google Chrome': 2, 'Chrome': 2,
    'Safari': 3,
    'Opera': 4,
}

# Integer code for each device type.
mapping_device = {'Desktop': 0, 'Mobile': 1, 'Tablet': 2}

# A nested dict restricts each mapping to its own column, so one call per
# frame applies all of them; only the training frame carries the target.
train = train.replace({
    'Is_Response': mapping_target,
    'Browser_Used': mapping_browser,
    'Device_Used': mapping_device,
})
test = test.replace({
    'Browser_Used': mapping_browser,
    'Device_Used': mapping_device,
})

In [5]:
# Directory joined with the GloVe file name when the word vectors are loaded.
GLOVE_DIR = 'glove/'
# Length passed as `maxlen` to pad_sequences and as the model's input length.
MAX_SEQUENCE_LENGTH = 300
# Vocabulary cap passed to the Keras Tokenizer (`num_words`) and used to size
# the embedding matrix.
MAX_NB_WORDS = 200000
# Width of each embedding vector; the loaded file is glove.6B.300d.txt.
EMBEDDING_DIM = 300
# Passed to model.fit as `validation_split`.
VALIDATION_SPLIT = 0.01

In [6]:
# Pull out the test identifiers (written to the submission file later) and the
# training labels as standalone Series.
test_id, target = test['User_ID'], train['Is_Response']

In [9]:
# function to clean data
import string
import itertools 
import re
from nltk.stem import WordNetLemmatizer
# English stop words as a set; cleanData consults it when remove_stops=True.
stops = set(stopwords.words("english"))
# punct = list(string.punctuation)
# punct.append("''")
# punct.append(":")
# punct.append("...")
# punct.append("@")
# punct.append('""')
def cleanData(text, lowercase = False, remove_stops = False, stemming = False, lemmatization = False):
    """Normalise one raw text value into a letters-and-whitespace string.

    Steps, in order:
      1. Coerce ``text`` to ``str``.
      2. Drop URLs and e-mail addresses.
      3. Expand a fixed list of English contractions (case-sensitive).
      4. Replace the emoticons ``:)``, ``:D``, ``:P`` with ``Happy`` and
         ``:(`` with ``Sad``.
      5. Replace every character that is not an ASCII letter or whitespace
         (punctuation, symbols, digits) with a space.
      6. Truncate every run of one repeated character to two characters.
      7. Apply the optional steps selected by the flags.

    Args:
        text: Value to clean; anything accepted by ``str()``.
        lowercase: Lower-case every token and join tokens with single spaces.
        remove_stops: Drop tokens found in the module-level ``stops`` set.
            The comparison is case-sensitive.
        stemming: Stem every token with NLTK's ``PorterStemmer``.
        lemmatization: Lemmatize every token as a verb (``pos='v'``) with
            ``WordNetLemmatizer``.

    Returns:
        The cleaned string. Whitespace is only collapsed to single spaces
        when at least one of the optional flags is set.
    """
    txt = str(text)

    # Remove urls and emails first, while their punctuation is still intact.
    # Doing this before the emoticon step keeps a ":P"/":D" inside a URL from
    # being rewritten, and matching anywhere (not only at line start) removes
    # just the URL instead of the whole line that begins with one.
    txt = re.sub(r'https?://\S+', ' ', txt)
    txt = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', txt)

    # Replace apostrophes with standard lexicons. Order matters: the full
    # negative forms must be expanded before the generic suffix rules.
    # Note that "'s" is rewritten to " is" for possessives as well.
    for contraction, expansion in (
        ("isn't", "is not"),
        ("aren't", "are not"),
        ("ain't", "am not"),
        ("won't", "will not"),
        ("didn't", "did not"),
        ("shan't", "shall not"),
        ("haven't", "have not"),
        ("hadn't", "had not"),
        ("hasn't", "has not"),
        ("don't", "do not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("doesn't", "does not"),
        ("'s", " is"),
        ("'re", " are"),
        ("'m", " am"),
        ("'d", " would"),
        ("'ll", " will"),
    ):
        txt = txt.replace(contraction, expansion)

    # Emoticon replacement
    txt = re.sub(r':\)|:D|:P', ' Happy ', txt)
    txt = re.sub(r':\(', ' Sad ', txt)

    # Remove punctuation, symbols and digits in one pass, then flatten
    # newlines (other whitespace characters are left for split() to handle).
    txt = re.sub(r'[^A-Za-z\s]', ' ', txt)
    txt = txt.replace('\n', ' ')

    # Cap runs of a repeated character at two, e.g. "sooooooo" -> "soo".
    # Two are kept so that legitimate doubles such as "good" survive.
    txt = ''.join(''.join(run)[:2] for _, run in itertools.groupby(txt))

    if lowercase:
        txt = " ".join(txt.lower().split())

    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])

    if stemming:
        # Imported here because the notebook's import cell only brings in
        # SnowballStemmer; without this the branch raised NameError.
        from nltk.stem.porter import PorterStemmer
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join([wordnet_lemmatizer.lemmatize(w, pos='v') for w in txt.split()])

    return txt

# string = str("I read thNe reviews, saw the location and price and decided that booking a room here was a good idea. My advice:don't do it people, just don't. Go for a brand name hotel, and pay a little more. That way, if something goes horribly wrong, you may have a shot at getting some kind of resolution.This is a true story, swear to God: my husband went to wipe himself off with a towel-twice while taking a shower and both of the towels had fe..s on them. Yes, fe..s. He flipped out and took the towels to the front desk. The manager was not there, and upon checking out, I explained to the girl at the front desk that not only was this appalling, but dangerous. Particularly to my one month old baby. I also told her, that I did not feel like I should pay for our stay there. I did pay though (Dumb. In retrospect, I should not have), because I just thought I would settle this with the manager later. My thought was the manager would listen to what happened to us, and act appropriately. That didn't happen. I emailed """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""Mike"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""", the manager, then followed up with a phone call. He basically called m a liar, said that """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""never in my career have I ever heard of something like this"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""", to which I replied, """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""Well I haven't heard of anything like this either in all my years of traveling, but this really did happen."""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" He then proceeded to give me the third degree about where the towels were located, when this happened blah blah, etc. It was very apparent that he felt that this was some creative attempt on my family's part to get a refund. God, what a jerk. I am POed all over again just writing this. 
I asked for a refund, he said no, then I told him that I would need to file a complaint with the BBB. He then said, """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""Go ahead! I don't care, you can threaten me all you want!"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" Uh, no Mike this isn't personal, just horrendous customer service. Like, on very level imaginable. Whatever. I told him, fine. If this is the way you choose to handle this. I filed a complaint w-the BBB. come to think of it, I think I'll contact The Health Department too.")
# print (cleanData(string, lowercase=True, remove_stops=True, stemming=False, lemmatization = True))

# print (cleanData(string, lowercase=True, remove_stops=True, stemming=True, lemmatization = False))

In [10]:
# Clean the review text in both frames with identical settings:
# lower-cased and verb-lemmatized, stop words kept, no stemming.
train['Description'] = train['Description'].apply(
    cleanData, lowercase=True, remove_stops=False, stemming=False, lemmatization=True
)
test['Description'] = test['Description'].apply(
    cleanData, lowercase=True, remove_stops=False, stemming=False, lemmatization=True
)

In [11]:
print('Indexing word vectors.')
# Maps each token in the GloVe file to its vector as a float32 array.
embeddings_index = {}
# The context manager closes the file even when a malformed line makes the
# float conversion raise part-way through.
with codecs.open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ..." separated by single spaces.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [12]:
print('Processing text dataset')
# Materialise the cleaned training descriptions as a plain Python list.
texts_1 = list(train['Description'])

# Label ids as a pandas Series.
labels = train['Is_Response']
print('Found %s texts.' % len(texts_1))

# Same for the test descriptions.
test_texts_1 = list(test['Description'])
print('Found %s texts.' % len(test_texts_1))

Processing text dataset
Found 38932 texts.
Found 29404 texts.


In [13]:
# Fit a single vocabulary over the training and test texts together.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + test_texts_1)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert both corpora to integer sequences with the shared tokenizer.
sequences_1 = tokenizer.texts_to_sequences(texts_1)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)

# Pad both sets to a common length and turn the labels into an array.
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

# The unpadded sequences are no longer needed; drop them and collect.
del sequences_1, test_sequences_1
import gc
gc.collect()

Found 51296 unique tokens.
Shape of data tensor: (38932, 300)
Shape of label tensor: (38932,)


0

In [14]:
print('Preparing embedding matrix.')
# Row count is the capped vocabulary size plus one.
nb_words = 1 + min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

# Copy the pretrained vector into the row of every in-range token that has
# one; tokens without a pretrained vector keep their all-zero row.
for token, row in word_index.items():
    if row < nb_words:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

# Counts rows whose components sum to exactly zero.
print('Null word embeddings: %d' % np.sum(embedding_matrix.sum(axis=1) == 0))

Preparing embedding matrix.
Null word embeddings: 19309


In [15]:
# Embedding layer initialised from the prepared matrix and kept frozen
# (trainable=False) during training.
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [20]:
# Model Architecture #
# Frozen embeddings -> Conv1D -> max-pooling -> two dense blocks -> 2-way softmax.
DROPOUT = 0.2

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)

# Convolutional feature extractor over the embedded token sequence.
x1 = Conv1D(128, 3, activation='relu')(embedded_sequences_1)
x1 = MaxPooling1D(10)(x1)
x1 = Flatten()(x1)
x1 = Dense(64, activation='relu')(x1)
x1 = Dropout(DROPOUT)(x1)

# Classification head.
merged = Dense(64, activation='relu')(x1)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
preds = Dense(2, activation='softmax')(merged)

# `inputs=` / `outputs=` are the Keras 2 keyword names; the Keras 1 spellings
# `input=` / `output=` are deprecated and rejected by later releases.
model = Model(inputs=sequence_1_input, outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])



In [None]:
# Train on one-hot encoded labels, holding out VALIDATION_SPLIT of the rows for
# validation. `epochs` is the Keras 2 name for the deprecated `nb_epoch`.
model.fit(data_1, to_categorical(labels), validation_split=VALIDATION_SPLIT, epochs=25, batch_size=516, shuffle=True)

  if __name__ == '__main__':


Train on 38542 samples, validate on 390 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25

In [22]:
# Score the padded test sequences; the model's final layer is a 2-unit softmax,
# so each row of `pred` holds two class scores.
pred = model.predict(test_data_1)
#print(pred.shape)

In [23]:
# Turn each pair of class scores into a label string: column 0 wins ties and
# maps to 'happy', otherwise the row is labelled 'not_happy'.
preds = ['happy' if scores[0] >= scores[1] else 'not_happy' for scores in pred]

In [24]:
# Assemble the submission table: one row per test id with its predicted label.
result = pd.DataFrame({'User_ID': test_id, 'Is_Response': preds})

# Translate any integer class ids to label strings; values that are already
# label strings pass through unchanged.
mapping = {0:'happy', 1:'not_happy'}
result = result.replace({'Is_Response': mapping})

result.to_csv("nn_predicted_result_1.csv", index=False)