In [1]:
from w2v_utils import *
import pandas as pd 
import os
from bs4 import BeautifulSoup 
# The package comes built-in with Python
import re
# http://www.nltk.org/install.html
import nltk
from nltk.corpus import stopwords # Import the stop word list

Using TensorFlow backend.


In [2]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import plot_model

## Functions

In [3]:
def text_to_words(raw_text):
    """Turn one raw review into a cleaned, space-separated string of words.

    The pipeline strips HTML markup, replaces every character that is not an
    ASCII letter with a space, lower-cases the text and drops stop words.

    Arguments:
    raw_text -- string, a single raw review (may contain HTML)

    Returns:
    clear_text -- string, the remaining lower-case words joined by single spaces

    Note: stop words are read from the module-level name `stops`, which has to
    be defined before this function is called. Stemming or lemmatizing (both
    available in NLTK) could be added here to fold word variants together.
    """
    # Parse with the lxml backend and keep only the text content.
    html_free = BeautifulSoup(raw_text, "lxml").get_text()

    # Anything outside a-z / A-Z (digits, punctuation, accents) becomes a space.
    alpha_only = re.sub("[^a-zA-Z]", " ", html_free)

    # Lower-case, split on whitespace and filter out the stop words.
    kept = []
    for token in alpha_only.lower().split():
        if token in stops:
            continue
        kept.append(token)

    return " ".join(kept)

In [4]:
# GRADED FUNCTION: sentence_to_avg

def sentence_to_avg(sentence, word_to_vec_map):
    """
    Average the word vectors of a sentence into one fixed-size vector.

    The sentence is split on whitespace and lower-cased; words that have no
    entry in word_to_vec_map are skipped.

    Arguments:
    sentence -- string, one training example from X
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
                       (50-dimensional for glove.6B.50d)

    Returns:
    avg -- numpy-array with the same shape as the word vectors ((50,) for glove.6B.50d);
           all zeros when the sentence contains no known word
    """
    # Keep only words that actually have an embedding. Checking the map itself
    # (instead of a precomputed global list of unknown words) makes the function
    # self-contained and turns every lookup into a dict membership test.
    lowered = (token.lower() for token in sentence.split())
    known_words = [w for w in lowered if w in word_to_vec_map]

    # Take the vector shape from the map so other embedding sizes work too;
    # fall back to the original 50 dimensions if the map is empty.
    sample_vec = next(iter(word_to_vec_map.values()), None)
    vec_shape = np.shape(sample_vec) if sample_vec is not None else (50,)
    avg = np.zeros(vec_shape)

    # An empty sentence (or one made only of unknown words) would otherwise
    # divide 0 by 0 and produce a NaN vector that poisons training.
    if not known_words:
        return avg

    for w in known_words:
        avg += word_to_vec_map[w]

    return avg / len(known_words)

In [5]:
def read_glove_vecs(glove_file):
    """Load a GloVe text file into a vocabulary set and a word -> vector dict.

    Each line is split on whitespace: the first field is the word and the
    remaining fields are parsed as float64 vector components.

    Arguments:
    glove_file -- path to the UTF-8 encoded GloVe text file

    Returns:
    words -- set of all words found in the file
    word_to_vec_map -- dictionary mapping each word to its numpy vector
    """
    vocab = set()
    embeddings = {}

    with open(glove_file, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            token = fields[0]
            vocab.add(token)
            embeddings[token] = np.array(fields[1:], dtype=np.float64)

    return vocab, embeddings

## Import data

In [6]:
# Load the training CSV; the cleaning cell below reads its `review` column.
df_train = pd.read_csv("Data/task_train.csv")

In [7]:
# Load the test CSV into its own frame.
df_test = pd.read_csv("Data/task_test.csv")

In [8]:
# Parse the GloVe file into a vocabulary set and a word -> vector dictionary.
words, word_to_vec_map = read_glove_vecs('Data/glove.6B.50d.txt')

## Clean and preprocess

In [13]:
# Build the English stop-word set once. text_to_words reads this module-level
# `stops` on every call, so this cell must run before any review is cleaned.
stops = set(stopwords.words("english"))

In [14]:
# Clean every review. Series.apply hands each review string straight to
# text_to_words, avoiding the per-row Series that DataFrame.apply(axis=1) builds.
df_train['cleaned_text'] = df_train['review'].apply(text_to_words)

In [15]:
# NOTE(review): this rebinds df_train to the *test* frame, so every later cell
# that says df_train actually operates on df_test (same object, not a copy).
# The next cell reads df_train['cleaned_text'], but no visible cell adds that
# column to df_test -- confirm this survives Restart & Run All, and consider a
# separate, explicitly named variable instead of reusing df_train.
df_train = df_test

In [16]:
# Flatten all cleaned reviews into one token list. A single comprehension is
# linear in the number of tokens, whereas `merged = merged + ...` copied the
# whole accumulated list again for every review.
merged = [token for review in df_train['cleaned_text'] for token in review.split()]

In [17]:
len(merged)

303347

In [18]:
# Distinct tokens that occur in the cleaned reviews.
vocabulary = np.unique(np.array(merged))

In [19]:
# Collect the vocabulary words that have no GloVe vector. A membership test
# replaces the bare `except:`, which would also have recorded unrelated
# failures (even KeyboardInterrupt) as "unknown word".
errors = [word for word in vocabulary if word not in word_to_vec_map]

In [20]:
len(errors)

1844

In [21]:
# One averaged embedding per cleaned review, in the frame's row order.
data = [sentence_to_avg(text, word_to_vec_map) for text in df_train['cleaned_text']]

In [134]:
# Feature matrix: one row per review.
X = np.array(data)
# One-hot encode the sentiment labels in a single step.
y = to_categorical(df_train['sentiment'])

In [135]:
y

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [137]:
# Hold out 20% of the samples for evaluation; random_state fixes the split.
# NOTE(review): train_test_split is not imported in any visible cell --
# presumably it leaks in through `from w2v_utils import *`; confirm, or import
# it explicitly from sklearn.model_selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [138]:
X_train.shape

(7860, 50)

In [139]:
y_train.shape

(7860, 2)

## Model

In [26]:
# Feed-forward classifier on the 50-d averaged embeddings: two ReLU hidden
# layers, each followed by 50% dropout, then a 2-way softmax output.
model = Sequential([
    Dense(32, input_dim=50, activation='relu'),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [27]:
# The network ends in a 2-unit softmax and the targets are one-hot vectors
# (to_categorical), which is the categorical cross-entropy setting. With
# 'binary_crossentropy' Keras would also resolve 'accuracy' to element-wise
# binary accuracy instead of categorical accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [36]:
# Train for 100 epochs in batches of 512; validation_split sets aside another
# 20% of X_train for per-epoch validation (6288 train / 1572 validation below).
# NOTE(review): no random seed is set in the visible cells, so results may
# differ between runs.
model.fit(X_train, y_train, epochs=100, batch_size=512, validation_split=0.20)

Train on 6288 samples, validate on 1572 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1a24f2e4a8>

In [63]:
# Persist the trained model weights to an HDF5 file in the working directory.
model.save_weights('model2.h5')

In [38]:
# Training-set loss and accuracy.
print("Evaluating training data")
print("====================================================================")
train_score = model.evaluate(X_train, y_train)
# This figure is measured on the training split, so label it accordingly
# (it was previously printed as "Test accuracy").
print("Train accuracy = ", train_score[1])
print("====================================================================")

print()
# Held-out test-set loss and accuracy.
print("Evaluating testing data")
print("====================================================================")
test_score = model.evaluate(X_test, y_test)
# Predicted class index per test sample; the F1 cell below consumes this.
Y_test_hat = np.argmax(model.predict(X_test),axis=1)
print("Test accuracy = ", test_score[1])
print("====================================================================")

Evaluating training data
Test accuracy =  0.795292620865

Evaluating testing data
Test accuracy =  0.763987792533


In [39]:
from sklearn.metrics import f1_score
# y_test is one-hot, so argmax turns it back into class indices that can be
# compared with the predicted class indices in Y_test_hat.
print('F1-score of the algorithm is %f' %(f1_score(np.argmax(y_test,axis=1), Y_test_hat)))

F1-score of the algorithm is 0.762781


In [110]:
# Predict a class for every row of X (the full feature matrix, not only the
# held-out split). NOTE(review): this reuses the name Y_test_hat and thereby
# overwrites the test-split predictions that the F1 cell above relied on.
Y_test_hat = np.argmax(model.predict(X),axis=1)

In [111]:
Y_test_hat

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [113]:
# Attach the predictions to the test frame. df_train was rebound to df_test
# earlier in the notebook, so the new column is visible through df_train too.
df_test['prediction'] = Y_test_hat

In [118]:
# Submission table: id plus predicted label. df_train refers to the same object
# as df_test at this point, which is where the 'prediction' column was added.
results = df_train[['id','prediction']]

In [120]:
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; pass index=False if the consumer expects only id,prediction -- confirm.
results.to_csv('temp_team.csv')

In [124]:
# Export the raw and the cleaned text side by side (the row index is written as well).
df_train[['id','review','cleaned_text']].to_csv('cleaned_training.csv')

In [6]:
# Render the layer graph, including tensor shapes, to model.png.
plot_model(model, to_file="model.png", show_shapes=True)

In [22]:
import pickle
# Open the pickled train/test splits in binary mode; the handle `f` is consumed
# by the next cell.
f = open("Data/train_test_data.pkl", 'rb')

In [23]:
# Unpickling can execute code embedded in the file, so only load pickles that
# come from a trusted source. NOTE(review): `data` held the list of sentence
# vectors earlier in the notebook; it is rebound to the loaded mapping here.
try:
    data = pickle.load(f)
finally:
    # The handle was opened without a context manager in the previous cell;
    # close it here so it is not leaked, even if loading fails.
    f.close()

In [35]:
# Restore the four saved splits from the loaded mapping.
X_train, y_train, X_test, y_test = (
    data[key] for key in ("X_train", "y_train", "X_test", "y_test")
)

## TextBlob

In [40]:
from textblob import TextBlob

In [56]:
def get_polarity(review):
    """Label a review 1 when TextBlob's polarity is strictly positive, else 0.

    Arguments:
    review -- string, the text to score

    Returns:
    1 if the polarity is > 0, otherwise 0 (zero and negative polarity both map to 0)
    """
    score = TextBlob(review).sentiment.polarity
    return 1 if score > 0 else 0

In [69]:
# Vader scores for one sample review (row label 3). The result is a dict of
# neg/neu/pos/compound scores, not a list of per-review labels.
# NOTE(review): `analyzer` and `cleaned_train` are only defined in later cells,
# so this cell fails on a fresh top-to-bottom run.
polarity_pred = analyzer.polarity_scores(cleaned_train['review'][3])

In [81]:
# TextBlob-based 0/1 label for every review.
polarity_pred_blob = list(map(get_polarity, cleaned_train['review']))

In [80]:
# Use the per-review TextBlob labels; `polarity_pred` holds the Vader score
# dict of a single review and is not a prediction vector.
y_pred_blob = polarity_pred_blob

In [70]:
# Ground-truth sentiment labels for the reviews in cleaned_train.
y_true = cleaned_train['sentiment']

In [71]:
from sklearn.metrics import accuracy_score
# Score the TextBlob labels. `polarity_pred` is the Vader score dict of a single
# review, so it cannot be compared with y_true element by element.
accuracy_score(y_true, polarity_pred_blob)

0.69675834191963226

In [61]:
y_true.shape

(7860,)

In [63]:
df_train.keys()

Index(['id', 'review'], dtype='object')

In [68]:
# NOTE(review): cleaned_train is already used by the TextBlob cells above, so
# this load has to move before them for a top-to-bottom run to work.
cleaned_train = pd.read_csv('Data/cleaned_train.csv')

## Vader sentiment

In [73]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [79]:
# Single Vader analyzer instance; get_compound below reads this module-level name.
analyzer = SentimentIntensityAnalyzer()

{'compound': 0.9454, 'neg': 0.061, 'neu': 0.776, 'pos': 0.163}

In [82]:
def get_compound(review):
    """Label a review 1 when Vader's compound score is strictly positive, else 0.

    Arguments:
    review -- string, the text to score

    Returns:
    1 if the compound score is > 0, otherwise 0

    Note: the score comes from the module-level `analyzer`.
    """
    score = analyzer.polarity_scores(review)['compound']
    return int(score > 0)

In [83]:
# Vader-based 0/1 label for every review.
polarity_pred_analyzer = list(map(get_compound, cleaned_train['review']))

In [84]:
# Alias the Vader labels under the y_pred_* name used for scoring.
y_pred_vader = polarity_pred_analyzer

In [85]:
accuracy_score(y_true, y_pred_vader)

0.69081398113656178