In [14]:
import pandas as pd
import re
import numpy as np
import nltk
import operator
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer 
from gensim.models import Phrases,Word2Vec
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from nltk.tokenize import sent_tokenize


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten,SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D,Conv1D,MaxPooling1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers


warnings.filterwarnings('ignore')

stop = set(stopwords.words("english"))



#%conda install -c anaconda gensim
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')

In [2]:
def load_all():
    df = pd.read_csv('./imdb_master.csv',encoding="latin-1")
    df.drop(['Unnamed: 0','file','type'],axis=1,inplace=True)
    df = df[df.label != 'unsup']
    df.label = df.label.map({'pos':1,'neg':0})


    df1 = pd.read_csv('./labeledTrainData.tsv', delimiter='\t')
    df1.drop(['id'],axis=1,inplace=True)
    df1.columns = ['label','review']


    X_train = pd.concat([df, df1]).reset_index(drop=True)
    y_train = X_train.label
    X_train.drop(['label'],axis=1,inplace=True)

    X_test = pd.read_csv('./testData.tsv', delimiter='\t')
    id_test = X_test.id
    X_test.drop(['id'],axis=1,inplace=True)
    
    return X_train,X_test,y_train,id_test


In [3]:
def get_text(text):
    soup = BeautifulSoup(text,'html.parser')
    return soup.get_text()

def rm_special(text):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def lemma(text):
    lemmatizer = WordNetLemmatizer() 
    text = ' '.join([lemmatizer.lemmatize(word).lower() for word in text.split() if word not in stop])
    return text



In [16]:
def prepare():
    X_train['review'] = X_train['review'].apply(get_text).apply(rm_special).apply(lemma)
    X_test['review'] = X_test['review'].apply(get_text).apply(rm_special).apply(lemma)

    return X_train,X_test


In [5]:
def vectorize(X_train,X_test):
    NGRAM_RANGE = (1, 3)
    TOP_K = 20000
    TOKEN_MODE = 'word'
    MIN_DOC_FREQ = 2

    kwargs = {
            'ngram_range' : NGRAM_RANGE,
            'dtype' : 'int32',
            'strip_accents' : 'unicode',
            'decode_error' : 'replace',
            'analyzer' : TOKEN_MODE,
            'min_df' : MIN_DOC_FREQ,
        }
    tv=TfidfVectorizer(**kwargs)
    train_data = tv.fit_transform(X_train['review'])
    test_data = tv.transform(X_test['review'])
    selector = SelectKBest(f_classif, k=min(TOP_K, train_data.shape[1]))
    train_data  = selector.fit_transform(train_data, y_train)
    test_data= selector.transform(test_data)
    return train_data,test_data
#X_train,X_test = vectorize(X_train,X_test)

In [None]:
from sklearn.model_selection import cross_val_score
nb = MultinomialNB()
#cross_val_score(nb,X_train,y_train,cv=5,scoring='accuracy').mean()
nb.fit(X_train,y_train)

## MultiNB - 0.904 

In [14]:
y_pred = nb.predict(X_test)
results = pd.DataFrame({"id": id_test, "sentiment": y_pred})
results.to_csv("submission.csv", index=False) 

## Dense+ Embedding layer - 0.981

In [16]:
X_train,X_test,y_train,id_test = load_all()
X_train,X_test= prepare()
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train['review'])
list_tokenized_train = tokenizer.texts_to_sequences(X_train['review'])
max_len = max([len(s.split()) for s in X_train['review']])
X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
#model.summary()
batch_size = 100
epochs = 3
model.fit(X_t,y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)


## Conv1D + Embedding layer - 0.961

In [22]:
X_train,X_test,y_train,id_test = load_all()
X_train,X_test= prepare()
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train['review'])
list_tokenized_train = tokenizer.texts_to_sequences(X_train['review'])
max_len = max([len(s.split()) for s in X_train['review']])
X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
vocab_size = len(tokenizer.word_index) + 1

In [23]:

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
batch_size = 100
epochs = 3
model.fit(X_t,y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)


Train on 60000 samples, validate on 15000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f3824750890>

In [24]:
X_test_seq = tokenizer.texts_to_sequences(X_test['review'])
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
pred = model.predict(X_test_pad)
y_pred = (pred > 0.5)
y_pred = y_pred.flatten()*1
results = pd.DataFrame({"id": id_test, "sentiment": y_pred})
results.to_csv("submission.csv", index=False)

# LSTM + Embedding layer

In [None]:
X_train,X_test,y_train,id_test = load_all()
X_train,X_test= prepare()
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train['review'])
list_tokenized_train = tokenizer.texts_to_sequences(X_train['review'])
max_len = max([len(s.split()) for s in X_train['review']])
X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
batch_size = 100
epochs = 3
model.fit(X_t,y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)



In [None]:
X_test_seq = tokenizer.texts_to_sequences(X_test['review'])
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
pred = model.predict(X_test_pad)
y_pred = (pred > 0.5)
y_pred = y_pred.flatten()*1
results = pd.DataFrame({"id": id_test, "sentiment": y_pred})
results.to_csv("submission.csv", index=False)

# Conv1D + Glove Embedding

In [7]:

# load embedding as a dict
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename,'r')
    lines = file.readlines()
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding



# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 200))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix



In [None]:
X_train,X_test,y_train,id_test = load_all()
X_train,X_test= prepare()
max_features = 20000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train['review'])
list_tokenized_train = tokenizer.texts_to_sequences(X_train['review'])
max_len = max([len(s.split()) for s in X_train['review']])
X_t = pad_sequences(list_tokenized_train, maxlen=max_len)

raw_embedding = load_embedding('glove.6B.200d.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
vocab_size = len(tokenizer.word_index) + 1
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_vectors], input_length=max_len, trainable=False)


In [60]:

model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
batch_size = 100
epochs = 5
model.fit(X_t,y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)


Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fd042f5ea50>

In [None]:
X_test_seq = tokenizer.texts_to_sequences(X_test['review'])
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
pred = model.predict(X_test_pad)
y_pred = (pred > 0.5)
y_pred = y_pred.flatten()*1
results = pd.DataFrame({"id": id_test, "sentiment": y_pred})
results.to_csv("submission.csv", index=False)

## Word2Vec Embedding - 0.8347 (do poprawy)

In [6]:
df2 = pd.read_csv('./unlabeledTrainData.tsv', delimiter='\t',error_bad_lines=False)
X_train,X_test,y_train,id_test = load_all()
df2.drop(['id'],axis=1,inplace=True)
df2.columns = ['review']
all_rev = pd.concat([X_train,df2])

b'Skipping line 43043: expected 2 fields, saw 3\n'


In [7]:

def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)


In [8]:

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [9]:
sentences = []
for review in all_rev["review"]:
    sentences +=review_to_sentences(review, tokenizer) 


In [10]:
%%time
bigrams = Phrases(sentences=sentences)

CPU times: user 1min 6s, sys: 469 ms, total: 1min 6s
Wall time: 1min 7s


In [11]:
%%time
trigrams = Phrases(sentences=bigrams[sentences])

CPU times: user 2min 46s, sys: 322 ms, total: 2min 46s
Wall time: 2min 46s


In [16]:
%%time
embedding_vector_size = 256
trigrams_model = Word2Vec(
    sentences = trigrams[bigrams[sentences]],
    size = embedding_vector_size,
    min_count=3, window=5, workers=4)
print("Vocabulary size:", len(trigrams_model.wv.vocab))
trigrams_model_name = "256features"
trigrams_model.save(trigrams_model_name)

In [12]:
model1 = Word2Vec.load("./256features")

In [13]:
X_train,X_test,y_train,id_test = load_all()
X_train,X_test= prepare()
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train['review'])
list_tokenized_train = tokenizer.texts_to_sequences(X_train['review'])
max_len = 200
X_t = pad_sequences(list_tokenized_train, maxlen=max_len)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
embedding_matrix = model1.wv.vectors,
model = Sequential()
model.add(Embedding(
    input_dim = embedding_matrix[0].shape[0],
    output_dim = embedding_matrix[0].shape[1],
    input_length = max_len,
    weights = [embedding_matrix[0]],
    trainable=False))
model.add(Bidirectional(LSTM(128, recurrent_dropout=0.1)))
model.add(Dropout(0.25))
model.add(Dense(64))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
#model.summary()
batch_size = 100
epochs = 10
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.fit(X_t,y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)


In [18]:
X_test_seq = tokenizer.texts_to_sequences(X_test['review'])
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
pred = model.predict(X_test_pad)
y_pred = (pred > 0.5)
y_pred = y_pred.flatten()*1
results = pd.DataFrame({"id": id_test, "sentiment": y_pred})
results.to_csv("submission.csv", index=False)