In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb
from tensorflow import keras
from sklearn.base import TransformerMixin
import re
import gensim
import nltk
from nltk.corpus import stopwords as nltk_stopwords
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import datetime
import os
np.random.seed(123)

# Ideas
### preprocessing:
...

##### labels:
Create hierarchical labels -> first decide whether a phrase is positive/negative/neutral, and then ...

### models:
* TFIDF + simple model (logreg = 0.61081 on test)
* TFIDF + FC

Embeddings: pre-trained / trained on the training data / pre-trained + fine-tuned
* Doc2Vec + simple model
* Word2Vec + avg/sum + simple model
* FastText + avg/sum + simple model
* GloVe + avg/sum + simple model

* best embedding + LSTM

* ensemble?

In [None]:
# Read the train and test phrase tables (tab-separated, indexed by PhraseId).
train, test = (
    pd.read_csv(os.path.join('raw_data', file_name), sep='\t', index_col='PhraseId')
    for file_name in ('train.tsv', 'test.tsv')
)
train.head()

# EDA

In [None]:
# Print the DataFrame summary for the training data.
train.info()

In [None]:
# Add a whitespace-token count per phrase, then lower-case the phrase text.
phrase_column = train['Phrase']
train['len'] = phrase_column.str.split().str.len()
train['Phrase'] = phrase_column.str.lower()
train.head()

In [None]:
# First two values of the column at position 1 — presumably 'Phrase'; TODO confirm column order.
train.iloc[:2, 1].values

In [None]:
# Summary statistics of the per-phrase word count computed above.
train['len'].describe()

In [None]:
# sns.distplot(train['len'])
# set(train[train['len'] <= 1]['Phrase'])

In [None]:
# Remove, in place, every row whose phrase has at most one word.
train.drop(index=train.index[train['len'] <= 1], inplace=True)

In [None]:
# English stop words from NLTK as a plain list; handed to TfidfVectorizer(stop_words=...) later.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
english_stopwords = list(nltk_stopwords.words('english'))

# Data Cleaning

# Simple TFIDF + Logistic Regression

In [None]:
# Re-seed NumPy's global RNG right before splitting; train_test_split is called
# without random_state, so the seed set here is what makes the split repeatable.
np.random.seed(123)
X = train['Phrase']
y = train['Sentiment']

# Stratify on the label so train and test keep the same sentiment proportions.
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [None]:
np.random.seed(123)

# Baseline: TF-IDF features (English stop words removed) fed into logistic regression.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words=english_stopwords))
# lsa_step = ('LSA', TruncatedSVD(n_components=400))  # optional LSA step, currently disabled
logreg_step = ('logreg', LogisticRegression())
model = Pipeline(steps=[tfidf_step, logreg_step])

model.fit(x_train, y_train)

# Score on the training split and on the held-out split.
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
train_accuracy, test_accuracy

# Simple TFIDF + FC nn

In [None]:
# Concatenate every training phrase into one space-separated string.
all_text = ' '.join(train['Phrase'])

In [None]:
import re
# set(re.findall(r'[^\w\s]', all_text))
# re.sub(r'[^\w\s]', '', all_text)

# Use Pre-trained W2V model + mean word weights + Logistic Regression

In [None]:
class MeanEmbeddingVectorizer(TransformerMixin):
    """Represent each sentence by the mean of its word vectors.

    Parameters
    ----------
    word2vec : object
        Either a trained model exposing its vectors under ``.wv`` or the
        keyed-vectors object itself. It must support ``word in vectors``,
        ``vectors[word]`` and expose ``vector_size``.

    Notes
    -----
    Words missing from the vocabulary contribute an all-zero vector, and a
    sentence that yields no tokens is mapped to an all-zero row, so
    ``transform`` always returns a 2-D array with ``dim`` columns.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Width of one word vector; also the width of the all-zero fallback
        # used for unknown words and for empty sentences.
        self.dim = self._keyed_vectors().vector_size

    def _keyed_vectors(self):
        """Return ``word2vec.wv`` when it exists, otherwise ``word2vec`` itself."""
        # Membership is tested on this object directly instead of through
        # ``.vocab``, which gensim 4 removed.
        return getattr(self.word2vec, 'wv', self.word2vec)

    @staticmethod
    def my_toknizer(sentence):
        """Lower-case ``sentence``, strip non-word/non-space characters and tokenize it.

        Not used by ``get_embedding``; kept for callers that want the same
        normalisation applied to raw text.
        """
        sentence = sentence.lower()
        sentence = re.sub(r'[^\w\s]', '', sentence)
        return nltk.tokenize.word_tokenize(sentence)

    def get_embedding(self, sentence):
        """Return one vector per token of ``sentence`` (zeros for unknown words).

        The result is an empty list when the sentence contains no tokens.
        """
        vectors = self._keyed_vectors()
        return [
            vectors[word] if word in vectors else np.zeros(self.dim)
            for word in nltk.tokenize.word_tokenize(sentence.strip())
        ]

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Map an iterable of sentences to an array of shape ``(n_sentences, dim)``."""
        rows = []
        for sentence in X:
            word_vectors = self.get_embedding(sentence)
            if word_vectors:
                rows.append(np.mean(word_vectors, axis=0))
            else:
                # np.mean([]) would give a NaN scalar and break np.stack.
                rows.append(np.zeros(self.dim))

        if not rows:
            # np.stack rejects an empty list; return an empty matrix instead.
            return np.zeros((0, self.dim))
        return np.stack(rows)

In [None]:
import gensim.downloader as api
# Ask the gensim downloader for 'word2vec-google-news-300'; with return_path=True the
# result is used below as the file path handed to load_word2vec_format.
pretrained_model_path = api.load('word2vec-google-news-300', return_path=True)
pretrained_model_path

In [None]:
# w2v_model = gensim.modelsWord2Vec(x_train.tolist(), size=128, window=5, min_count=1, workers=3)
# w2v_model

In [None]:
# Load the pre-trained vectors from the downloaded file (binary word2vec format).
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

In [None]:
# Display the loaded vectors object.
w2v_model

In [None]:
np.random.seed(123)
# Normalise the phrases: lower-case, then remove every character that is neither
# a word character nor whitespace. regex=True is explicit because pandas >= 2.0
# treats the pattern as a literal string by default.
X = (train['Phrase']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
#     .str.replace('|'.join([r'[\s\b]{}[\s\b]'.format(w) for w in english_stopwords]), '') # stop words
    )

# Drop phrases with fewer than two words. New Series are built rather than
# dropping in place, so the 'Sentiment' column of `train` is never mutated
# and the cell can be re-run safely.
idxs = X[X.str.split().str.len() < 2].index
X = X.drop(idxs)
y = train['Sentiment'].drop(idxs)


x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [None]:
np.random.seed(123)

# Mean pre-trained word vectors per phrase, fed into logistic regression.
embedding_step = ('words2vec', MeanEmbeddingVectorizer(w2v_model))
classifier_step = ('logreg', LogisticRegression())
model = Pipeline(steps=[embedding_step, classifier_step])

model.fit(x_train, y_train)

# Score on the training split and on the held-out split.
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
train_accuracy, test_accuracy

# Use Pre-trained W2V model + mean word weights + FC nn

In [None]:
# def my_toknizer(sentence):
#         sentence = sentence.lower()
#         sentence = re.sub(r'[^\w\s]', '', sentence)
#         return nltk.tokenize.word_tokenize(sentence)

# def get_embedding(word2vec, sentence):
#     embeddings = []
#     for word in my_toknizer(sentence.strip()):
#         if word in word2vec.wv.vocab:
#             word_value = word2vec.wv[word]
#         else:
#             word_value = np.zeros(word2vec.wv.vector_size)

#         embeddings.append(word_value)
    
#     if embeddings:
#         return embeddings
#     else:
#         return [0]*word2vec.wv.vector_size

# for i in X:
#     embs = get_embedding(w2v_model, i)
#     emb = np.mean(embs, axis=0)
#     if emb.shape != (300,):
#         print(i, emb.shape)

# LSTM without embedding

In [None]:
# Vocabulary cap: passed to the Keras Tokenizer as num_words and to RnnBuild as max_words.
max_vocab_word = 100
# Passed to pad_sequences as maxlen; None leaves the padded length up to pad_sequences.
# NOTE(review): train and test are padded in separate calls, so their widths may differ — confirm this is intended.
max_sequence_length = None

In [None]:
# Fit the word-index vocabulary on the training phrases only.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_vocab_word, lower=True, split=' ')
tokenizer.fit_on_texts(x_train)

In [None]:
def _encode_and_pad(texts):
    """Convert texts to integer sequences with the fitted tokenizer and post-pad them."""
    sequences = tokenizer.texts_to_sequences(texts)
    return keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )


_x_train = _encode_and_pad(x_train)
_x_test = _encode_and_pad(x_test)

In [None]:
# Inspect the padded integer sequences built from the test split.
_x_test

In [None]:
def RnnBuild(max_words, embed_dim):
    """Build and compile a bidirectional-LSTM classifier.

    Parameters
    ----------
    max_words : int
        Input dimension (vocabulary size) of the Embedding layer.
    embed_dim : int
        Embedding output size; also the number of units of the wrapped LSTM.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, categorical cross-entropy
        loss and an accuracy metric.
    """
    layers = keras.layers

    clf = keras.models.Sequential()
    clf.add(layers.Embedding(max_words, output_dim=embed_dim))
    clf.add(layers.Bidirectional(layers.LSTM(embed_dim)))
    clf.add(layers.Dense(10, activation='relu'))
    # Five softmax outputs, one per class.
    clf.add(layers.Dense(5, activation='softmax'))

    clf.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return clf

In [None]:
# Wrap the Keras builder in the scikit-learn classifier wrapper so it exposes
# fit/score like the other models; 10% of the training data is held out for the
# validation loss monitored by the callbacks.
rnn_model = Pipeline([
    ('keras', keras.wrappers.scikit_learn.KerasClassifier(
        RnnBuild,
        max_words=max_vocab_word,
        embed_dim=128,
        epochs=10,
        batch_size=256,
        validation_split=0.1,
        callbacks=[
            keras.callbacks.EarlyStopping(patience=5),
            keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, min_lr=0),
        ],
        verbose=1)),
])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
RnnBuild(max_words=max_vocab_word, embed_dim=128).summary()

In [None]:
# Train the LSTM pipeline on the padded training sequences.
rnn_model.fit(_x_train, y_train)

In [None]:
# Score on the training sequences and on the held-out test sequences.
rnn_model.score(_x_train, y_train), rnn_model.score(_x_test, y_test)

# LSTM with the best embedding