In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import datetime
import os
np.random.seed(123)

# Ideas
### preprocessing:
...

##### labels:
create hierarchical labels ->  first is it positive/negative/neutral and then  ...

### models:
* TFIDF + simple model (logreg = 0.61081 on test)

embedding - pre trained / trained on train data / pre trained + fine tuned
* Word2Vec + avg/sum + simple model
* FastText + avg/sum + simple model
* Doc2Vec + avg/sum + simple model
* Glove + avg/sum + simple model

* best embedding + LSTM

* ensemble?

In [226]:
train = pd.read_csv(os.path.join('raw_data', 'train.tsv'), sep='\t', index_col='PhraseId')
test = pd.read_csv(os.path.join('raw_data', 'test.tsv'), sep='\t', index_col='PhraseId')
train.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [235]:
train['len'] = train['Phrase'].str.split().str.len()
train['Phrase'] = train['Phrase'].str.lower()
train.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,len
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,a series of escapades demonstrating the adage ...,1,37
2,1,a series of escapades demonstrating the adage ...,2,14
3,1,a series,2,2
4,1,a,2,1
5,1,series,2,1


In [256]:
train.iloc[:2, 1].values

array(['a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
       'a series of escapades demonstrating the adage that what is good for the goose'],
      dtype=object)

In [209]:
train['len'].describe()

count    156060.000000
mean          7.203364
std           7.024604
min           0.000000
25%           2.000000
50%           5.000000
75%          10.000000
max          52.000000
Name: len, dtype: float64

In [227]:
# sns.distplot(train['len'])
# set(train[train['len'] <= 1]['Phrase'])

In [241]:
import nltk
from nltk.corpus import stopwords
english_stopwords = list(stopwords.words('english'))

In [229]:
np.random.seed(123)
X = train['Phrase'] #.str.split()
y = train['Sentiment']

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [234]:
np.random.seed(123)

model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=english_stopwords)),
    ('logreg',LogisticRegression())
])

model.fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)



(0.6615746080567303, 0.6216327053697296)

In [156]:
all_text = ' '.join(train['Phrase'])

In [163]:
import re
# set(re.findall(r'[^\w\s]', all_text))
# re.sub(r'[^\w\s]', '', all_text)

In [260]:
np.random.seed(123)
X = (train['Phrase']
    .str.lower()
    .str.replace(r'[^\w\s]', '')
    )
#     .str.split())

y = train['Sentiment']

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [222]:
from sklearn.base import TransformerMixin

class MeanEmbeddingVectorizer(TransformerMixin):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = word2vec.wv.vector_size

    def get_embedding(words):
        embeddings = []
        for w in words:
            if w in self.word2vec.wv.vocab:
                 word_value = self.word2vec.wv[w]
            else:
                word_value = np.zeros(self.dim)
                
            embeddings.append(word_value)
        return embeddings
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        
        return np.array([
            np.mean(embeddings, axis=0)
            for words in X
        ])

In [223]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(x_train.tolist(), size=128, window=5, min_count=1, workers=3)
w2v_model

<gensim.models.word2vec.Word2Vec at 0x7fd9101188d0>

In [224]:
np.random.seed(123)

model = Pipeline([
    ('words2vec', MeanEmbeddingVectorizer(w2v_model)),
    ('logreg', LogisticRegression())
])

model.fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)



(0.4823595741834375, 0.4823691302104237)

In [259]:
from tensorflow import keras
# keras.preprocessing.text.Tokenizer()
# keras.preprocessing.sequence.pad_sequences()

In [272]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words=50, lower=True,split=' ')
tokenizer.fit_on_texts(x_train.values)

In [273]:
X = tokenizer.texts_to_sequences(x_train.values)
X = keras.preprocessing.sequence.pad_sequences(X)

In [275]:
X.shape

(117045, 25)