Fuentes:
http://www.offconvex.org/2018/06/17/textembeddings/
https://github.com/dataiku/dataiku-contrib/blob/master/sentence-embedding/python-lib/sentence_embedding_utils.py
http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
import xgboost
from gensim.models import KeyedVectors
#%load_ext autoreload
#%autoreload 0

In [2]:
# Read each split from a headerless CSV, naming the column that is read 'title'.
train = pd.read_csv(
    '../data/train_reliable_norm.csv',
    names=['title'],
    header=None,
)
val = pd.read_csv(
    '../data/val_reliable_norm.csv',
    names=['title'],
    header=None,
)
test = pd.read_csv(
    '../data/test_full_norm.csv',
    names=['title'],
    header=None,
)

In [3]:
# Each raw line is "<label-token> <title words...>".
# The category is the first whitespace-separated token minus its first 9
# characters (presumably a fastText-style "__label__" prefix -- confirm
# against the data files). The title is the remaining tokens re-joined
# with single spaces.
# NOTE(review): this overwrites train['title'], so re-running the cell on
# already-processed data would corrupt both columns.
train['category'] = train['title'].map(lambda line: line.split()[0][9:])
train['title'] = train['title'].map(lambda line: ' '.join(line.split()[1:]))

In [4]:
# Each raw line is "<label-token> <title words...>".
# The category is the first whitespace-separated token minus its first 9
# characters (presumably a fastText-style "__label__" prefix -- confirm
# against the data files). The title is the remaining tokens re-joined
# with single spaces.
# NOTE(review): this overwrites val['title'], so re-running the cell on
# already-processed data would corrupt both columns.
val['category'] = val['title'].map(lambda line: line.split()[0][9:])
val['title'] = val['title'].map(lambda line: ' '.join(line.split()[1:]))

In [13]:
# Classification targets are the category labels, not the title text.
# Both splits must use the same column so that training and validation
# labels are comparable.
y_train = train['category']
y_val = val['category']

In [5]:
# Load word vectors from a word2vec-format file via gensim.
# NOTE(review): `model` is rebound to an EmbeddingModel in a later cell, so
# these vectors become unreachable afterwards unless bound to another name.
model = KeyedVectors.load_word2vec_format('../models/model_full_100.vec')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [1]:
class MeanEmbeddingVectorizer(object):
    """Turn tokenised documents into the plain average of their word vectors.

    Parameters
    ----------
    word2vec : mapping of token -> 1-D numpy array
        Must support ``token in word2vec`` and ``word2vec[token]``. A plain
        ``dict`` works. Objects exposing a ``vector_size`` attribute (such as
        gensim ``KeyedVectors``) have their dimensionality read from it
        instead of by probing an entry.

    Attributes
    ----------
    dim : int
        Length of a single word vector; 0 when the mapping is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Prefer an explicit dimensionality when the container advertises one;
        # otherwise measure the first stored vector.
        vector_size = getattr(word2vec, 'vector_size', None)
        if vector_size is not None:
            self.dim = vector_size
        elif len(word2vec) > 0:
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """No-op; present for scikit-learn estimator compatibility. Returns self."""
        return self

    def transform(self, X):
        """Average the known word vectors of every document in ``X``.

        Parameters
        ----------
        X : iterable of iterables of tokens

        Returns
        -------
        numpy.ndarray
            One row per document. A document with no token present in
            ``word2vec`` maps to a zero vector of length ``dim``.
        """
        fallback = np.zeros(self.dim)
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            rows.append(np.mean(known, axis=0) if known else fallback)
        return np.array(rows)

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    """Turn tokenised documents into an IDF-weighted average of word vectors.

    Parameters
    ----------
    word2vec : mapping of token -> 1-D numpy array
        Must support ``token in word2vec`` and ``word2vec[token]``. A plain
        ``dict`` works. Objects exposing a ``vector_size`` attribute (such as
        gensim ``KeyedVectors``) have their dimensionality read from it
        instead of by probing an entry.

    Attributes
    ----------
    dim : int
        Length of a single word vector; 0 when the mapping is empty.
    word2weight : defaultdict or None
        Token -> IDF weight, populated by ``fit``; ``None`` before fitting.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        # Prefer an explicit dimensionality when the container advertises one;
        # otherwise measure the first stored vector.
        vector_size = getattr(word2vec, 'vector_size', None)
        if vector_size is not None:
            self.dim = vector_size
        elif len(word2vec) > 0:
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Learn per-token IDF weights from the tokenised documents ``X``.

        ``X`` is expected to be already tokenised: the identity analyzer
        passes each document through to the vectorizer unchanged.
        Returns self.
        """
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Average the IDF-weighted word vectors of every document in ``X``.

        Parameters
        ----------
        X : iterable of iterables of tokens

        Returns
        -------
        numpy.ndarray
            One row per document. A document with no token present in
            ``word2vec`` maps to a zero vector of length ``dim``.
        """
        fallback = np.zeros(self.dim)
        rows = []
        for words in X:
            weighted = [self.word2vec[w] * self.word2weight[w]
                        for w in words if w in self.word2vec]
            rows.append(np.mean(weighted, axis=0) if weighted else fallback)
        return np.array(rows)

In [6]:
from sentence_embedding import EmbeddingModel, preprocess_and_compute_sentence_embedding

In [7]:
# Build the embedding model that is passed to
# preprocess_and_compute_sentence_embedding in later cells.
# NOTE(review): this rebinds `model`, which an earlier cell set to gensim
# KeyedVectors.
# NOTE(review): the path is absolute and user-specific, so the notebook is not
# portable; consider a configurable models directory.
# NOTE(review): the meaning of the second positional argument (False) is
# defined in sentence_embedding -- TODO confirm.
model = EmbeddingModel('/home/franco_camporeale/models/emb', False)

[SENTENCE EMBEDDING] 2019-09-25 13:40:46,530 - root - INFO - Loaded 100000 word embeddings
[SENTENCE EMBEDDING] 2019-09-25 13:41:07,074 - root - INFO - Loaded 200000 word embeddings
[SENTENCE EMBEDDING] 2019-09-25 13:41:27,196 - root - INFO - Loaded 300000 word embeddings
[SENTENCE EMBEDDING] 2019-09-25 13:41:47,686 - root - INFO - Loaded 400000 word embeddings
[SENTENCE EMBEDDING] 2019-09-25 13:42:08,288 - root - INFO - Loaded 500000 word embeddings
[SENTENCE EMBEDDING] 2019-09-25 13:42:28,764 - root - INFO - Loaded 600000 word embeddings
[SENTENCE EMBEDDING] 2019-09-25 13:42:47,023 - root - INFO - Successfully loaded 689041 word embeddings!


In [8]:
# Plain Python list of the training titles, used as input for embedding.
texts = train["title"].tolist()

In [9]:
# Embed the first 500000 training titles; %time reports the cost of the call.
# NOTE(review): 'SIF', 0.1 and 1 are passed positionally -- presumably the
# method name, its smoothing parameter and the number of principal components
# to remove; confirm against sentence_embedding.
%time result = preprocess_and_compute_sentence_embedding(texts[:500000],model, 'SIF', 0.1, 1)

[SENTENCE EMBEDDING] 2019-09-25 13:44:57,136 - root - INFO - Computing weighted average embeddings...
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
[SENTENCE EMBEDDING] 2019-09-25 13:45:20,646 - root - INFO - Removing vectors first principal component...
[SENTENCE EMBEDDING] 2019-09-25 13:45:22,927 - root - INFO - (499997, 300)


CPU times: user 41.9 s, sys: 8.17 s, total: 50.1 s
Wall time: 36.1 s


In [10]:
# Print the position of every entry that came back as a bare float
# rather than an embedding array.
for idx in (pos for pos, emb in enumerate(result) if isinstance(emb, float)):
    print(idx)

74828
212444
300903


In [11]:
# Three sentences came back as a single float 0 instead of a length-300 array;
# replace each such entry with a zero vector.
for idx in range(len(result)):
    if isinstance(result[idx], float):
        result[idx] = np.zeros(300)

In [14]:
# Keep the first 500000 entries of y_train so they line up with the rows
# embedded from texts[:500000].
y = y_train.iloc[:500000]

In [19]:
# Linear classifier with logistic loss, trained by stochastic gradient descent.
# random_state pins the data shuffling so reruns produce the same model.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
# update the string when upgrading.
lr = SGDClassifier(loss='log', n_jobs=8, random_state=42)

In [None]:
# Fit the classifier on the training sentence embeddings and their targets;
# %time reports how long training takes.
%time lr.fit(result, y)

In [None]:
# Persist the fitted classifier to disk with joblib.
# NOTE(review): `load` is imported but not used in the cells shown here.
from joblib import load, dump
dump(lr, '../models/lr_first_test')


In [None]:
# Embed the validation titles with the same arguments used for the training set.
texts_val = val['title'].tolist()
result_validation = preprocess_and_compute_sentence_embedding(
    texts_val,
    model,
    'SIF',
    0.1,
    1,
)

In [54]:
# Build the validation feature matrix from the validation embeddings.
# On the training set a few sentences came back as a bare float instead of a
# length-300 array; presumably the same can happen here, so such entries are
# replaced with zero vectors to keep the matrix rectangular.
x_val = np.array([
    np.zeros(300) if isinstance(emb, float) else emb
    for emb in result_validation
])
val_pred = lr.predict(x_val)

In [78]:
# Score the validation predictions against the true categories.
# Balanced accuracy averages recall over classes, so rare categories count
# as much as frequent ones.
print("Balanced Accuracy Score: %.2f" % balanced_accuracy_score(y_val, val_pred))

In [74]:
# Sanity check: shape of the first training sentence embedding.
result[0].shape

(300,)