In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

#%matplotlib notebook

import sklearn
import matplotlib as mpl
import seaborn as sns

import re

%matplotlib inline

pd.options.display.max_rows = 10

In [2]:
# Dataset

# use this if in fmi-hw... repo
# train = pd.read_csv("data/spooky/train.zip", index_col=['id'])
# test = pd.read_csv("data/spooky/test.zip", index_col=['id'])
# sample_submission = pd.read_csv("data/spooky/sample_submission.zip", index_col=['id'])

train = pd.read_csv("data/train.zip", index_col=['id'])
test = pd.read_csv("data/test.zip", index_col=['id'])
sample_submission = pd.read_csv("data/sample_submission.zip", index_col=['id'])


print(train.shape, test.shape, sample_submission.shape)
print(set(train.columns) - set(test.columns))

(19579, 2) (8392, 1) (8392, 3)
{'author'}


# ~~Първо - baseline модел~~

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import nltk
# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [4]:
explore = train.copy()
stem = PorterStemmer()
explore['stemmed'] = explore.text.apply(lambda t: " ".join([stem.stem(w) for w in t.split()])) 
explore[['stemmed', 'text']].head()

Unnamed: 0_level_0,stemmed,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"thi process, however, afford me no mean of asc...","This process, however, afforded me no means of..."
id17569,It never onc occur to me that the fumbl might ...,It never once occurred to me that the fumbling...
id11008,"In hi left hand wa a gold snuff box, from whic...","In his left hand was a gold snuff box, from wh..."
id27763,how love is spring As we look from windsor ter...,How lovely is spring As we looked from Windsor...
id12958,"find noth else, not even gold, the superintend...","Finding nothing else, not even gold, the Super..."


# ~~Допълнителните фичъри не сработиха, стеминга също.~~

Остават да пробвам:

* ~~Оптимизиране на модела с CountVectorizer.~~
* Добавяне на още фичъри, от латентни пространства (LDA) - topic modeling.
* Word embeddings с невронни мрежи.
* Стакинг на класификатори.

За сега ще разгледаме само оптимизирането на модела.

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB

from gensim import corpora

documents = train.text

The entire processing of the texts has to be automated via pipeline interface - tokenization, create dict, doc2bow/text2bow, ...

In [6]:
import pickle
from pathlib import Path
from pprint import pprint  # pretty-printer

TOKENIZED_FILENAME = "processed_texts.pickle"

try:
    # depicke
    texts = pickle.load(open(TOKENIZED_FILENAME, "rb"))   
except FileNotFoundError:
    # remove common words and tokenize
    # stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stopwords] #was stoplist
             for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    
    # create pickle
    pickle.dump(texts, open(TOKENIZED_FILENAME, "wb" ))

pprint(texts[:2])    

[['process,',
  'however,',
  'afforded',
  'means',
  'ascertaining',
  'dimensions',
  'might',
  'make',
  'return',
  'point',
  'whence',
  'set',
  'out,',
  'without',
  'aware',
  'perfectly',
  'uniform',
  'seemed',
  'wall.'],
 ['never', 'occurred', 'fumbling', 'might', 'mere', 'mistake.']]


In [7]:
DICT_FILENAME = 'spooky.dict'

try:
    dictionary = corpora.Dictionary.load(DICT_FILENAME)
except FileNotFoundError:
    dictionary = corpora.Dictionary(texts)
    dictionary.save(DICT_FILENAME)  # store the dictionary, for future reference

print(len(dictionary))

22328


In [8]:
CORPUS_FILENAME = 'spooky.mm'

try:
    #restore from disk
    corpus = corpora.mmcorpus.MmCorpus(CORPUS_FILENAME)
except FileNotFoundError:
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('spooky.mm', corpus)  # store to disk, for later use

print(corpus[156])

[(545, 1.0), (1137, 1.0), (1598, 1.0), (1599, 1.0), (1600, 1.0), (1601, 1.0)]


In [9]:
print('in df:     ', train.text[0],'\n')
print('as doc:    ', documents[0],'\n')
print('as tokens: ', texts[0],'\n')
print('as vec:    ', corpus[0],'\n')

in df:      This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall. 

as doc:     This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall. 

as tokens:  ['process,', 'however,', 'afforded', 'means', 'ascertaining', 'dimensions', 'might', 'make', 'return', 'point', 'whence', 'set', 'out,', 'without', 'aware', 'perfectly', 'uniform', 'seemed', 'wall.'] 

as vec:     [(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0)] 



In [13]:
from gensim import sklearn_api

In [14]:
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
#     ('clf', RandomForestClassifier())
])


In [15]:
%%time
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))

  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)


[-58670.90445705 -64169.14896674 -65322.97842153]
CPU times: user 321 ms, sys: 154 ms, total: 474 ms
Wall time: 2min 39s


running only Text2BowTransformer()) + LdaTransformer()) gives

[-58670.90445705 -64169.14896674 -65322.97842153]<br>
CPU times: user 321 ms, sys: 154 ms, total: 474 ms<br>
Wall time: 2min 39s

So the tricky part is the outpurt given by the LDA, and how is that used by the subsequent Pipes. The [docs](https://radimrehurek.com/gensim/sklearn_api/ldamodel.html) state:

*Returns a matrix of topic distribution for the given document bow, where a_ij indicates (topic_i, topic_probability_j). The input docs should be in BOW format and can be a list of documents like [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]]*

It's totally unclear what a LogisticRegression will pick from a "matrix of topic distribution". Let's try to see it.

In [None]:
bows = sklearn_api.text2bow.Text2BowTransformer().fit_transform(train.text)

In [23]:
print(train.text[656], bows[656])

Dropping of its own accord upon his exit or perhaps purposely closed, it had become fastened by the spring; and it was the retention of this spring which had been mistaken by the police for that of the nail, farther inquiry being thus considered unnecessary. [(3, 1), (7, 1), (13, 1), (20, 3), (29, 4), (44, 1), (60, 1), (70, 1), (71, 1), (78, 2), (95, 2), (133, 1), (143, 2), (174, 1), (178, 1), (201, 2), (220, 1), (321, 1), (323, 1), (352, 1), (403, 1), (542, 1), (1485, 1), (1779, 1), (1798, 1), (1873, 1), (2219, 1), (2889, 1), (4672, 1), (4673, 1), (4674, 1), (4675, 1), (4676, 1), (4677, 1), (4678, 1), (4679, 1)]


In [24]:
lda = sklearn_api.ldamodel.LdaTransformer().fit_transform(bows)

  diff = np.log(self.expElogbeta)


In [25]:
print(train.text[656], bows[656], lda[bows[656]])

Dropping of its own accord upon his exit or perhaps purposely closed, it had become fastened by the spring; and it was the retention of this spring which had been mistaken by the police for that of the nail, farther inquiry being thus considered unnecessary. [(3, 1), (7, 1), (13, 1), (20, 3), (29, 4), (44, 1), (60, 1), (70, 1), (71, 1), (78, 2), (95, 2), (133, 1), (143, 2), (174, 1), (178, 1), (201, 2), (220, 1), (321, 1), (323, 1), (352, 1), (403, 1), (542, 1), (1485, 1), (1779, 1), (1798, 1), (1873, 1), (2219, 1), (2889, 1), (4672, 1), (4673, 1), (4674, 1), (4675, 1), (4676, 1), (4677, 1), (4678, 1), (4679, 1)] [[[ 0.          0.03539791  0.         ...,  0.          0.          0.        ]
  [ 0.          0.          0.         ...,  0.          0.          0.10721884]]

 [[ 0.          0.55708617  0.         ...,  0.          0.          0.        ]
  [ 0.          0.          0.         ...,  0.          0.          0.10721884]]

 [[ 0.          0.          0.         ...,  0.    

In [26]:
lda

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.10721884],
       [ 0.09975109,  0.02642765,  0.        , ...,  0.        ,
         0.03326323,  0.        ],
       ..., 
       [ 0.20021059,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.07886007],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [16]:
fail here

for now

SyntaxError: invalid syntax (<ipython-input-16-b80ac4c78243>, line 1)

[ 0.48360907  0.48850751  0.48613027]<br>
[-2.1033608  -2.09791398 -2.30065565]

Преди да пообработим текста да видим Logistic Regression

In [None]:
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
    ('clf', LogisticRegression())
])


In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))

[ 0.53722426  0.53463071  0.53563218]<br>
CPU times: user 430 ms, sys: 169 ms, total: 599 ms<br>
Wall time: 3min 41s

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.96678405 -0.97080789 -0.97363342]<br>
CPU times: user 453 ms, sys: 121 ms, total: 573 ms<br>
Wall time: 3min 39s

Още един опит преди да окастрим текста

In [None]:
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(num_topics=21)),
    ('clf', LogisticRegression())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.98861154 -0.97565475 -0.99821168]<br>
CPU times: user 381 ms, sys: 99.9 ms, total: 481 ms<br>
Wall time: 2min 8s

In [None]:
explore.stemmed

In [None]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords

# CUSTOM_FILTERS = [strip_punctuation, remove_stopwords]
# explore['stemmed2'] = [ preprocess_string(s, CUSTOM_FILTERS) for s in explore.stemmed]
# explore.stemmed2

explore['stemmed2'] = [ strip_punctuation(remove_stopwords(s.lower())) for s in explore.stemmed]
explore.stemmed2

In [None]:
print(explore.loc['id17569'].stemmed, '|||', explore.loc['id17569'].stemmed2)

Pipeline-a си е същия

In [None]:
# %%time
# print(cross_val_score(pipeline, explore.stemmed2, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.05550683 -1.02313271 -1.02899542]<br>
CPU times: user 347 ms, sys: 92.8 ms, total: 440 ms<br>
Wall time: 1min 27s

Поне е по-бързо. Да видим tfidf и повече фичъри.

In [None]:
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(num_topics=501)),
    ('clf', LogisticRegression())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.94941507 -0.95248997 -0.95026712]<br>
CPU times: user 1.18 s, sys: 223 ms, total: 1.4 s<br>
Wall time: 19min 4s

This is frustrating

In [None]:
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
    ('clf', LogisticRegression())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.03061917 -1.03984701 -1.03884094]<br>
CPU times: user 387 ms, sys: 124 ms, total: 511 ms<br>
Wall time: 2min 33s

In [None]:
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
    ('clf', MultinomialNB())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.06731359 -1.07019065 -1.07019197]<br>
CPU times: user 377 ms, sys: 142 ms, total: 519 ms<br>
Wall time: 2min 29s

От къде е тая разлика? Може ли text2bow да чупи? Или е от липсата на хипер парамерти?

In [None]:
print(corpus)

corpus е минат с word2vec; ще прескоча tfidf за момента и ще пусна LdaModel

In [None]:
from gensim.models.ldamodel import LdaModel

# lda = LdaModel(corpus=corpus, id2word=dictionary)
# lda.print_topic(0)

In [None]:
# id2word=dict([(i, s) for i, s in enumerate(dictionary)])
# print([ id2word[z] for z in range(10) ])
# print(list(id2word.items())[:10])
# what = list(enumerate(dictionary))
# print(what[:10])
# print(list(dictionary)[:10])
# print(type(dictionary))
# wdk = dictionary.keys()
# print(wdk[:10])
# print(dictionary[wdk[0]])

Горното даваше topic с референции към IDта вместо към реални думи - не знам защо сега работи

In [None]:
%%time
lda = LdaModel(corpus=corpus, id2word=dict(dictionary.items()))

Представянето на topic:

In [None]:
lda.print_topic(0)

In [None]:
lda.show_topics(num_topics=10, num_words=10)

Представянето на запис в латентното пространство. Векторите са 100, но Gensim показва само най-значимите:

In [None]:
lda[corpus[0]]

In [None]:
lda.get_document_topics(corpus[1], minimum_probability=0.00001)

In [None]:
print(len(corpus), len(train.author))

apply logistic regression, but first encode topics in dataframe

In [None]:
latently = train.copy()
for t in range(100):
    latently[t]=0

print(latently[list(range(100))])

In [None]:
data = []
for i, doc in enumerate(corpus):
    temp = [0]*100
    for t in lda[doc]:
        temp[t[0]] = t[1]
    data.append(temp)

In [None]:
for t in range(100):
    latently[t] = [ data[z][t] for z in range(len(data)) ]

print(latently[list(range(100))])

In [None]:
AUTHOR_ENCODING = {'EAP': 0, 'MWS': 1, 'HPL': 2}
latently['encoded_author'] = [ AUTHOR_ENCODING[z] for z in train.author ]

In [None]:
latently.shape

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

X_train, X_test, y_train, y_test = train_test_split(latently[list(range(100))], latently.encoded_author, random_state=42)

In [None]:
logreg = LogisticRegression(C=0.01)

In [None]:
%%time
logreg.fit(X_train, y_train)

In [None]:
print("Train score: {:.2f}".format(logreg.score(X_train, y_train)))
print("Test  score: {:.2f}".format(logreg.score(X_test, y_test)))
prediction = logreg.predict_proba(X_test)
logloss = log_loss(y_test, prediction, labels=[0, 1, 2])
print('Log Loss score', logloss)

In [None]:
print(prediction[:10])

In [None]:
print(X_test[:10])

In [None]:
print([ list(z).index(max(z)) for z in prediction[:10] ], y_test[:10])

Maybe we have way too many topics for this to work, let's see what the efect of only 3 topics is. I'll use the manual approach instead of Pipleline, since I don't know how data conversion is being handled.

In [None]:
%%time
lda3 = LdaModel(corpus=corpus, num_topics=3, id2word=dict(dictionary.items()))

In [None]:
late3 = train.copy()

data = []
for i, doc in enumerate(corpus):
    temp = [0, 0, 0]
    for t in lda3[doc]:
        temp[t[0]] = t[1]
    data.append(temp)

late3[0] = [ data[z][0] for z in range(len(data)) ]
late3[1] = [ data[z][1] for z in range(len(data)) ]
late3[2] = [ data[z][2] for z in range(len(data)) ]

print(late3)

In [None]:
late3['encoded_author'] = [ AUTHOR_ENCODING[z] for z in train.author ]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(late3[[0,1,2]], late3.encoded_author, random_state=42)

In [None]:
%%time
logreg.fit(X_train, y_train)

In [None]:
print("Train score: {:.2f}".format(logreg.score(X_train, y_train)))
print("Test  score: {:.2f}".format(logreg.score(X_test, y_test)))
prediction = logreg.predict_proba(X_test)
logloss = log_loss(y_test, prediction, labels=[0, 1, 2])
print('Log Loss score', logloss)

What if we dont use the LDA on it's own, but as extra features on top of the CountVectorizer? 

In [None]:
vectorizer = CountVectorizer(ngram_range=(1,2))

In [None]:
vectorized = vectorizer.fit_transform(train.text).todense()

Will use approach from http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html in another NB

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

In [None]:
fail here

In [None]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Dropout, Lambda, Concatenate
from keras.optimizers import RMSprop, Adam
from keras import regularizers, objectives, metrics
from keras.models import Model
from keras import backend as K

from IPython.display import display

# Откри приблизително същите параметри, но не успя да стигне напълно до същия резултат.

Ще използвам следния модел:

TfIdf + MultinomialNB, без стеминг на текста.

Mean validation score: -0.423 (std: 0.003)

Ще ползвам и следните параметри:

Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}


Последна проверка на този модел за `LogLoss` и `Accuracy`

In [None]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', MultinomialNB(alpha=0.01))
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

# Трениране на модел и събмит

Първо да видим в какъв формат трябва да се подадат резултатите за тест

In [None]:
sample_submission = pd.read_csv("data/spooky-authors/sample_submission.zip")
sample_submission.head()

In [None]:
pipeline = pipeline.fit(train.text, train.author)

In [None]:
print(pipeline.predict_proba(test[:10].text))

In [None]:
test_predictions = pipeline.predict_proba(test.text)

In [None]:
print(pipeline.classes_)

In [None]:
submit_file = pd.DataFrame(test_predictions, columns=['EAP', 'MWS', 'HPL'], index=test.index)
submit_file.head(10)

In [None]:
submit_file.to_csv("data/spooky-authors/submit_Tfidf_MNB_text.csv")

Очакванията за събмита са да имаме скор някъде около 0.41 - 0.42.

Може да е малко по-добър защото при крос-валидацията тренирахме на 13к и тествахме 6к.

Сега трейн сета е целия: 19.5к

![submit-result.png](attachment:submit-result.png)

In [None]:
# Да хакнем ранкинга в кагъл?

print(test.text[:5].values)