In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

#%matplotlib notebook

import sklearn
import matplotlib as mpl
import seaborn as sns

import re

%matplotlib inline

pd.options.display.max_rows = 10

In [2]:
# Dataset

# use this if in fmi-hw... repo
# train = pd.read_csv("data/spooky/train.zip", index_col=['id'])
# test = pd.read_csv("data/spooky/test.zip", index_col=['id'])
# sample_submission = pd.read_csv("data/spooky/sample_submission.zip", index_col=['id'])

train = pd.read_csv("data/train.zip", index_col=['id'])
test = pd.read_csv("data/test.zip", index_col=['id'])
sample_submission = pd.read_csv("data/sample_submission.zip", index_col=['id'])


print(train.shape, test.shape, sample_submission.shape)
print(set(train.columns) - set(test.columns))

(19579, 2) (8392, 1) (8392, 3)
{'author'}


In [3]:
train.head(5)

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


# Идеи за фичъри:
    
* ~~CountVectorizer, Tfidf~~
* ~~Preprocessing - stop words, lematization~~
* ~~Други фичъри - бр. думи , бр. стоп думи, бр. пунктуация, бр. ГЛАВНИ букви и т.н.~~
* Намиране на общи теми чрез LDA
* Word Embeddings с невронни мрежи


# Първо - baseline модел

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

In [5]:
pipeline = Pipeline([
    ('features', CountVectorizer()),
    ('clf', LinearSVC())
])

cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3)

array([ 0.78783701,  0.79635305,  0.79509579])

Да пробваме с RF

In [6]:
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('features', CountVectorizer()),
    ('clf', RandomForestClassifier())
])

# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[ 0.62300858  0.61798958  0.60551724]

[-1.36418837 -1.38716468 -1.43783028]

In [7]:
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('features', CountVectorizer()),
    ('clf', LogisticRegression())
])

# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
#                       scoring='neg_log_loss'))

# Получихме малко по-добри резултати

[ 0.81449142  0.81673307  0.81348659]

[-0.47678328 -0.47558895 -0.47131481]

Следващото е за да имаме "explore" датасет

In [8]:
explore = train.copy()

In [9]:
import nltk
# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

stem = PorterStemmer()

explore['stemmed'] = explore.text.apply(lambda t: " ".join([stem.stem(w) for w in t.split()])) 

explore[['stemmed', 'text']].head()

Unnamed: 0_level_0,stemmed,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"thi process, however, afford me no mean of asc...","This process, however, afforded me no means of..."
id17569,It never onc occur to me that the fumbl might ...,It never once occurred to me that the fumbling...
id11008,"In hi left hand wa a gold snuff box, from whic...","In his left hand was a gold snuff box, from wh..."
id27763,how love is spring As we look from windsor ter...,How lovely is spring As we looked from Windsor...
id12958,"find noth else, not even gold, the superintend...","Finding nothing else, not even gold, the Super..."


# Допълнителните фичъри не сработиха, стеминга също. 

Остават да пробвам:

* ~~Оптимизиране на модела с CountVectorizer.~~
* Добавяне на още фичъри, от латентни пространства (LDA) - topic modeling.
* Word embeddings с невронни мрежи.
* Стакинг на класификатори.

За сега ще разгледаме само оптимизирането на модела.

Първо да опишем параметрите за търсене в трансфомацията (CountVectorizer)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

params_count_word = {"features__ngram_range": [(1,1), (1,2), (1,3)],
                      "features__analyzer": ['word'],
                      "features__max_df":[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
                      "features__min_df":[2, 3, 5, 10],
                      "features__lowercase": [False, True],
                      "features__stop_words": [None, stopwords]}

params_count_char = {"features__ngram_range": [(1,4), (1,5), (1,6)],
                      "features__analyzer": ['char'],
                      "features__max_df":[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
                      "features__min_df":[2, 3, 5, 10],
                      "features__lowercase": [False, True],
                      "features__stop_words": [None, stopwords]}

In [12]:
def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [13]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB

In [14]:
def random_search():
    params = {
        "clf__alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.3]
    }

    params.update(params_count_word)

    pipeline = Pipeline([
        ('features', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ])

    random_search = RandomizedSearchCV(pipeline, param_distributions=params, 
                                       scoring='neg_log_loss',
                                       n_iter=20, cv=3, n_jobs=4)

    random_search.fit(explore.stemmed, train.author)
    report(random_search.cv_results_)
    
# random_search()  # -0.423

Model with rank: 1
Mean validation score: -0.438 (std: 0.002)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.6, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 2
Mean validation score: -0.443 (std: 0.004)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 3, 'features__max_df': 0.6, 'features__lowercase': True, 'features__analyzer': 'word', 'clf__alpha': 0.05}

Model with rank: 3
Mean validation score: -0.453 (std: 0.002)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 2, 'features__max_df': 1.0, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 4
Mean validation score: -0.471 (std: 0.003)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 5, 'features__max_df': 1.0, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}

Model with rank: 5
Mean validation score: -0.472 (std: 0.004)
Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 3), 'features__min_df': 5, 'features__max_df': 0.5, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.05}


In [15]:
# import keras
# from keras.datasets import mnist
# from keras.models import Sequential
# from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Dropout, Lambda, Concatenate
# from keras.optimizers import RMSprop, Adam
# from keras import regularizers, objectives, metrics
# from keras.models import Model
# from keras import backend as K

# from IPython.display import display

In [16]:
from gensim import corpora

documents = train.text

In [17]:
# # remove common words and tokenize
# # stoplist = set('for a of the and to in'.split())
# texts = [[word for word in document.lower().split() if word not in stopwords] #was stoplist
#          for document in documents]

# # remove words that appear only once
# from collections import defaultdict
# frequency = defaultdict(int)
# for text in texts:
#     for token in text:
#         frequency[token] += 1

# texts = [[token for token in text if frequency[token] > 1] for text in texts]

# from pprint import pprint  # pretty-printer
# pprint(texts[:2])

In [18]:
import pickle
from pathlib import Path
from pprint import pprint  # pretty-printer

TOKENIZED_FILENAME = "processed_texts.pickle"

try:
    # depicke
    texts = pickle.load(open(TOKENIZED_FILENAME, "rb"))   
except FileNotFoundError:
    # remove common words and tokenize
    # stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stopwords] #was stoplist
             for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1] for text in texts]

    from pprint import pprint  # pretty-printer
    
    # create pickle
    pickle.dump(texts, open(TOKENIZED_FILENAME, "wb" ))

pprint(texts[:2])    

[['process,',
  'however,',
  'afforded',
  'means',
  'ascertaining',
  'dimensions',
  'might',
  'make',
  'return',
  'point',
  'whence',
  'set',
  'out,',
  'without',
  'aware',
  'perfectly',
  'uniform',
  'seemed',
  'wall.'],
 ['never', 'occurred', 'fumbling', 'might', 'mere', 'mistake.']]


In [19]:
DICT_FILENAME = 'spooky.dict'

try:
    dictionary = corpora.Dictionary.load(DICT_FILENAME)
except FileNotFoundError:
    dictionary = corpora.Dictionary(texts)
    dictionary.save(DICT_FILENAME)  # store the dictionary, for future reference

print(len(dictionary))

22328


In [20]:
CORPUS_FILENAME = 'spooky.mm'

try:
    #restore from disk
    corpus = corpora.mmcorpus.MmCorpus(CORPUS_FILENAME)
except FileNotFoundError:
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('spooky.mm', corpus)  # store to disk, for later use

print(corpus[156])

[(545, 1.0), (1137, 1.0), (1598, 1.0), (1599, 1.0), (1600, 1.0), (1601, 1.0)]


In [21]:
print('in df:     ', train.text[0],'\n')
print('as doc:    ', documents[0],'\n')
print('as tokens: ', texts[0],'\n')
print('as vec:    ', corpus[0],'\n')

in df:      This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall. 

as doc:     This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall. 

as tokens:  ['process,', 'however,', 'afforded', 'means', 'ascertaining', 'dimensions', 'might', 'make', 'return', 'point', 'whence', 'set', 'out,', 'without', 'aware', 'perfectly', 'uniform', 'seemed', 'wall.'] 

as vec:     [(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0)] 



## similarities.docsim.Similarity

In [22]:
from gensim import similarities

index = similarities.docsim.Similarity('/home/bob/tmp', corpus, num_features=len(dictionary)) 
sims = index[corpus[0]]

In [23]:
from operator import itemgetter

results = list(enumerate(sims))
top = sorted(results, key=itemgetter(1), reverse=True)
top[:5]

[(0, 0.99999982),
 (14771, 0.260133),
 (8578, 0.22941573),
 (12208, 0.22941573),
 (12524, 0.22941573)]

In [24]:
print('Comparing to document: ', documents[0], '\n\n')
for sim in top[:10]:
    print(sim[1], '|', train.author[sim[0]], '|', documents[sim[0]],'\n')

Comparing to document:  This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall. 


1.0 | EAP | This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall. 

0.260133 | EAP | The expression of his smile, however, was by no means unpleasing, as might be supposed; but it had no variation whatever. 

0.229416 | EAP | Much, however, might be ascertained. 

0.229416 | EAP | I reapproached the wall. 

0.229416 | HPL | They seemed to hate and fear him at the same time, and he seemed to return these sentiments. 

0.229416 | EAP | There was nothing in this, however, to make him sob. 

0.216295 | EAP | At first he stared at me as if he found it impossible

Чисто технически може да се направи някакво осредняване на резултати - например резултатите от сравнение към documents[0] и сравнение към documents[2] - като и двата записа са "EAP". Но не ми се вижда правилно.

Не мисля, че посоката е подходяща за нашия случай, защото сравненията са директни, вместо да се извлича "пространство" което да генерализира стила/автора. По-добре да проверим какво друго има в gensim.

In [25]:
from gensim import sklearn_api

from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('clf', RandomForestClassifier())
])

try:
    print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
    print(cross_val_score(pipeline, train.text, train.author, 
                    cv=3, n_jobs=3, scoring='neg_log_loss'))
except ValueError as e:
#     print(e)
    pass

ValueError: setting an array element with a sequence.

In [26]:
something = sklearn_api.text2bow.Text2BowTransformer()
print(type(something))
other = something.fit_transform(train.text)
print(type(other))
print(train.text[0], other[0], '\n', train.text[656], other[656])

<class 'gensim.sklearn_api.text2bow.Text2BowTransformer'>
<class 'list'>
This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall. [(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 3), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 4), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)] 
 Dropping of its own accord upon his exit or perhaps purposely closed, it had become fastened by the spring; and it was the retention of this spring which had been mistaken by the police for that of the nail, farther inquiry being thus considered unnecessary. [(3, 1), (7, 1), (13, 1), (20, 3), (29, 4), (44, 1), (60, 1), (70, 1), (71, 1), (78, 2), (95, 2), (133, 1), (143, 2), (174, 1), 

How is this supposed to work?!?

`tfidf.fit_transform(corpus)` returns matrix, while the `sklearn_api.text2bow.Text2BowTransformer().fit_transform(corpus)` returns some "BOW" format

OK - this might work

In [27]:
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
    ('clf', RandomForestClassifier())
])

# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[ 0.48360907  0.48850751  0.48613027]<br>
[-2.1033608  -2.09791398 -2.30065565]

Преди да пообработим текста да видим Logistic Regression

In [28]:
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
    ('clf', LogisticRegression())
])


In [29]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))

[ 0.53722426  0.53463071  0.53563218]<br>
CPU times: user 430 ms, sys: 169 ms, total: 599 ms<br>
Wall time: 3min 41s

In [30]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.96678405 -0.97080789 -0.97363342]<br>
CPU times: user 453 ms, sys: 121 ms, total: 573 ms<br>
Wall time: 3min 39s

Още един опит преди да окастрим текста

In [31]:
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(num_topics=21)),
    ('clf', LogisticRegression())
])

In [32]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.98861154 -0.97565475 -0.99821168]<br>
CPU times: user 381 ms, sys: 99.9 ms, total: 481 ms<br>
Wall time: 2min 8s

In [33]:
explore.stemmed

id
id26305    thi process, however, afford me no mean of asc...
id17569    It never onc occur to me that the fumbl might ...
id11008    In hi left hand wa a gold snuff box, from whic...
id27763    how love is spring As we look from windsor ter...
id12958    find noth else, not even gold, the superintend...
                                 ...                        
id17718    I could have fancied, while I look at it, that...
id08973     the lid clench themselv togeth as if in a spasm.
id05267    mai il faut agir that is to say, a frenchman n...
id17513    for an item of news like this, it strike us it...
id00393    He laid a gnarl claw on my shoulder, and it se...
Name: stemmed, Length: 19579, dtype: object

In [34]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords

# CUSTOM_FILTERS = [strip_punctuation, remove_stopwords]
# explore['stemmed2'] = [ preprocess_string(s, CUSTOM_FILTERS) for s in explore.stemmed]
# explore.stemmed2

explore['stemmed2'] = [ strip_punctuation(remove_stopwords(s.lower())) for s in explore.stemmed]
explore.stemmed2

id
id26305    thi process  however  afford mean ascertain di...
id17569                        onc occur fumbl mere mistake 
id11008    hi left hand wa gold snuff box  which  caper h...
id27763    love spring look windsor terrac sixteen fertil...
id12958    noth else  gold  superintend abandon hi attemp...
                                 ...                        
id17718    fancied  look it  emin landscap painter built ...
id08973                    lid clench themselv togeth spasm 
id05267      mai il faut agir say  frenchman faint outright 
id17513    item news like this  strike wa veri coolli rec...
id00393    laid gnarl claw shoulder  shake wa altogeth mi...
Name: stemmed2, Length: 19579, dtype: object

In [35]:
print(explore.loc['id17569'].stemmed, '|||', explore.loc['id17569'].stemmed2)

It never onc occur to me that the fumbl might be a mere mistake. ||| onc occur fumbl mere mistake 


Pipeline-a си е същия

In [36]:
# %%time
# print(cross_val_score(pipeline, explore.stemmed2, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.05550683 -1.02313271 -1.02899542]<br>
CPU times: user 347 ms, sys: 92.8 ms, total: 440 ms<br>
Wall time: 1min 27s

Поне е по-бързо. Да видим tfidf и повече фичъри.

In [37]:
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(num_topics=501)),
    ('clf', LogisticRegression())
])

In [38]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.94941507 -0.95248997 -0.95026712]<br>
CPU times: user 1.18 s, sys: 223 ms, total: 1.4 s<br>
Wall time: 19min 4s

This is frustrating

In [39]:
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
    ('clf', LogisticRegression())
])

In [40]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.03061917 -1.03984701 -1.03884094]<br>
CPU times: user 387 ms, sys: 124 ms, total: 511 ms<br>
Wall time: 2min 33s

In [41]:
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer()),
    ('clf', MultinomialNB())
])

In [42]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.06731359 -1.07019065 -1.07019197]<br>
CPU times: user 377 ms, sys: 142 ms, total: 519 ms<br>
Wall time: 2min 29s

От къде е тая разлика? Може ли text2bow да чупи? Или е от липсата на хипер парамерти?

In [43]:
print(corpus)

MmCorpus(19579 documents, 22328 features, 240166 non-zero entries)


corpus е минат с word2vec; ще прескоча tfidf за момента и ще пусна LdaModel

In [44]:
from gensim.models.ldamodel import LdaModel

# lda = LdaModel(corpus=corpus, id2word=dictionary)
# lda.print_topic(0)

In [45]:
# id2word=dict([(i, s) for i, s in enumerate(dictionary)])
# print([ id2word[z] for z in range(10) ])
# print(list(id2word.items())[:10])
# what = list(enumerate(dictionary))
# print(what[:10])
# print(list(dictionary)[:10])
# print(type(dictionary))
# wdk = dictionary.keys()
# print(wdk[:10])
# print(dictionary[wdk[0]])

Горното даваше topic с референции към IDта вместо към реални думи - не знам защо сега работи

In [46]:
%%time
lda = LdaModel(corpus=corpus, id2word=dict(dictionary.items()))

  diff = np.log(self.expElogbeta)


CPU times: user 1min 44s, sys: 1.48 s, total: 1min 45s
Wall time: 1min 44s


Представянето на topic:

In [47]:
lda.print_topic(0)

'0.081*"said," + 0.071*"nature" + 0.041*"given" + 0.029*"silent" + 0.028*"gone" + 0.026*"visited" + 0.024*"winter" + 0.022*"determined" + 0.020*"sent" + 0.019*"could"'

In [48]:
lda.show_topics(num_topics=10, num_words=10)

[(47,
  '0.043*"words" + 0.038*"beheld" + 0.037*"trees" + 0.028*"endeavoured" + 0.026*"big" + 0.024*"rapidly" + 0.022*"water," + 0.022*"listened" + 0.019*"tone" + 0.019*"angles"'),
 (12,
  '0.052*"men" + 0.039*"large" + 0.032*"whilst" + 0.027*"one" + 0.025*"quiet" + 0.022*"almost" + 0.022*"things" + 0.017*"spring" + 0.017*"wandered" + 0.017*"supply"'),
 (29,
  '0.093*"place" + 0.051*"continued" + 0.041*"usual" + 0.041*"moment" + 0.041*"course," + 0.040*"cold" + 0.035*"easily" + 0.030*"effect" + 0.027*"time." + 0.025*"position"'),
 (78,
  '0.049*"dreams" + 0.045*"sound" + 0.039*"black" + 0.031*"arose" + 0.030*"which," + 0.029*"distant" + 0.028*"instant" + 0.025*"age" + 0.024*"wholly" + 0.022*"whose"'),
 (2,
  '0.091*"mind" + 0.075*"full" + 0.064*"less" + 0.053*"vast" + 0.049*"hope" + 0.042*"reached" + 0.042*"set" + 0.036*"sight" + 0.035*"forth" + 0.034*"portion"'),
 (48,
  '0.071*"him," + 0.070*"feel" + 0.059*"taken" + 0.031*"love," + 0.027*"carefully" + 0.026*"toward" + 0.020*"mind," +

Представянето на запис в латентното пространство. Векторите са 100, но Gensim показва само най-значимите:

In [49]:
lda[corpus[0]]

[(2, 0.046983603),
 (8, 0.057991676),
 (9, 0.091455139),
 (27, 0.056095801),
 (31, 0.48109618),
 (34, 0.055483889),
 (52, 0.041123584),
 (80, 0.070071869),
 (87, 0.054198224)]

In [50]:
lda.get_document_topics(corpus[1], minimum_probability=0.00001)

[(0, 0.0014285713),
 (1, 0.0014285713),
 (2, 0.0014285713),
 (3, 0.0014285713),
 (4, 0.0014285713),
 (5, 0.0014285713),
 (6, 0.0014285713),
 (7, 0.0014285713),
 (8, 0.0014285713),
 (9, 0.0014285713),
 (10, 0.0014285713),
 (11, 0.0014285713),
 (12, 0.0014285713),
 (13, 0.0014285713),
 (14, 0.0014285713),
 (15, 0.0014285713),
 (16, 0.0014285713),
 (17, 0.0014285713),
 (18, 0.0014285713),
 (19, 0.0014285713),
 (20, 0.0014285713),
 (21, 0.0014285713),
 (22, 0.0014285713),
 (23, 0.0014285713),
 (24, 0.0014285713),
 (25, 0.0014285713),
 (26, 0.0014285713),
 (27, 0.0014285713),
 (28, 0.0014285713),
 (29, 0.0014285713),
 (30, 0.0014285713),
 (31, 0.0014285713),
 (32, 0.0014285713),
 (33, 0.0014285713),
 (34, 0.0014285713),
 (35, 0.0014285713),
 (36, 0.0014285713),
 (37, 0.0014285713),
 (38, 0.0014285713),
 (39, 0.0014285713),
 (40, 0.0014285713),
 (41, 0.0014285713),
 (42, 0.0014285713),
 (43, 0.0014285713),
 (44, 0.0014285713),
 (45, 0.0014285713),
 (46, 0.0014285713),
 (47, 0.0014285713),
 (

In [51]:
print(len(corpus), len(train.author))

19579 19579


apply logistic regression, but first encode topics in dataframe

In [52]:
latently = train.copy()
for t in range(100):
    latently[t]=0

print(latently[list(range(100))])

         0   1   2   3   4   5   6   7   8   9  ...  90  91  92  93  94  95  \
id                                              ...                           
id26305   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id17569   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id11008   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id27763   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id12958   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
...      ..  ..  ..  ..  ..  ..  ..  ..  ..  .. ...  ..  ..  ..  ..  ..  ..   
id17718   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id08973   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id05267   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id17513   0   0   0   0   0   0   0   0   0   0 ...   0   0   0   0   0   0   
id00393   0   0   0   0   0   0   0   0   0   0 ... 

In [53]:
data = []
for i, doc in enumerate(corpus):
    temp = [0]*100
    for t in lda[doc]:
        temp[t[0]] = t[1]
    data.append(temp)

In [54]:
for t in range(100):
    latently[t] = [ data[z][t] for z in range(len(data)) ]

print(latently[list(range(100))])

          0         1         2    3         4      5    6    7         8   \
id                                                                           
id26305  0.0  0.000000  0.046983  0.0  0.000000  0.000  0.0  0.0  0.057969   
id17569  0.0  0.000000  0.000000  0.0  0.000000  0.000  0.0  0.0  0.000000   
id11008  0.0  0.000000  0.000000  0.0  0.000000  0.000  0.0  0.0  0.000000   
id27763  0.0  0.000000  0.000000  0.0  0.000000  0.000  0.0  0.0  0.000000   
id12958  0.0  0.746294  0.000000  0.0  0.000000  0.000  0.0  0.0  0.000000   
...      ...       ...       ...  ...       ...    ...  ...  ...       ...   
id17718  0.0  0.000000  0.000000  0.0  0.000000  0.000  0.0  0.0  0.000000   
id08973  0.0  0.000000  0.000000  0.0  0.336667  0.000  0.0  0.0  0.000000   
id05267  0.0  0.000000  0.000000  0.0  0.000000  0.000  0.0  0.0  0.000000   
id17513  0.0  0.000000  0.000000  0.0  0.000000  0.202  0.0  0.0  0.000000   
id00393  0.0  0.000000  0.000000  0.0  0.000000  0.000  0.0  0.0

In [55]:
AUTHOR_ENCODING = {'EAP': 0, 'MWS': 1, 'HPL': 2}
latently['encoded_author'] = [ AUTHOR_ENCODING[z] for z in train.author ]

In [56]:
latently.shape

(19579, 103)

In [57]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

X_train, X_test, y_train, y_test = train_test_split(latently[list(range(100))], latently.encoded_author, random_state=42)

In [58]:
logreg = LogisticRegression(C=0.01)

In [59]:
%%time
logreg.fit(X_train, y_train)

CPU times: user 77.2 ms, sys: 12.1 ms, total: 89.3 ms
Wall time: 88.1 ms


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:
print("Train score: {:.2f}".format(logreg.score(X_train, y_train)))
print("Test  score: {:.2f}".format(logreg.score(X_test, y_test)))
prediction = logreg.predict_proba(X_test)
logloss = log_loss(y_test, prediction, labels=[0, 1, 2])
print('Log Loss score', logloss)

Train score: 0.41
Test  score: 0.40
Log Loss score 1.08282020218


In [61]:
print(prediction[:10])

[[ 0.40762581  0.29947673  0.29289746]
 [ 0.401099    0.31041542  0.28848558]
 [ 0.4100876   0.30027008  0.28964232]
 [ 0.41722321  0.28798965  0.29478714]
 [ 0.40455119  0.30231512  0.29313369]
 [ 0.40296533  0.30548759  0.29154708]
 [ 0.41068999  0.30681649  0.28249352]
 [ 0.39780867  0.31690806  0.28528327]
 [ 0.40133745  0.30606794  0.29259461]
 [ 0.40204636  0.30801855  0.28993509]]


In [62]:
print(X_test[:10])

              0         1         2         3    4         5         6    7   \
id                                                                             
id15695  0.12625  0.000000  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
id07954  0.00000  0.000000  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
id16303  0.00000  0.000000  0.000000  0.000000  0.0  0.043915  0.041232  0.0   
id07932  0.00000  0.000000  0.092798  0.000000  0.0  0.000000  0.000000  0.0   
id20875  0.00000  0.000000  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
id14743  0.00000  0.076833  0.000000  0.077333  0.0  0.000000  0.000000  0.0   
id07281  0.00000  0.000000  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
id09240  0.00000  0.252500  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
id23995  0.00000  0.000000  0.000000  0.000000  0.0  0.000000  0.000000  0.0   
id15141  0.50500  0.000000  0.000000  0.000000  0.0  0.000000  0.000000  0.0   

               8    9     ...         9

In [63]:
print([ list(z).index(max(z)) for z in prediction[:10] ], y_test[:10])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] id
id15695    0
id07954    1
id16303    1
id07932    0
id20875    2
id14743    1
id07281    0
id09240    0
id23995    0
id15141    0
Name: encoded_author, dtype: int64


Maybe we have way too many topics for this to work, let's see what the efect of only 3 topics is. I'll use the manual approach instead of Pipleline, since I don't know how data conversion is being handled.

In [66]:
%%time
lda3 = LdaModel(corpus=corpus, num_topics=3, id2word=dict(dictionary.items()))

CPU times: user 45 s, sys: 556 ms, total: 45.6 s
Wall time: 45.4 s


In [67]:
late3 = train.copy()

data = []
for i, doc in enumerate(corpus):
    temp = [0, 0, 0]
    for t in lda3[doc]:
        temp[t[0]] = t[1]
    data.append(temp)

late3[0] = [ data[z][0] for z in range(len(data)) ]
late3[1] = [ data[z][1] for z in range(len(data)) ]
late3[2] = [ data[z][2] for z in range(len(data)) ]

print(late3)

                                                      text author         0  \
id                                                                            
id26305  This process, however, afforded me no means of...    EAP  0.021857   
id17569  It never once occurred to me that the fumbling...    HPL  0.524008   
id11008  In his left hand was a gold snuff box, from wh...    EAP  0.019030   
id27763  How lovely is spring As we looked from Windsor...    MWS  0.963659   
id12958  Finding nothing else, not even gold, the Super...    HPL  0.138102   
...                                                    ...    ...       ...   
id17718  I could have fancied, while I looked at it, th...    EAP  0.048320   
id08973  The lids clenched themselves together as if in...    EAP  0.095075   
id05267  Mais il faut agir that is to say, a Frenchman ...    EAP  0.050181   
id17513  For an item of news like this, it strikes us i...    EAP  0.377603   
id00393  He laid a gnarled claw on my shoulder, and 

In [68]:
late3['encoded_author'] = [ AUTHOR_ENCODING[z] for z in train.author ]

In [71]:
X_train, X_test, y_train, y_test = train_test_split(late3[[0,1,2]], late3.encoded_author, random_state=42)

In [72]:
%%time
logreg.fit(X_train, y_train)

CPU times: user 43.7 ms, sys: 76 µs, total: 43.8 ms
Wall time: 43.6 ms


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
print("Train score: {:.2f}".format(logreg.score(X_train, y_train)))
print("Test  score: {:.2f}".format(logreg.score(X_test, y_test)))
prediction = logreg.predict_proba(X_test)
logloss = log_loss(y_test, prediction, labels=[0, 1, 2])
print('Log Loss score', logloss)

Train score: 0.41
Test  score: 0.40
Log Loss score 1.08617640427


What if we dont use the LDA on it's own, but as extra features on top of the CountVectorizer? 

In [79]:
vectorizer = CountVectorizer(ngram_range=(1,2))

In [80]:
vectorized = vectorizer.fit_transform(train.text).todense()

MemoryError: 

Will use approach from http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html in another NB

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

In [64]:
fail here

SyntaxError: invalid syntax (<ipython-input-64-a2f04ec58aab>, line 1)

In [None]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Dropout, Lambda, Concatenate
from keras.optimizers import RMSprop, Adam
from keras import regularizers, objectives, metrics
from keras.models import Model
from keras import backend as K

from IPython.display import display

# Откри приблизително същите параметри, но не успя да стигне напълно до същия резултат.

Ще използвам следния модел:

TfIdf + MultinomialNB, без стеминг на текста.

Mean validation score: -0.423 (std: 0.003)

Ще ползвам и следните параметри:

Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}


Последна проверка на този модел за `LogLoss` и `Accuracy`

In [None]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)),
    ('clf', MultinomialNB(alpha=0.01))
])

print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3, 
                      scoring='neg_log_loss'))

# Трениране на модел и събмит

Първо да видим в какъв формат трябва да се подадат резултатите за тест

In [None]:
sample_submission = pd.read_csv("data/spooky-authors/sample_submission.zip")
sample_submission.head()

In [None]:
pipeline = pipeline.fit(train.text, train.author)

In [None]:
print(pipeline.predict_proba(test[:10].text))

In [None]:
test_predictions = pipeline.predict_proba(test.text)

In [None]:
print(pipeline.classes_)

In [None]:
submit_file = pd.DataFrame(test_predictions, columns=['EAP', 'MWS', 'HPL'], index=test.index)
submit_file.head(10)

In [None]:
submit_file.to_csv("data/spooky-authors/submit_Tfidf_MNB_text.csv")

Очакванията за събмита са да имаме скор някъде около 0.41 - 0.42.

Може да е малко по-добър защото при крос-валидацията тренирахме на 13к и тествахме 6к.

Сега трейн сета е целия: 19.5к

![submit-result.png](attachment:submit-result.png)

In [None]:
# Да хакнем ранкинга в кагъл?

print(test.text[:5].values)