In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

#%matplotlib notebook

import sklearn
import matplotlib as mpl
import seaborn as sns

import re

%matplotlib inline

pd.options.display.max_rows = 10

In [2]:
# Dataset: load the competition files; every CSV is indexed by its `id` column.
#
# When working inside the fmi-hw... repo the same files live under data/spooky/:
#   data/spooky/train.zip, data/spooky/test.zip, data/spooky/sample_submission.zip

train, test, sample_submission = (
    pd.read_csv(path, index_col=['id'])
    for path in ("data/train.zip", "data/test.zip", "data/sample_submission.zip")
)

# Sanity check: the three frame shapes, then the columns present only in `train`.
shapes = [frame.shape for frame in (train, test, sample_submission)]
print(*shapes)
print(set(train.columns).difference(test.columns))

(19579, 2) (8392, 1) (8392, 3)
{'author'}


# ~~Първо - baseline модел~~

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import nltk
# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [4]:
# Work on a copy so the raw training frame is left untouched.
explore = train.copy()
stem = PorterStemmer()


def _stem_text(text):
    """Stem each whitespace-separated token of `text` and re-join with single spaces."""
    return " ".join(stem.stem(token) for token in text.split())


explore['stemmed'] = explore.text.apply(_stem_text)

# Show the stemmed text next to the original for the first rows.
explore[['stemmed', 'text']].head()

Unnamed: 0_level_0,stemmed,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"thi process, however, afford me no mean of asc...","This process, however, afforded me no means of..."
id17569,It never onc occur to me that the fumbl might ...,It never once occurred to me that the fumbling...
id11008,"In hi left hand wa a gold snuff box, from whic...","In his left hand was a gold snuff box, from wh..."
id27763,how love is spring As we look from windsor ter...,How lovely is spring As we looked from Windsor...
id12958,"find noth else, not even gold, the superintend...","Finding nothing else, not even gold, the Super..."


# ~~Допълнителните фичъри не сработиха, стеминга също.~~

Остава да пробвам:

* ~~Оптимизиране на модела с CountVectorizer.~~
* Добавяне на още фичъри, от латентни пространства (LDA) - topic modeling.
* Word embeddings с невронни мрежи.
* Стакинг на класификатори.

Засега ще разгледаме само оптимизирането на модела.

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB


from gensim import corpora
from gensim import sklearn_api

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD

documents = train.text

The entire processing of the texts has to be automated via the pipeline interface: tokenization, dictionary creation, doc2bow/text2bow, ...

In [6]:
class TextStats(BaseEstimator, TransformerMixin):
    """Stateless transformer producing one dict of simple statistics per document.

    The output (a list of dicts) is meant to be consumed by DictVectorizer.
    """

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, posts):
        """Return, for every text in `posts`, its length and its number of '.' characters.

        The '.' count is stored under the key 'num_sentences'.
        """
        stats = []
        for text in posts:
            stats.append({'length': len(text),
                          'num_sentences': text.count('.')})
        return stats

In [7]:
# Feature union of gensim LDA topic weights and scikit-learn TF-IDF, classified by
# a random forest.
# Both the LDA model and the forest are randomised; they are seeded so that
# repeated cross-validation runs produce the same scores.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # gensim branch: raw text -> bag-of-words -> TF-IDF -> LDA topic weights
            ('gensim', Pipeline([
                ('bows', sklearn_api.text2bow.Text2BowTransformer()),
                ('tfidf', sklearn_api.tfidf.TfIdfTransformer()),
                ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=42)),
            ])),

            # scikit-learn branch: TF-IDF weighted bag-of-words
            # (disabled option: follow it with TruncatedSVD(n_components=50))
            ('scikit', Pipeline([
                ('tfidf', TfidfVectorizer()),
            ])),

            # Disabled in this experiment: the ad hoc statistics branch
            #   ('text_stats', Pipeline([('stats', TextStats()), ('vect', DictVectorizer())]))
        ],

        # Disabled in this experiment: per-branch weights
        #   transformer_weights={'gensim': 0.8, 'scikit': 0.5, 'text_stats': 1.0}
    )),
    ('clf', RandomForestClassifier(random_state=42))
])

In [8]:
%%time
# 3-fold cross-validation of the current pipeline, one worker per fold.
scores = cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3)
print(scores)

  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)


[ 0.5791973   0.57906834  0.59279693]
CPU times: user 233 ms, sys: 105 ms, total: 338 ms
Wall time: 2min 28s


[ 0.59803922  0.59653693  0.59509579]<br>
CPU times: user 329 ms, sys: 1.26 s, total: 1.59 s<br>
Wall time: 2min 40s

In [13]:
# Feature union of TF-IDF, gensim LDA topic weights and ad hoc text statistics,
# classified by multinomial naive Bayes.
# The LDA model is randomised; it is seeded so that repeated runs are comparable.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # NOTE(review): this branch and the 'scikit' branch below are both a
            # default TfidfVectorizer, so the same TF-IDF columns enter the union
            # twice — confirm whether that is intended.
            ('features', TfidfVectorizer()),

            # gensim branch: raw text -> bag-of-words -> TF-IDF -> LDA topic weights
            ('gensim', Pipeline([
                ('bows', sklearn_api.text2bow.Text2BowTransformer()),
                ('tfidf', sklearn_api.tfidf.TfIdfTransformer()),
                ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=42)),
            ])),

            # scikit-learn branch: TF-IDF weighted bag-of-words
            # (disabled option: follow it with TruncatedSVD(n_components=50))
            ('scikit', Pipeline([
                ('tfidf', TfidfVectorizer()),
            ])),

            # Ad hoc per-document statistics
            ('text_stats', Pipeline([
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],

        # Disabled in this experiment: per-branch weights
        #   transformer_weights={'gensim': 0.8, 'scikit': 0.5, 'text_stats': 1.0}
    )),
    ('clf', MultinomialNB())
])

In [14]:
%%time
# 3-fold cross-validation of the current pipeline, one worker per fold.
scores = cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3)
print(scores)

  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)


[ 0.72717525  0.71835734  0.72904215]
CPU times: user 246 ms, sys: 126 ms, total: 372 ms
Wall time: 2min 24s


In [None]:
# Explicit stop for "Run All": the original cell was a bare undefined name that
# halted execution through a NameError. Raise a self-describing error instead.
raise RuntimeError("Deliberate stop: do not run the cells below automatically.")

In [None]:
# Fit gensim's Text2BowTransformer on the training texts and keep the transformed result.
bows = sklearn_api.text2bow.Text2BowTransformer().fit_transform(train.text)

In [None]:
# `train` is indexed by the string `id` column, so a bare integer key relies on
# pandas' positional fallback for non-integer indexes. `.iloc` makes the
# positional lookup explicit; `bows` is indexed by position as before.
print(train.text.iloc[656], bows[656])

In [None]:
# NOTE(review): despite its name, `lda` holds the output of fit_transform
# (presumably the per-document topic weights), not the fitted LDA model — confirm.
lda = sklearn_api.ldamodel.LdaTransformer().fit_transform(bows)

In [None]:
# `lda` here is the result of LdaTransformer.fit_transform(bows), i.e. one row
# per document, so the topic weights of document 656 are `lda[656]`; indexing it
# with the bag-of-words list (`lda[bows[656]]`) only works on an LdaModel object.
# `.iloc` is used because `train` has a string index.
print(train.text.iloc[656], bows[656], lda[656])

In [None]:
lda

In [None]:
# Explicit stop for "Run All": the original cell content was not valid Python and
# halted execution through a SyntaxError. Raise a self-describing error instead.
raise RuntimeError("Deliberate stop: do not run the cells below automatically.")

for now

[ 0.48360907  0.48850751  0.48613027]<br>
[-2.1033608  -2.09791398 -2.30065565]

Преди да пообработим текста да видим Logistic Regression

In [None]:
# LDA topic weights only (no TF-IDF), classified by logistic regression.
# The LDA model is randomised; seed it so that repeated runs are comparable.
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=42)),
    ('clf', LogisticRegression())
])


In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))

[ 0.53722426  0.53463071  0.53563218]<br>
CPU times: user 430 ms, sys: 169 ms, total: 599 ms<br>
Wall time: 3min 41s

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.96678405 -0.97080789 -0.97363342]<br>
CPU times: user 453 ms, sys: 121 ms, total: 573 ms<br>
Wall time: 3min 39s

Още един опит преди да окастрим текста

In [None]:
# Same as the previous experiment but with 21 topics.
# The LDA model is randomised; seed it so that repeated runs are comparable.
pipeline = Pipeline([
    ('features', sklearn_api.text2bow.Text2BowTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(num_topics=21, random_state=42)),
    ('clf', LogisticRegression())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.98861154 -0.97565475 -0.99821168]<br>
CPU times: user 381 ms, sys: 99.9 ms, total: 481 ms<br>
Wall time: 2min 8s

In [None]:
explore.stemmed

In [None]:
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords


def _clean_stemmed(text):
    """Lower-case `text`, pass it through remove_stopwords, then through strip_punctuation."""
    return strip_punctuation(remove_stopwords(text.lower()))


# Alternative that was tried and left disabled:
#   preprocess_string(s, [strip_punctuation, remove_stopwords]) for each stemmed text
explore['stemmed2'] = [_clean_stemmed(stemmed_text) for stemmed_text in explore.stemmed]
explore.stemmed2

In [None]:
# Compare both preprocessed variants of a single document side by side.
sample_row = explore.loc['id17569']
print(sample_row.stemmed, '|||', sample_row.stemmed2)

Pipeline-ът си е същият

In [None]:
# %%time
# print(cross_val_score(pipeline, explore.stemmed2, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.05550683 -1.02313271 -1.02899542]<br>
CPU times: user 347 ms, sys: 92.8 ms, total: 440 ms<br>
Wall time: 1min 27s

Поне е по-бързо. Да видим tfidf и повече фичъри.

In [None]:
# Bag-of-words -> TF-IDF -> LDA with 501 topics -> logistic regression.
# The LDA model is randomised; seed it so that repeated runs are comparable.
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(num_topics=501, random_state=42)),
    ('clf', LogisticRegression())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-0.94941507 -0.95248997 -0.95026712]<br>
CPU times: user 1.18 s, sys: 223 ms, total: 1.4 s<br>
Wall time: 19min 4s

This is frustrating

In [None]:
# Bag-of-words -> TF-IDF -> LDA (default number of topics) -> logistic regression.
# The LDA model is randomised; seed it so that repeated runs are comparable.
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=42)),
    ('clf', LogisticRegression())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.03061917 -1.03984701 -1.03884094]<br>
CPU times: user 387 ms, sys: 124 ms, total: 511 ms<br>
Wall time: 2min 33s

In [None]:
# Bag-of-words -> TF-IDF -> LDA (default number of topics) -> multinomial naive Bayes.
# The LDA model is randomised; seed it so that repeated runs are comparable.
pipeline = Pipeline([
    ('bow', sklearn_api.text2bow.Text2BowTransformer()),
    ('features', sklearn_api.tfidf.TfIdfTransformer()),
    ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=42)),
    ('clf', MultinomialNB())
])

In [None]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, 
#                 cv=3, n_jobs=3, scoring='neg_log_loss'))

[-1.06731359 -1.07019065 -1.07019197]<br>
CPU times: user 377 ms, sys: 142 ms, total: 519 ms<br>
Wall time: 2min 29s

Откъде е тая разлика? Може ли text2bow да чупи? Или е от липсата на хиперпараметри?

In [None]:
# NOTE(review): `corpus` and `dictionary` are used from here on but are not
# defined in any earlier cell, so on a fresh kernel this cell raised NameError.
# They are rebuilt here as a gensim Dictionary plus a bag-of-words (doc2bow)
# corpus. The tokenisation originally used is unknown — simple_preprocess is an
# assumption, TODO confirm.
from gensim.utils import simple_preprocess

tokenized_texts = [simple_preprocess(text) for text in train.text]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

# Show only the first few documents instead of dumping the whole corpus.
print(corpus[:5])

corpus е минат през doc2bow; ще прескоча tfidf за момента и ще пусна LdaModel

In [None]:
from gensim.models.ldamodel import LdaModel

# lda = LdaModel(corpus=corpus, id2word=dictionary)
# lda.print_topic(0)

In [None]:
# id2word=dict([(i, s) for i, s in enumerate(dictionary)])
# print([ id2word[z] for z in range(10) ])
# print(list(id2word.items())[:10])
# what = list(enumerate(dictionary))
# print(what[:10])
# print(list(dictionary)[:10])
# print(type(dictionary))
# wdk = dictionary.keys()
# print(wdk[:10])
# print(dictionary[wdk[0]])

Горното даваше topic с референции към IDта вместо към реални думи - не знам защо сега работи

In [None]:
%%time
# Train LDA on the bag-of-words corpus; id2word is passed as a plain dict built
# from the dictionary's items. The model is randomised, so it is seeded to keep
# the topics (and everything derived from them below) reproducible.
lda = LdaModel(corpus=corpus, id2word=dict(dictionary.items()), random_state=42)

Представянето на topic:

In [None]:
lda.print_topic(0)

In [None]:
lda.show_topics(num_topics=10, num_words=10)

Представянето на запис в латентното пространство. Векторите са 100, но Gensim показва само най-значимите:

In [None]:
# Index the model with the first document of the corpus to display its topics.
lda[corpus[0]]

In [None]:
# Same lookup for the second document, with a very small minimum_probability
# passed explicitly.
lda.get_document_topics(corpus[1], minimum_probability=0.00001)

In [None]:
# Print the number of corpus documents next to the number of author labels.
print(len(corpus), len(train.author))

apply logistic regression, but first encode topics in dataframe

In [None]:
# Copy of the training frame with one zero-filled column per topic (0..99).
latently = train.copy()
topic_columns = list(range(100))
for topic_id in topic_columns:
    latently[topic_id] = 0

print(latently[topic_columns])

In [None]:
# Expand each document's (topic id, weight) pairs into a fixed-length row of
# 100 values; topics the model does not report for a document stay 0.
data = []
for doc in corpus:
    row = [0] * 100
    for topic_id, weight in lda[doc]:
        row[topic_id] = weight
    data.append(row)

In [None]:
# Copy the dense topic rows into the frame, one column per topic.
topic_columns = list(range(100))
for topic_id in topic_columns:
    latently[topic_id] = [row[topic_id] for row in data]

print(latently[topic_columns])

In [None]:
# Integer code for every author label.
AUTHOR_ENCODING = {'EAP': 0, 'MWS': 1, 'HPL': 2}

# Look each training label up in the mapping (an unknown label raises KeyError).
latently['encoded_author'] = list(map(AUTHOR_ENCODING.__getitem__, train.author))

In [None]:
latently.shape

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# Hold-out split: the 100 topic columns are the features, the encoded author is
# the target; the split is seeded.
topic_columns = list(range(100))
X_train, X_test, y_train, y_test = train_test_split(
    latently[topic_columns],
    latently.encoded_author,
    random_state=42,
)

In [None]:
logreg = LogisticRegression(C=0.01)

In [None]:
%%time
logreg.fit(X_train, y_train)

In [None]:
# Mean accuracy on both parts of the split.
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Train score: {:.2f}".format(train_accuracy))
print("Test  score: {:.2f}".format(test_accuracy))

# Log loss of the predicted class probabilities on the held-out part.
prediction = logreg.predict_proba(X_test)
logloss = log_loss(y_test, prediction, labels=[0, 1, 2])
print('Log Loss score', logloss)

In [None]:
print(prediction[:10])

In [None]:
print(X_test[:10])

In [None]:
# Most probable class (first one on ties) for the first ten test rows, next to
# the true labels.
predicted_classes = prediction[:10].argmax(axis=1).tolist()
print(predicted_classes, y_test[:10])

Maybe we have way too many topics for this to work; let's see what the effect of only 3 topics is. I'll use the manual approach instead of Pipeline, since I don't know how data conversion is being handled.

In [None]:
%%time
# Same LDA training with only 3 topics. The model is randomised, so it is seeded
# to keep the topics (and the features derived from them) reproducible.
lda3 = LdaModel(corpus=corpus, num_topics=3, id2word=dict(dictionary.items()), random_state=42)

In [None]:
# Copy of the training frame extended with the three topic weights per document.
late3 = train.copy()

# Expand each document's (topic id, weight) pairs into a row of three values;
# topics the model does not report for a document stay 0.
data = []
for doc in corpus:
    row = [0, 0, 0]
    for topic_id, weight in lda3[doc]:
        row[topic_id] = weight
    data.append(row)

for topic_id in range(3):
    late3[topic_id] = [row[topic_id] for row in data]

print(late3)

In [None]:
# Look each training label up in AUTHOR_ENCODING (an unknown label raises KeyError).
late3['encoded_author'] = list(map(AUTHOR_ENCODING.__getitem__, train.author))

In [None]:
# Hold-out split on the three topic columns; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    late3[[0, 1, 2]],
    late3.encoded_author,
    random_state=42,
)

In [None]:
%%time
logreg.fit(X_train, y_train)

In [None]:
# Mean accuracy on both parts of the split.
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Train score: {:.2f}".format(train_accuracy))
print("Test  score: {:.2f}".format(test_accuracy))

# Log loss of the predicted class probabilities on the held-out part.
prediction = logreg.predict_proba(X_test)
logloss = log_loss(y_test, prediction, labels=[0, 1, 2])
print('Log Loss score', logloss)

What if we don't use the LDA on its own, but as extra features on top of the CountVectorizer?

In [None]:
vectorizer = CountVectorizer(ngram_range=(1,2))

In [None]:
# Keep the document-term matrix sparse. With unigrams and bigrams the vocabulary
# is very large, and `.todense()` would allocate a full documents x vocabulary
# array that is unlikely to fit in memory.
vectorized = vectorizer.fit_transform(train.text)

Will use approach from http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html in another NB

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

-----------------------------------------------

In [None]:
# Explicit stop for "Run All": the original cell content was not valid Python and
# halted execution through a SyntaxError. Raise a self-describing error instead.
raise RuntimeError("Deliberate stop: do not run the cells below automatically.")

In [None]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Dropout, Lambda, Concatenate
from keras.optimizers import RMSprop, Adam
from keras import regularizers, objectives, metrics
from keras.models import Model
from keras import backend as K

from IPython.display import display

# Откри приблизително същите параметри, но не успя да стигне напълно до същия резултат.

Ще използвам следния модел:

TfIdf + MultinomialNB, без стеминг на текста.

Mean validation score: -0.423 (std: 0.003)

Ще ползвам и следните параметри:

Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}


Последна проверка на този модел за `LogLoss` и `Accuracy`

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Final model: TF-IDF over word uni- and bigrams (case preserved) followed by
# multinomial naive Bayes, with the parameters quoted above.
tfidf_features = TfidfVectorizer(ngram_range=(1, 2), min_df=2,
                                 max_df=0.8, lowercase=False)
pipeline = Pipeline([
    ('features', tfidf_features),
    ('clf', MultinomialNB(alpha=0.01)),
])

# The same 3-fold cross-validation twice: the estimator's default score first,
# then negative log loss.
cv_options = dict(cv=3, n_jobs=3)
print(cross_val_score(pipeline, train.text, train.author, **cv_options))
print(cross_val_score(pipeline, train.text, train.author,
                      scoring='neg_log_loss', **cv_options))

# Трениране на модел и събмит

Първо да видим в какъв формат трябва да се подадат резултатите за тест

In [None]:
# Read the sample submission from the same data/ directory as the data-loading
# cell at the top of the notebook; the former data/spooky-authors/ path does not
# match the layout the rest of the notebook reads from.
# No index_col here, so `id` shows up as a regular column in the preview.
sample_submission = pd.read_csv("data/sample_submission.zip")
sample_submission.head()

In [None]:
pipeline = pipeline.fit(train.text, train.author)

In [None]:
print(pipeline.predict_proba(test[:10].text))

In [None]:
test_predictions = pipeline.predict_proba(test.text)

In [None]:
# Class labels as stored on the fitted pipeline. In scikit-learn the columns of
# predict_proba follow this order, so check it before labelling the predictions.
print(pipeline.classes_)

In [None]:
# predict_proba returns its columns in the order of `pipeline.classes_`, so the
# column labels are taken from there instead of being hard-coded. The previous
# hard-coded order ['EAP', 'MWS', 'HPL'] differs from the sorted class order and
# attached the HPL and MWS probabilities to the wrong authors.
submit_file = pd.DataFrame(test_predictions, columns=pipeline.classes_, index=test.index)
submit_file.head(10)

In [None]:
# Write the submission next to the input files: the data-loading cell at the top
# reads from data/, whereas the former data/spooky-authors/ directory is not
# used anywhere else for input. NOTE(review): confirm the intended output location.
submit_file.to_csv("data/submit_Tfidf_MNB_text.csv")

Очакванията за събмита са да имаме скор някъде около 0.41 - 0.42.

Може да е малко по-добър защото при крос-валидацията тренирахме на 13к и тествахме 6к.

Сега трейн сетът е целият: 19.5к

![submit-result.png](attachment:submit-result.png)

In [None]:
# Shall we hack the Kaggle ranking?

first_texts = test.text.head(5)
print(first_texts.values)