In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

#%matplotlib notebook

import sklearn
import matplotlib as mpl
import seaborn as sns

import re

%matplotlib inline

pd.options.display.max_rows = 10

In [2]:
# Dataset: every archive is read with its `id` column as the index.
#
# When working inside the fmi-hw... repo the archives live under
# "data/spooky/" instead of "data/".
train, test, sample_submission = [
    pd.read_csv(archive, index_col=['id'])
    for archive in ("data/train.zip", "data/test.zip", "data/sample_submission.zip")
]

# Sanity checks: frame shapes, then the columns that exist only in the training set.
print(train.shape, test.shape, sample_submission.shape)
print(set(train.columns).difference(test.columns))

(19579, 2) (8392, 1) (8392, 3)
{'author'}


# ~~Първо - baseline модел~~

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import nltk
# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [4]:
# Stemming experiment, done on a copy so that `train` itself is left untouched.
stem = PorterStemmer()
explore = train.copy()
# Stem every whitespace-separated token and glue the tokens back together.
explore['stemmed'] = [" ".join(map(stem.stem, text.split())) for text in explore.text]
explore[['stemmed', 'text']].head()

Unnamed: 0_level_0,stemmed,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"thi process, however, afford me no mean of asc...","This process, however, afforded me no means of..."
id17569,It never onc occur to me that the fumbl might ...,It never once occurred to me that the fumbling...
id11008,"In hi left hand wa a gold snuff box, from whic...","In his left hand was a gold snuff box, from wh..."
id27763,how love is spring As we look from windsor ter...,How lovely is spring As we looked from Windsor...
id12958,"find noth else, not even gold, the superintend...","Finding nothing else, not even gold, the Super..."


# ~~Допълнителните фичъри не сработиха, стеминга също.~~

Остава да пробвам:

* ~~Оптимизиране на модела с CountVectorizer.~~
* Добавяне на още фичъри, от латентни пространства (LDA) - topic modeling.
* Word embeddings с невронни мрежи.
* Стакинг на класификатори.

Засега ще разгледаме само оптимизирането на модела.

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB


from gensim import corpora
from gensim import sklearn_api

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD

documents = train.text

The entire processing of the texts has to be automated through the pipeline interface: tokenization, dictionary creation, doc2bow/text2bow, ...

In [6]:
class TextStats(BaseEstimator, TransformerMixin):
    """Stateless transformer turning each document into a dict of simple statistics.

    The output is a list of dicts (one per document), the input format
    expected by DictVectorizer.
    """

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, posts):
        """Return one ``{'length', 'num_sentences'}`` dict per document in ``posts``."""
        stats = []
        for text in posts:
            stats.append({
                'length': len(text),
                # Rough sentence count: the number of '.' characters in the text.
                'num_sentences': text.count('.'),
            })
        return stats

In [7]:
# Baseline: LDA topic features (gensim) combined with a TF-IDF bag of words,
# classified by a random forest. The ad hoc text statistics branch
# (TextStats + DictVectorizer) and the FeatureUnion transformer_weights are
# intentionally left out of this variant.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # gensim branch: raw text -> bag-of-words -> TF-IDF -> LDA topic weights
            ('gensim', Pipeline([
                ('bows', sklearn_api.text2bow.Text2BowTransformer()),
                ('tfidf', sklearn_api.tfidf.TfIdfTransformer()),
                # LDA training is stochastic; pin the seed so reruns are comparable.
                ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=0)),
            ])),

            # scikit-learn branch: standard TF-IDF bag-of-words on the raw text
            ('scikit', Pipeline([
                ('tfidf', TfidfVectorizer()),
            ])),

        ],
    )),
    # Seeded for the same reason as the LDA step: reproducible scores.
    ('clf', RandomForestClassifier(random_state=0))
])

In [8]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))

| Office | Home |
|:------------- |:------------- |
| [ 0.59803922  0.59653693  0.59509579] | [ 0.58838848  0.58565737  0.58850575] |
| CPU times: user 329 ms, sys: 1.26 s, total: 1.59 s | CPU times: user 367 ms, sys: 608 ms, total: 975 ms |
| Wall time: 2min 40s | Wall time: 3min 22s |

In [9]:
# Second variant: LDA topics + TF-IDF bag of words + ad hoc text statistics,
# classified by multinomial naive Bayes. The FeatureUnion branches are left
# unweighted here.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # gensim branch: raw text -> bag-of-words -> TF-IDF -> LDA topic weights
            ('gensim', Pipeline([
                ('bows', sklearn_api.text2bow.Text2BowTransformer()),
                ('tfidf', sklearn_api.tfidf.TfIdfTransformer()),
                # LDA training is stochastic; pin the seed so reruns are comparable.
                ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=0)),
            ])),

            # scikit-learn branch: standard TF-IDF bag-of-words on the raw text
            ('scikit', Pipeline([
                ('tfidf', TfidfVectorizer()),
            ])),

            # ad hoc statistics branch
            ('text_stats', Pipeline([
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],
    )),
    ('clf', MultinomialNB())
])

In [10]:
# %%time
# print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))

| Home |
| ----- |
| [ 0.72840074  0.72019614  0.73042146] |
| CPU times: user 328 ms, sys: 120 ms, total: 448 ms |
| Wall time: 3min 13s |

Try weights

In [11]:
def report(results, n_top=5):
    """Print a summary of the best candidates from a parameter search.

    Parameters
    ----------
    results : mapping
        Search results holding the 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' entries (e.g. ``cv_results_``).
    n_top : int, default=5
        Ranks 1..n_top are printed; candidates tied on a rank are all listed.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate that shares this rank.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")


In [12]:
# Search spaces for a TF-IDF vectorizer. Keys use scikit-learn's
# "<step>__<parameter>" naming and are relative to the pipeline step that
# holds the vectorizer.

# Word-level n-grams, addressed to a step named 'tfidf'.
params_count_word = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__analyzer=['word'],
    tfidf__max_df=[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
    tfidf__min_df=[2, 3, 5, 10],
    tfidf__lowercase=[False, True],
    tfidf__stop_words=[None, stopwords],
)

# Character-level n-grams, addressed to a step named 'features'.
params_count_char = dict(
    features__ngram_range=[(1, 4), (1, 5), (1, 6)],
    features__analyzer=['char'],
    features__max_df=[1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
    features__min_df=[2, 3, 5, 10],
    features__lowercase=[False, True],
    features__stop_words=[None, stopwords],
)

First check if we can pass params OK

In [20]:
def random_search(random_state=None):
    """Randomized hyper-parameter search over the LDA + TF-IDF + text-stats pipeline.

    Samples 20 configurations from the MultinomialNB smoothing grid combined
    with the word-level TF-IDF grid in ``params_count_word``, scores each one
    with 3-fold cross-validated negative log loss on ``train.text`` /
    ``train.author``, and prints the best candidates via ``report``.

    Parameters
    ----------
    random_state : int or None, default=None
        Seed forwarded to the LDA transformer and to the parameter sampler.
        ``None`` keeps the previous, unseeded behaviour.
    """
    params = {
        "clf__alpha": [0.001, 0.005, 0.01, 0.05, 0.1, 0.3]
    }

    # The keys in params_count_word are relative to the 'tfidf' step, but that
    # step sits inside the 'scikit' branch of the 'union' step. Parameter names
    # must spell out the whole path, otherwise the search rejects them as
    # invalid parameters of the outer pipeline.
    params.update({"union__scikit__" + name: values
                   for name, values in params_count_word.items()})

    pipeline = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[

                # gensim branch: raw text -> bag-of-words -> TF-IDF -> LDA topic weights
                ('gensim', Pipeline([
                    ('bows', sklearn_api.text2bow.Text2BowTransformer()),
                    ('apitfidf', sklearn_api.tfidf.TfIdfTransformer()),
                    ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=random_state)),
                ])),

                # scikit-learn branch: standard TF-IDF bag-of-words on the raw text
                ('scikit', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                ])),

                # ad hoc statistics branch
                ('text_stats', Pipeline([
                    ('stats', TextStats()),  # returns a list of dicts
                    ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                ])),

            ],

            # weight components in FeatureUnion
            transformer_weights={
                'gensim': 0.8,
                'scikit': 0.5,
                'text_stats': 1.0,
            },
        )),
        ('clf', MultinomialNB())
    ])

    # Named `search` so the local object does not shadow this function's name.
    search = RandomizedSearchCV(pipeline, param_distributions=params,
                                scoring='neg_log_loss',
                                n_iter=20, cv=3, n_jobs=2,
                                random_state=random_state)

    search.fit(train.text, train.author)
    report(search.cv_results_)

In [21]:
# %%time
# random_search()

  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)
  diff = np.log(self.expElogbeta)


Model with rank: 1
Mean validation score: -0.407 (std: 0.002)
Parameters: {'clf__alpha': 0.1}

Model with rank: 2
Mean validation score: -0.408 (std: 0.003)
Parameters: {'clf__alpha': 0.05}

Model with rank: 3
Mean validation score: -0.449 (std: 0.000)
Parameters: {'clf__alpha': 0.3}

Model with rank: 4
Mean validation score: -0.453 (std: 0.005)
Parameters: {'clf__alpha': 0.01}

Model with rank: 5
Mean validation score: -0.486 (std: 0.007)
Parameters: {'clf__alpha': 0.005}

CPU times: user 2min 49s, sys: 4.18 s, total: 2min 53s
Wall time: 27min 9s


Model with rank: 1
Mean validation score: -0.407 (std: 0.002)
Parameters: {'clf__alpha': 0.1}

Model with rank: 2
Mean validation score: -0.408 (std: 0.003)
Parameters: {'clf__alpha': 0.05}

Model with rank: 3
Mean validation score: -0.449 (std: 0.000)
Parameters: {'clf__alpha': 0.3}

Model with rank: 4
Mean validation score: -0.453 (std: 0.005)
Parameters: {'clf__alpha': 0.01}

Model with rank: 5
Mean validation score: -0.486 (std: 0.007)
Parameters: {'clf__alpha': 0.005}

CPU times: user 2min 49s, sys: 4.18 s, total: 2min 53s
Wall time: 27min 9s

Add tfidf params. Disable the union weights?
Also, I had better grab a book: n_iter=6 took almost half an hour, so I expect n_iter=20 to take about 4x as long. Oh, make that another 10x because of trying the n-grams.

In [15]:
def random_search(random_state=None):
    """Randomized search with the alpha grid narrowed to [0.03, 0.1, 0.3].

    Note: this redefines the ``random_search`` function from the earlier cell.

    Samples 20 configurations from the MultinomialNB smoothing grid combined
    with the word-level TF-IDF grid in ``params_count_word``, scores each one
    with 3-fold cross-validated negative log loss on ``train.text`` /
    ``train.author``, and prints the best candidates via ``report``.

    Parameters
    ----------
    random_state : int or None, default=None
        Seed forwarded to the LDA transformer and to the parameter sampler.
        ``None`` keeps the previous, unseeded behaviour.
    """
    params = {
        "clf__alpha": [0.03, 0.1, 0.3]
    }

    # The keys in params_count_word are relative to the 'tfidf' step, but that
    # step sits inside the 'scikit' branch of the 'union' step. Parameter names
    # must spell out the whole path, otherwise the search rejects them as
    # invalid parameters of the outer pipeline.
    params.update({"union__scikit__" + name: values
                   for name, values in params_count_word.items()})

    pipeline = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[

                # gensim branch: raw text -> bag-of-words -> TF-IDF -> LDA topic weights
                ('gensim', Pipeline([
                    ('bows', sklearn_api.text2bow.Text2BowTransformer()),
                    ('apitfidf', sklearn_api.tfidf.TfIdfTransformer()),
                    ('lda', sklearn_api.ldamodel.LdaTransformer(random_state=random_state)),
                ])),

                # scikit-learn branch: standard TF-IDF bag-of-words on the raw text
                ('scikit', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                ])),

                # ad hoc statistics branch
                ('text_stats', Pipeline([
                    ('stats', TextStats()),  # returns a list of dicts
                    ('vect', DictVectorizer()),  # list of dicts -> feature matrix
                ])),

            ],

            # weight components in FeatureUnion
            transformer_weights={
                'gensim': 0.8,
                'scikit': 0.5,
                'text_stats': 1.0,
            },
        )),
        ('clf', MultinomialNB())
    ])

    # Named `search` so the local object does not shadow this function's name.
    search = RandomizedSearchCV(pipeline, param_distributions=params,
                                scoring='neg_log_loss',
                                n_iter=20, cv=3, n_jobs=2,
                                random_state=random_state)

    search.fit(train.text, train.author)
    report(search.cv_results_)

In [16]:
%%time
# Scores the notebook-level `pipeline`; the pipelines built inside
# `random_search` are local to that function and are not used here.
print(cross_val_score(pipeline, train.text, train.author, cv=3, n_jobs=3))

[ 0.62699142  0.62365921  0.62758621]<br>
CPU times: user 349 ms, sys: 102 ms, total: 451 ms<br>
Wall time: 3min 12s

In [17]:
FAIL HERE FOR NOW

SyntaxError: invalid syntax (<ipython-input-17-386d78a74f2d>, line 1)

# Откри приблизително същите параметри, но не успя да стигне напълно до същия резултат.

Ще използвам следния модел:

TfIdf + MultinomialNB, без стеминг на текста.

Mean validation score: -0.423 (std: 0.003)

Ще ползвам и следните параметри:

Parameters: {'features__stop_words': None, 'features__ngram_range': (1, 2), 'features__min_df': 2, 'features__max_df': 0.8, 'features__lowercase': False, 'features__analyzer': 'word', 'clf__alpha': 0.01}


Последна проверка на този модел за `LogLoss` и `Accuracy`

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Chosen model: word-level TF-IDF (unigrams + bigrams, case preserved) feeding
# a multinomial naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('features', TfidfVectorizer(
        lowercase=False,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.8,
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])

# 3-fold cross-validation, first with the estimator's default scorer ...
print(cross_val_score(estimator=pipeline, X=train.text, y=train.author,
                      cv=3, n_jobs=3))
# ... then with negative log loss.
print(cross_val_score(estimator=pipeline, X=train.text, y=train.author,
                      scoring='neg_log_loss', cv=3, n_jobs=3))

# Трениране на модел и събмит

Първо да видим в какъв формат трябва да се подадат резултатите за тест

In [None]:
# Peek at the expected submission format. The archive is read from the same
# "data/" directory as the train/test archives loaded at the top of the
# notebook (the previous "data/spooky-authors/" path did not match it).
sample_submission = pd.read_csv("data/sample_submission.zip")
sample_submission.head()

In [None]:
# Refit the chosen pipeline on the whole training set before predicting on test.
pipeline = pipeline.fit(train.text, train.author)

In [None]:
# Sanity check: class probabilities for the first 10 test rows.
print(pipeline.predict_proba(test[:10].text))

In [None]:
# Class probabilities for every test row.
test_predictions = pipeline.predict_proba(test.text)

In [None]:
# Class labels as stored on the fitted pipeline.
# NOTE(review): predict_proba columns should follow this order - confirm before labelling them.
print(pipeline.classes_)

In [None]:
# predict_proba orders its columns like pipeline.classes_ (the sorted author
# labels), so take the header from there. The previous hard-coded
# ['EAP', 'MWS', 'HPL'] order labelled the HPL and MWS probabilities the wrong way round.
submit_file = pd.DataFrame(test_predictions, columns=pipeline.classes_, index=test.index)
submit_file.head(10)

In [None]:
import os

# to_csv does not create missing directories; make sure the target exists so
# the write cannot fail after all the training work is done.
os.makedirs("data/spooky-authors", exist_ok=True)
submit_file.to_csv("data/spooky-authors/submit_Tfidf_MNB_text.csv")

Очакванията за събмита са да имаме скор някъде около 0.41 - 0.42.

Може да е малко по-добър, защото при крос-валидацията тренирахме на 13к и тествахме на 6к.

Сега трейн сетът е целият: 19.5к

![submit-result.png](attachment:submit-result.png)

In [None]:
# Shall we hack the Kaggle ranking?

# Show the raw text of the first five test rows.
print(test.text[:5].values)