In [8]:
import os
import re
import sys
import numpy as np
from scipy.sparse import hstack
import time
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
import lightgbm as lgb
import wordbatch
from wordbatch.extractors import WordBag, WordHash
from nltk.corpus import stopwords
import pickle as pkl
import gzip

In [9]:
stemmer = SnowballStemmer("english")

# Define helpers for text normalization
stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y0), 2)))

def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])


In [10]:
train = pd.read_csv('./data/kaggle/train.csv')
test = pd.read_csv('./data/kaggle/test.csv')

train_size = train.shape[0]
y = train['label']
test_ids = test['id']
test_size = test.shape[0]

In [11]:
train.drop('label', axis=1, inplace=True)

In [12]:
df = train.append(test)
df.shape

(26000, 4)

In [13]:
df.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...


In [14]:
# fill nans
print("Filling NaNs")
df['author'].fillna('No author', inplace=True)
df['title'].fillna('No title', inplace=True)
df['text'].fillna('No text', inplace=True)

Filling NaNs


In [15]:
print("Start Encoding Labels for Author")
le = LabelEncoder()
df['author_cat'] = le.fit_transform(df['author'])
df.head()

Start Encoding Labels for Author


Unnamed: 0,id,title,author,text,author_cat
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1062
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,1026
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",940
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,2020
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1701


In [16]:
print("Start Stemming Title")
df['stemmed_title'] = df['title'].map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))

print("Start Stemming News Text")
df['stemmed_text'] = df['text'].map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))

Start Stemming Title
Start Stemming News Text


In [17]:
authors_df = df[['author', 'author_cat']]
authors_df.head()

Unnamed: 0,author,author_cat
0,Darrell Lucus,1062
1,Daniel J. Flynn,1026
2,Consortiumnews.com,940
3,Jessica Purkiss,2020
4,Howard Portnoy,1701


In [19]:
authors_df.to_csv('./data/author_cat_kaggle.csv', index=False)

In [21]:
# drop the title autor and text
df.drop(['id', 'title', 'author', 'text'], axis=1, inplace=True)
df.head()

KeyError: "['title' 'author' 'text'] not found in axis"

In [23]:
df.drop(['id'], axis=1, inplace=True)

In [24]:
wb = wordbatch.WordBatch(normalize_text
                         , extractor=(WordBag, {"hash_ngrams": 2,
                                                "hash_ngrams_weights": [0.5, -1.0],
                                                "hash_size": 2 ** 23,
                                                "norm": 'l2',
                                                "tf": 'log',
                                                "idf": 50.0}
                                     )
                         , procs=8
                         , method="serial")

wb.dictionary_freeze = True

In [26]:
X_title = wb.transform(df['stemmed_title'])
X_text = wb.transform(df['stemmed_text'])
X_author = df['author_cat'].values
X_author = X_author.reshape(-1, 1)
sparse_merge = hstack((X_title, X_text, X_author)).tocsr()

Normalize text
Extract wordbags
Normalize text
Extract wordbags


In [27]:
sparse_merge.shape

(26000, 16777217)

In [28]:
X = sparse_merge[:train_size]
X_test = sparse_merge[train_size:]
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)
d_train = lgb.Dataset(train_X, label=train_y)
d_valid = lgb.Dataset(valid_X, label=valid_y)
watchlist = [d_train, d_valid]

In [29]:
params = {
        'objective': 'binary',
        'learning_rate': 0.4,
        'num_leaves': 31,
        'feature_fraction': 0.64,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'boosting_type': 'gbdt',
        'metric': 'binary_logloss'
    }

model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist,
                      early_stopping_rounds=1000, verbose_eval=1)

[1]	training's binary_logloss: 0.403742	valid_1's binary_logloss: 0.409301
Training until validation scores don't improve for 1000 rounds.
[2]	training's binary_logloss: 0.266459	valid_1's binary_logloss: 0.270318
[3]	training's binary_logloss: 0.197907	valid_1's binary_logloss: 0.20503
[4]	training's binary_logloss: 0.152899	valid_1's binary_logloss: 0.162766
[5]	training's binary_logloss: 0.119917	valid_1's binary_logloss: 0.126612
[6]	training's binary_logloss: 0.0933714	valid_1's binary_logloss: 0.0968885
[7]	training's binary_logloss: 0.0759055	valid_1's binary_logloss: 0.0814135
[8]	training's binary_logloss: 0.0619906	valid_1's binary_logloss: 0.0690455
[9]	training's binary_logloss: 0.0522674	valid_1's binary_logloss: 0.0589304
[10]	training's binary_logloss: 0.0445224	valid_1's binary_logloss: 0.0511324
[11]	training's binary_logloss: 0.0388735	valid_1's binary_logloss: 0.0490104
[12]	training's binary_logloss: 0.0329797	valid_1's binary_logloss: 0.0439316
[13]	training's bina

[110]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[111]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[112]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[113]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[114]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[115]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[116]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[117]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[118]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[119]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[120]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[121]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[122]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[123]	training's binary_l

[233]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[234]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[235]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[236]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[237]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[238]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[239]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[240]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[241]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[242]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[243]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[244]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[245]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[246]	training's binary_l

[354]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[355]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[356]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[357]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[358]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[359]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[360]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[361]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[362]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[363]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[364]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[365]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[366]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[367]	training's binary_l

[464]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[465]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[466]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[467]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[468]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[469]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[470]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[471]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[472]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[473]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[474]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[475]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[476]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[477]	training's binary_l

[575]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[576]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[577]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[578]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[579]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[580]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[581]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[582]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[583]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[584]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[585]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[586]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[587]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[588]	training's binary_l

[688]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[689]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[690]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[691]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[692]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[693]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[694]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[695]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[696]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[697]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[698]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[699]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[700]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[701]	training's binary_l

[810]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[811]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[812]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[813]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[814]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[815]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[816]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[817]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[818]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[819]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[820]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[821]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[822]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[823]	training's binary_l

[925]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[926]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[927]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[928]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[929]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[930]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[931]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[932]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[933]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[934]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[935]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[936]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[937]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[938]	training's binary_l

[1046]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[1047]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[1048]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[1049]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[1050]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
[1051]	training's binary_logloss: 13.7268	valid_1's binary_logloss: 13.4834
Early stopping, best iteration is:
[51]	training's binary_logloss: 0.000556754	valid_1's binary_logloss: 0.00988581


In [30]:
preds = model.predict(valid_X)
print("LGB dev f1_score:", f1_score(valid_y, np.round(preds)))
print("LGB dev accuracy_score:", accuracy_score(valid_y, np.round(preds)))
print("LGB dev recall_score:", recall_score(valid_y, np.round(preds)))
print("LGB dev precision_score:", precision_score(valid_y, np.round(preds)))

LGB dev f1_score: 0.9961013645224172
LGB dev accuracy_score: 0.9961538461538462
LGB dev recall_score: 0.998046875
LGB dev precision_score: 0.9941634241245136


In [31]:
print('Saving model...')
# save model to file
if not os.path.exists('data/models/'):
    os.makedirs('data/models/')

model.save_model('data/models/lgb_model_best_iter_2.txt', num_iteration=model.best_iteration)
print('Model Saved')

Saving model...
Model Saved


In [32]:
print("Saving wb model")
with open('data/models/wb_transform.pkl', 'wb') as model_file:
    pkl.dump(wb, model_file, protocol=2)
print('Model Saved')

Saving wb model
Model Saved


In [33]:
y_hat = model.predict(X_test)
pred = pd.DataFrame(np.round(y_hat).astype(int), columns=['label'])
pred['id'] = test_ids
pred.to_csv('data/kaggle/submission_lgb_simple_2.csv', index=False)
print('Kaggle Submission Saved to data/kaggle/')

Kaggle Submission Saved to data/kaggle/


# now lets build a simple example to test a client version

In [27]:
gbm = lgb.Booster(model_file='./data/models/lgb_model_best_iter.txt')

print("Loading models")
pickle_model = "./data/models/wb_transform.pkl"
wb_t = pkl.load(open(pickle_model, 'rb'))

Loading models


In [28]:
gbm

<lightgbm.basic.Booster at 0x1f0615aeb38>

In [29]:
wb_t

<wordbatch.wordbatch.WordBatch at 0x1f0615ae978>

In [43]:
#author_cat	stemmed_title	stemmed_text
d = {'stemmed_title': train['title'][1],
     'stemmed_text': train['text'][1],
     'author_cat': 1026}

In [44]:
X_test_title = wb.transform([d['stemmed_title']])
X_test_title.shape

Normalize text
Extract wordbags


(1, 8388608)

In [45]:
X_test_text = wb.transform([d['stemmed_text']])
X_test_text.shape

Normalize text
Extract wordbags


(1, 8388608)

In [46]:
X_test_author = np.array(d['author_cat'])
X_test_author = X_test_author.reshape(-1, 1)
X_test_author.shape

(1, 1)

In [47]:
sparse_merge_test = hstack((X_test_title, X_test_text, X_test_author)).tocsr()
sparse_merge_test.shape

(1, 16777217)

In [48]:
pred = gbm.predict(sparse_merge_test)
pred

array([0.99802852])