In [34]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from preprocess_input import preprocess
import pickle as pkl
import gzip
import re
from nltk.corpus import stopwords
import lightgbm as gbm

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [35]:
train = pd.read_csv('./data/kaggle/train.csv')

In [36]:
# Define helpers for text normalization
stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])

In [37]:
print("Loading models")
pickle_model = "./data/models/models_ftrl.pkl"
wb, clf = pkl.load(gzip.open(pickle_model, 'rb'))

Loading models


In [38]:
wb

<wordbatch.wordbatch.WordBatch at 0x2410603fe80>

In [39]:
clf

<wordbatch.models.ftrl.FTRL at 0x24113d611b0>

In [64]:
train[train['label'] == 0].head(10)

Unnamed: 0,id,title,author,text,label
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0
10,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that...",0
11,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...,0
15,15,"In Major League Soccer, Argentines Find a Home...",Jack Williams,Guillermo Barros Schelotto was not the first A...,0
16,16,Wells Fargo Chief Abruptly Steps Down - The Ne...,Michael Corkery and Stacy Cowley,The scandal engulfing Wells Fargo toppled its ...,0
19,19,Chuck Todd: ’BuzzFeed Did Donald Trump a Polit...,Jeff Poor,Wednesday after Donald Trump’s press confere...,0


In [41]:
d = {'title': train['title'][1],
     'text': train['text'][1],
     'author': train['author'][1]}
# df_test = pd.DataFrame(data=d)
# df_test.head()
title_stemmed = [' '.join([stemmer.stem(y) for y in d['title'].split(' ')])]
text_stemmed = [' '.join([stemmer.stem(y) for y in d['text'].split(' ')])]

In [42]:
title_stemmed

['flynn: hillari clinton, big woman on campus - breitbart']

In [43]:
X_title = wb.transform(title_stemmed)
X_title.shape

Normalize text
Extract wordbags


(1, 8388608)

In [44]:
X_text = wb.transform(text_stemmed)
X_text.shape

Normalize text
Extract wordbags


(1, 8388608)

In [45]:
X_author = np.array([2114])
X_author = X_author.reshape(-1, 1)
X_author.shape

(1, 1)

In [46]:
sparse_merge = hstack((X_title, X_text, X_author)).tocsr()
sparse_merge.shape

(1, 16777217)

In [47]:
y = clf.predict(sparse_merge)

In [48]:
y

array([0.99998833])

In [65]:
test = pd.read_csv('./data/kaggle/test.csv')
test['author'].fillna('No author', inplace=True)
test['title'].fillna('No title', inplace=True)
test['text'].fillna('No text', inplace=True)

In [66]:
test.tail()

Unnamed: 0,id,title,author,text
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,No author,« Previous - Next » 300 US Marines To Be Deplo...
5199,25999,"Awkward Sex, Onscreen and Off - The New York T...",Teddy Wayne,Perhaps you’ve seen the new TV series whose pi...


In [67]:
test.loc[len(test)]=[26000, train['title'][1],train['author'][1],train['text'][1]] 
test.loc[len(test)]=[26001, train['title'][5],train['author'][5],train['text'][5]] 
test.loc[len(test)]=[26002, train['title'][7],train['author'][7],train['text'][7]] 
test.loc[len(test)]=[26003, train['title'][8],train['author'][8],train['text'][8]] 
test.tail()

Unnamed: 0,id,title,author,text
5199,25999,"Awkward Sex, Onscreen and Off - The New York T...",Teddy Wayne,Perhaps you’ve seen the new TV series whose pi...
5200,26000,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
5201,26001,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi..."
5202,26002,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi..."
5203,26003,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...


In [68]:
test_ids = test['id']
test_size = test.shape[0]
X_test_text = wb.transform(test['text'])
y_hat = clf.predict(X_test_text)
pred = pd.DataFrame(np.round(y_hat).astype(int), columns=['label'])
pred['id'] = test_ids
pred['label'].value_counts()

Normalize text
Extract wordbags


1    3098
0    2106
Name: label, dtype: int64

In [69]:
pred.tail()

Unnamed: 0,label,id
5199,0,25999
5200,1,26000
5201,1,26001
5202,1,26002
5203,1,26003


In [33]:
lgb = gbm.Booster(model_file='./data/models/lgb_model_best_iter.txt')
y_hat2 = lgb.predict(X_test_text)
pred2 = pd.DataFrame(np.round(y_hat2).astype(int), columns=['label'])
pred2['id'] = test_ids
pred2['label'].value_counts()

1    5200
Name: label, dtype: int64