In [32]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from preprocess_input import preprocess
import pickle as pkl
import gzip
import re
from nltk.corpus import stopwords
import lightgbm as gbm

from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer; the preprocessing cell applies it token by token.
stemmer = SnowballStemmer("english")

In [33]:
# Load the training CSV; later cells read its title, text, author and label columns.
train = pd.read_csv('./data/kaggle/train.csv')

In [34]:
# Preview the first rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


In [35]:
# The preview truncates long strings, so show one complete title (the entry labelled 1).
train['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [36]:
# Define helpers for text normalization

# Import the NLTK corpus reader under an alias: the name `stopwords` is rebound
# below to the lookup table, so reading the word list through `stopwords` itself
# would fail with AttributeError the second time this cell is executed.
from nltk.corpus import stopwords as nltk_stopwords

# English stop words stored as dict keys for constant-time membership tests.
stopwords = {word: 1 for word in nltk_stopwords.words('english')}
# Matches every run of characters that are not ASCII letters or digits.
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def normalize_text(text):
    """Return ``text`` reduced to lower-case, space-separated informative tokens.

    Every run of non-alphanumeric characters is replaced by a single space, the
    result is lower-cased and stripped, and tokens are dropped when they are a
    single character long (or empty) or appear in ``stopwords``.

    Args:
        text: The string to normalize.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    cleaned = non_alphanums.sub(' ', text).lower().strip()
    return u" ".join(
        token for token in cleaned.split(" ")
        if len(token) > 1 and token not in stopwords)

In [37]:
print("Loading models")
pickle_model = "./data/models/wb_model.pkl"
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the gzip handle as soon as loading finishes instead of
# leaving it open until garbage collection.
with gzip.open(pickle_model, 'rb') as model_file:
    wb = pkl.load(model_file)
# Display the loaded object to confirm its type.
wb

Loading models


<wordbatch.wordbatch.WordBatch at 0x221840602e8>

In [38]:
print("Preprocessing")
# A hand-written record with the same three keys can be substituted here, e.g.:
# d = {'title': "Title of the news",
#      'text': "Long before Facebook, Twitter or even Google existed, the fact checking website Snopes.com was running down the half-truths, misinformation, and outright lies that ricochet across the Internet. Today it remains a widely respected clearinghouse of all things factual and not.",
#      'author': "Someone from Somewhere"}

# Sample record: the entry labelled 0 of each training column.
d = dict(
    title=train['title'][0],
    text=train['text'][0],
    author=train['author'][0],
)

# Stem every space-separated token and re-join with spaces. Each result is wrapped
# in a one-element list, which is what the later wb.transform calls receive.
title_stemmed = [' '.join(map(stemmer.stem, d['title'].split(' ')))]
text_stemmed = [' '.join(map(stemmer.stem, d['text'].split(' ')))]

Preprocessing


In [39]:
# Vectorise the stemmed title with the loaded WordBatch object.
X_title = wb.transform(title_stemmed)
# Disabled: would keep only the columns that have more than one stored entry.
# X_title = X_title[:, np.array(np.clip(X_title.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
# Show the matrix dimensions as a sanity check.
X_title.shape

Normalize text
Extract wordbags


(1, 8388608)

In [40]:
# Vectorise the stemmed article body with the same WordBatch object used for the title.
X_text = wb.transform(text_stemmed)
# Disabled: would keep only the columns that have more than one stored entry.
#X_text = X_text[:, np.array(np.clip(X_text.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
# Show the matrix dimensions as a sanity check.
X_text.shape

Normalize text
Extract wordbags


(1, 8388608)

In [41]:
#2191 fake
# NOTE(review): 2114 is hard-coded and d['author'] is not consulted; presumably an
# encoded author id — confirm against the training pipeline.
# Build the value directly as a single-row, single-column array so it can be
# stacked next to the title/text matrices.
X_author = np.array([2114]).reshape(-1, 1)
X_author.shape

(1, 1)

In [42]:
# Stack the title, text and author features side by side and convert to CSR.
sparse_merge = hstack((X_title, X_text, X_author)).tocsr()
# Column count should equal the sum of the three inputs' widths.
sparse_merge.shape

(1, 16777217)

In [43]:
# Size of the array of explicitly stored values in the merged row.
sparse_merge.data.shape

(710,)

In [44]:
# Mean over the explicitly stored values only (implicit zeros are not part of .data).
sparse_merge.data.mean()

2.9767294722996276

In [70]:
# Load a LightGBM booster from its saved text model file.
lgb = gbm.Booster(model_file='./data/models/lgb_model_best_iter.txt')

In [71]:
# NOTE(review): this scores the title features only; the merged title/text/author
# matrix `sparse_merge` built earlier is not used here — confirm which feature set
# the booster was trained on.
y = lgb.predict(X_title)

In [72]:
# Show the raw prediction array.
y

array([0.99999365])

In [49]:
# Report the prediction rounded to five decimals; `y` is an array, so it prints with brackets.
# NOTE(review): the message calls this a "score of truth" — verify what label value 1
# stands for in the training data so the wording is not inverted.
print("The score of truth from 0 to 1 for this news is {}".format(np.round(y, 5)))

The score of truth from 0 to 1 for this news is [0.99999]


In [51]:
print("Loading models")
pickle_model = "./data/models/models_ftrl.pkl"
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the gzip handle as soon as loading finishes instead of
# leaving it open until garbage collection.
with gzip.open(pickle_model, 'rb') as model_file:
    # This pickle holds a pair; note that `wb` from the earlier cell is rebound here.
    wb, clf = pkl.load(model_file)

Loading models


In [54]:
test = pd.read_csv('./data/kaggle/test.csv')
# Keep the ids and the row count of the test set for later use.
test_ids = test['id']
test_size = test.shape[0]
# Replace missing fields with placeholder strings. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on a column selection: that
# chained in-place form does not update the DataFrame when pandas Copy-on-Write is
# active and triggers a warning in recent pandas versions.
test['author'] = test['author'].fillna('No author')
test['title'] = test['title'].fillna('No title')
test['text'] = test['text'].fillna('No text')

In [65]:
# Vectorise the first 15 titles and texts and stack them column-wise as CSR.
# NOTE(review): despite the `X_test_*` names these features are built from `train`,
# not `test` — confirm this is intentional (the later accuracy check uses train labels).
X_test_title = wb.transform(train['title'][:15])
X_test_text = wb.transform(train['text'][:15])
# Rebinds `sparse_merge`; unlike the earlier single-row matrix there is no author column here.
sparse_merge = hstack((X_test_title, X_test_text)).tocsr()
sparse_merge.shape

Normalize text
Extract wordbags
Normalize text
Extract wordbags


(15, 16777216)

In [66]:
# Score the merged features with the FTRL classifier and show the raw outputs.
y_hat = clf.predict(sparse_merge)
print(y_hat)
# Threshold at 0.5 by rounding to obtain hard 0/1 labels.
pred = pd.DataFrame(np.round(y_hat).astype(int), columns=['label'])
# The scored rows are the first 15 rows of `train`, so attach their ids; using
# `test_ids` here would pair these predictions with unrelated test.csv ids.
pred['id'] = train['id'][:15].values
# Class balance of the predicted labels.
pred['label'].value_counts()

[9.27888230e-01 7.69142809e-01 9.99257483e-01 1.62336802e-02
 7.07578701e-02 1.13140920e-03 2.49696816e-04 6.30511676e-16
 6.02630253e-08 2.33416888e-14 4.20174617e-03 1.21747635e-02
 7.25265554e-01 1.00000000e+00 4.49430405e-01]


0    10
1     5
Name: label, dtype: int64

In [67]:
# Ground-truth labels of the same first 15 training rows, for visual comparison with the predictions.
train['label'][:15]

0     1
1     0
2     1
3     1
4     1
5     0
6     1
7     0
8     0
9     0
10    0
11    0
12    1
13    1
14    1
Name: label, dtype: int64

In [69]:
# NOTE: this import sits here rather than in the top import cell; only accuracy_score is used in this cell.
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
# Accuracy of the hard predictions against the labels of the first 15 training rows.
# NOTE(review): these rows come from train.csv; if the classifier was fitted on them,
# this is not a held-out "dev" score — confirm.
print("FTRL dev accuracy_score:", accuracy_score(train['label'][:15].values, np.round(pred['label'])))

FTRL dev accuracy_score: 0.6666666666666666
