In [2]:
import os
import re
import sys
import numpy as np
from scipy.sparse import hstack
import wordbatch
from wordbatch.models import FTRL
from wordbatch.extractors import WordBag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pandas as pd

In [4]:
df_path = 'data/df_stemmed_kaggle.csv'
df_full = pd.read_csv(df_path)

In [5]:
print("Start Loading the datasets")
df = pd.read_csv('data/df_final_v1.csv')
kaggle_test_df = pd.read_csv('data/kaggle/test.csv')

train_size = df.shape[0]
y = df['label']

test_ids = kaggle_test_df['id']
test_size = kaggle_test_df.shape[0]

Start Loading the datasets


In [6]:
stemmer = SnowballStemmer("english")

# Define helpers for text normalization
stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y0), 2)))

def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])

In [8]:
wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams":2, "hash_ngrams_weights":[0.5, -1.0], 
                                             "hash_size":2**23, "norm":'l2', "tf":'log', "idf":50.0})
                       , procs=8
                       , method="serial")


In [9]:
X_title = wb.transform(df_full['stemmed_title'])
X_text = wb.transform(df_full['stemmed_text'])
X_author = df_full['author_cat'].values
X_author = X_author.reshape(-1, 1)

Normalize text
Extract wordbags
Normalize text
Extract wordbags


In [10]:
print(X_title.shape)
print(X_text.shape)
print(X_author.shape)

(59594, 8388608)
(59594, 8388608)
(59594, 1)


In [11]:
sparse_merge = hstack((X_title, X_text, X_author)).tocsr()
sparse_merge.shape

(59594, 16777217)

In [12]:
X = sparse_merge[:train_size]
X_test = sparse_merge[train_size:]

In [14]:
from sklearn.model_selection import train_test_split
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

In [57]:
valid_X[:1].data.shape

(349,)

In [108]:
clf= FTRL(alpha=1, beta=1.0, L1=0.00001, L2=0.8, D=2 ** 25, iters=1)

In [109]:
clf.fit(train_X, train_y)

In [110]:
preds= clf.predict(valid_X)
print(rmsle(valid_y, preds))

0.17462991253007093


In [111]:
np.round(preds[:10])

array([0., 1., 0., 1., 0., 0., 0., 0., 0., 1.])

In [112]:
valid_y[:10]

19307    1
5179     0
37483    0
21443    1
15890    0
43097    0
20292    1
41112    0
35772    0
9417     1
Name: label, dtype: int64

In [113]:
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
print("FTRL dev f1_score:", f1_score(valid_y, np.round(preds)))
print("FTRL dev accuracy_score:", accuracy_score(valid_y, np.round(preds)))
print("FTRL dev recall_score:", recall_score(valid_y, np.round(preds)))
print("FTRL dev precision_score:", precision_score(valid_y, np.round(preds)))

FTRL dev f1_score: 0.8963414634146342
FTRL dev accuracy_score: 0.925
FTRL dev recall_score: 0.8918099089989889
FTRL dev precision_score: 0.9009193054136875


In [114]:
import pickle as pkl
import gzip
print("Saving models")
with gzip.open('./data/models/models_ftrl.pkl', 'wb') as model_file:
    pkl.dump((wb, clf), model_file, protocol=2)

Saving models


In [115]:
df_full.head()

Unnamed: 0,author_cat,stemmed_title,stemmed_text
0,2191,hous dem aide: we didn't even see comey letter...,hous dem aide: we didn't even see comey letter...
1,2114,"flynn: hillari clinton, big woman on campus - ...",ever get the feel your life circl the roundabo...
2,1863,whi the truth might get you fire,"whi the truth might get you fire octob 29, 201..."
3,4372,15 civilian kill in singl us airstrik have bee...,video 15 civilian kill in singl us airstrik ha...
4,3564,iranian woman jail for fiction unpublish stori...,print \r\r\nan iranian woman has been sentenc ...


In [43]:
d = {'title': df_full['stemmed_title'][1],
     'text': df_full['stemmed_text'][1],
     'author': df_full['author_cat'][1]}

In [61]:
X_test_wb = wb.transform([d['title']+' '+d['text']])
X_test_wb.shape

Normalize text
Extract wordbags


(1, 8388608)

In [45]:
X_text = wb.transform([d['text']])
X_text.shape

Normalize text
Extract wordbags


(1, 8388608)

In [46]:
X_author = np.array([2114])
X_author = X_author.reshape(-1, 1)
X_author.shape

(1, 1)

In [47]:
sparse_merge = hstack((X_title, X_text, X_author)).tocsr()
sparse_merge.shape

(1, 16777217)

In [62]:
pred2s= clf.predict(X_test_wb)
pred2s

array([0.99996022])

In [35]:
test_texts2= ['factual and not.']
vec = wb.transform(test_texts2)
vec

Normalize text
Extract wordbags


<1x8388608 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [36]:
pred2s= clf.predict(vec)
pred2s

array([0.98289872])