In [1]:
import numpy as np
import pandas as pd
seed = 1
train = pd.read_csv('/home/sergey/R_Analyttics_Edge_MITx/kaggle/NYTimesBlogTrain.csv')
test = pd.read_csv('/home/sergey/R_Analyttics_Edge_MITx/kaggle/NYTimesBlogTest.csv')


combo = pd.merge(train, test, how='outer')
combo = combo.drop('Popular', axis=1)
y = train['Popular']
combo_clean = combo.fillna('empty')
combo_clean['LogWC'] = np.log(.1+ combo_clean['WordCount'])


import datetime as dt
combo_clean['date'] = combo_clean['PubDate'].apply(
    lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%A')
)
combo_clean['hour'] = combo_clean['PubDate'].apply(
    lambda x: int(dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%H'))
)
combo_clean[['PubDate','date','hour']][1:5]
bins = np.linspace(0,24,5)
labels = ['night', 'morning', 'afternoon', 'evening']
combo_clean['hour_bins'] = pd.cut(combo_clean['hour'], bins=bins, labels=labels)
combo_clean.drop(['hour', 'WordCount', 'PubDate'], axis=1, inplace=True)


import sklearn.feature_extraction.text as txt
categorical_dummies = pd.get_dummies(combo_clean[['NewsDesk','SectionName','SubsectionName','date']])
categorical_hours = pd.get_dummies(combo_clean['hour_bins'].apply(str))


import nltk

def extractEntities(text):
    sentences= nltk.sent_tokenize(text)
    entities= {}
    for sent in sentences:
        chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))
        for chunk in chunks:
            if type(chunk) != tuple:
                txt = str(chunk)
                txt = "_".join(" ".join(txt.split('/')).split()[1::2])
                entities[txt] = 1
    return " ".join(dict(entities).keys())


entities = combo_clean['Abstract'].apply(extractEntities)


import sklearn.feature_extraction.text as txt


countVec = txt.CountVectorizer(min_df=2, token_pattern=r'\w{1,}', stop_words='english',
                               strip_accents='unicode', lowercase=True)
boe = countVec.fit_transform(entities)

In [2]:
headlineCount = combo_clean.Headline.map(combo_clean.Headline.value_counts())
data = np.concatenate((categorical_dummies,
                       categorical_hours,
                       boe.todense(),
                       combo_clean['LogWC'].to_frame(),
                       headlineCount.to_frame()), axis=1)


data_train = data[: train.shape[0],:]
data_test  = data[train.shape[0]:,:]
import sklearn.cross_validation as cv
data_train, data_val, y_train, y_val = cv.train_test_split(data_train, y, test_size=.2, random_state=seed)

## Tune GradientBoostingClassifier

In [3]:
import sklearn.grid_search as grd
import sklearn.ensemble as ens
import sklearn.metrics as mts
import sklearn.cross_validation as cv

In [4]:
import scipy.stats as st

In [5]:
folds = cv.StratifiedShuffleSplit(y_train, random_state=seed, n_iter=3, test_size=.2)


# A tree with max_leaf_nodes=k has k - 1 split nodes 
# and thus can model interactions of up to order max_leaf_nodes - 1
# max_leaf_nodes=k gives comparable results to max_depth=k-1 
# but is significantly faster to train at the expense of a slightly higher training error
# http://scikit-learn.org/stable/modules/ensemble.html

# 'max_features', default for classification sqrt of number of features


params = {'max_leaf_nodes':st.randint(3,11),                # add 10 n_iter for every line
          'min_samples_leaf': st.randint(3,11),             # add 10 n_iter for every line
          'max_features': st.uniform(loc=.1,scale= .5)      # add 10 n_iter for every line
         }

In [6]:
clf_gbm = ens.GradientBoostingClassifier(random_state=seed, n_estimators=100)
rgrid_search = grd.RandomizedSearchCV(clf_gbm, param_distributions = params,
                                      n_iter=30, scoring='roc_auc',   # those add up to 30
                                      random_state=seed, cv=folds, n_jobs=-1)
rgrid_search

RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[0 0 ..., 1 0], n_iter=3, test_size=0.2, random_state=1),
          error_score='raise',
          estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=1, subsample=1.0, verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=30, n_jobs=-1,
          param_distributions={'max_leaf_nodes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc3ca63aac8>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc3ca63acf8>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc3ca63aef0>},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          scoring='roc_auc', verbose=0)

In [7]:
%%timeit -n1 -r1
rgrid_search.fit(data_train, y_train)

1 loops, best of 1: 34min 54s per loop


In [8]:
rgrid_search.best_score_

0.94800292886487036

In [9]:
rgrid_search.best_estimator_

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=0.42495436862236735,
              max_leaf_nodes=10, min_samples_leaf=7, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=1, subsample=1.0, verbose=0, warm_start=False)

In [10]:
rgrid_search.best_params_

{'max_features': 0.42495436862236735,
 'max_leaf_nodes': 10,
 'min_samples_leaf': 7}

In [11]:
import sklearn.metrics as mts
mts.roc_auc_score(y_val, rgrid_search.best_estimator_.predict_proba(data_val)[:,1])

0.93651388591418572

In [12]:
best_params = rgrid_search.best_params_
best_params

{'max_features': 0.42495436862236735,
 'max_leaf_nodes': 10,
 'min_samples_leaf': 7}

## Tune learning rate for best model

In [13]:
clf_gbm2 = ens.GradientBoostingClassifier(random_state=seed,
                                          max_leaf_nodes= best_params['max_leaf_nodes'],
                                          max_features= best_params['max_features'],
                                          min_samples_leaf = best_params['min_samples_leaf'],
                                          n_estimators=1000)


# Lower learning rate decreases overfitting
# Learning rate and n_estimators are interconnected
# Decreasing learnig rate by a factor of 2
# needs increasinf n_estimators by a factor of 10
params = {'learning_rate': [.1,.07,.05,.03]}

grid_search2 = grd.GridSearchCV(clf_gbm2, param_grid = params,
                                scoring='roc_auc', cv=folds, n_jobs=-1)

grid_search2

GridSearchCV(cv=StratifiedShuffleSplit(labels=[0 0 ..., 1 0], n_iter=3, test_size=0.2, random_state=1),
       error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=0.42495436862236735,
              max_leaf_nodes=10, min_samples_leaf=7, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              random_state=1, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.07, 0.05, 0.03]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=0)

In [14]:
%%timeit -n1 -r1
grid_search2.fit(data_train, y_train)

1 loops, best of 1: 1h 5min 52s per loop


In [15]:
grid_search2.grid_scores_

[mean: 0.94159, std: 0.00193, params: {'learning_rate': 0.1},
 mean: 0.94364, std: 0.00179, params: {'learning_rate': 0.07},
 mean: 0.94574, std: 0.00240, params: {'learning_rate': 0.05},
 mean: 0.94654, std: 0.00350, params: {'learning_rate': 0.03}]

In [16]:
grid_search2.best_score_

0.9465352565348677

In [17]:
grid_search2.best_params_

{'learning_rate': 0.03}

In [18]:
grid_search2.best_estimator_

GradientBoostingClassifier(init=None, learning_rate=0.03, loss='deviance',
              max_depth=3, max_features=0.42495436862236735,
              max_leaf_nodes=10, min_samples_leaf=7, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              random_state=1, subsample=1.0, verbose=0, warm_start=False)

## Validate best GBM on held-out set

In [19]:
mts.roc_auc_score(y_val, grid_search2.best_estimator_.predict_proba(data_val)[:,1])

0.93893231955450851