## Preprocess data

In [1]:
#Load data
import numpy as np
import pandas as pd
seed = 1
train = pd.read_csv('/home/sergey/R_Analyttics_Edge_MITx/kaggle/NYTimesBlogTrain.csv')
test = pd.read_csv('/home/sergey/R_Analyttics_Edge_MITx/kaggle/NYTimesBlogTest.csv')
combo = pd.merge(train, test, how='outer')
combo = combo.drop('Popular', axis=1)
y = train['Popular']
combo_clean = combo.fillna('empty')
combo_clean['LogWC'] = np.log(.1+ combo_clean['WordCount'])

#Extract Date/time info and bin time
import datetime as dt
combo_clean['date'] = combo_clean['PubDate'].apply(
    lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%A')
)
combo_clean['hour'] = combo_clean['PubDate'].apply(
    lambda x: int(dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').strftime('%H'))
)
combo_clean[['PubDate','date','hour']][1:5]
bins = np.linspace(0,24,5)
labels = ['night', 'morning', 'afternoon', 'evening']
combo_clean['hour_bins'] = pd.cut(combo_clean['hour'], bins=bins, labels=labels)
combo_clean.drop(['hour', 'WordCount', 'PubDate'], axis=1, inplace=True)

#Make dummies and bags of words
import sklearn.feature_extraction.text as txt
categorical_dummies = pd.get_dummies(combo_clean[['NewsDesk','SectionName','SubsectionName','date']])
categorical_hours = pd.get_dummies(combo_clean['hour_bins'].apply(str))


tfidf = txt.TfidfVectorizer(stop_words='english', min_df=10, ngram_range=(1, 2),)


bwAbstract       = (tfidf.fit_transform(combo_clean['Abstract'])).todense()


#Put everything together
data = np.concatenate((categorical_dummies,
                       categorical_hours,
                       combo_clean['LogWC'].to_frame(),
                       bwAbstract), axis=1)

data_train = data[: train.shape[0],:]
data_test  = data[train.shape[0]:,:]
import sklearn.cross_validation as cv
data_train, data_val, y_train, y_val = cv.train_test_split(data_train, y, test_size=.2, random_state=seed)

In [2]:
data_train.shape

(5225, 2531)

## Tune GLM

In [3]:
import sklearn.linear_model as lm
import sklearn.cross_validation as cv
import sklearn.grid_search as grd
import sklearn.metrics as mts
import sklearn.ensemble as ens

In [4]:
lg_mod = lm.LogisticRegression(random_state=seed, class_weight='auto')
lg_mod

LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0)

In [5]:
folds = cv.StratifiedShuffleSplit(y_train, random_state=seed, n_iter=5, test_size=.2)
param = {'C': np.logspace(-2,2.,50),
        'penalty': ['l1', 'l2']}
gr = grd.GridSearchCV(lg_mod, param_grid=param, scoring = 'roc_auc', cv=folds)
gr

GridSearchCV(cv=StratifiedShuffleSplit(labels=[0 0 ..., 1 0], n_iter=5, test_size=0.2, random_state=1),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-02,   1.20679e-02,   1.45635e-02,   1.75751e-02,
         2.12095e-02,   2.55955e-02,   3.08884e-02,   3.72759e-02,
         4.49843e-02,   5.42868e-02,   6.55129e-02,   7.90604e-02,
         9.54095e-02,   1.15140e-01,   1.38950e-01,   1.6...    3.90694e+01,   4.71487e+01,   5.68987e+01,   6.86649e+01,
         8.28643e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=0)

In [6]:
%%timeit -n1 -r1
gr.fit(data_train, y_train)

1 loops, best of 1: 5min 48s per loop


In [7]:
gr.best_estimator_

LogisticRegression(C=1.325711365590108, class_weight='auto', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=1,
          solver='liblinear', tol=0.0001, verbose=0)

In [8]:
gr.best_score_

0.94260202428625106

## Validate LogisticRegression on held-out set

As we had a parameter `refit=True` in GridSearchCV we do not need to refit on the whole train data set.

In [9]:
mts.roc_auc_score(y_val, gr.best_estimator_.predict_proba(data_val)[:,1])

0.93146730206325401

## ExtraTreesClassifier

In [10]:
etc_mod = ens.ExtraTreesClassifier(class_weight='auto', n_jobs=-1, n_estimators=3000)
scores = cv.cross_val_score(etc_mod, data_train, y_train, scoring='roc_auc', cv=folds, n_jobs=-1)

In [11]:
print('Mean AUC is %0.5f \nStandard deviation of AUC is %0.5f' % (scores.mean(), scores.std()))

Mean AUC is 0.93390 
Standard deviation of AUC is 0.00721


In [12]:
etc_mod.fit(data_train, y_train)
print('Mean AUC on held-out set is %0.5f' % mts.roc_auc_score(y_val, etc_mod.predict_proba(data_val)[:,1]))

Mean AUC on held-out set is 0.92983


## AdaBoost

In [13]:
ada_mod = ens.AdaBoostClassifier(random_state=seed)
ada_mod

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=1)

In [14]:
import scipy.stats as st
params = {'n_estimators': [1000, 3000, 10000],
          'learning_rate': st.uniform(loc=.00001,scale= .5)
         }

In [15]:
rgrid_search = grd.RandomizedSearchCV(ada_mod, param_distributions = params,
                                      n_iter=20, scoring='roc_auc',
                                      random_state=seed, cv=folds, n_jobs=-1)
rgrid_search

RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[0 0 ..., 1 0], n_iter=5, test_size=0.2, random_state=1),
          error_score='raise',
          estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=1),
          fit_params={}, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd67114db38>, 'n_estimators': [1000, 3000, 10000]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          scoring='roc_auc', verbose=0)

In [16]:
%%timeit -n1 -r1
rgrid_search.fit(data_train, y_train)

1 loops, best of 1: 21h 45min 30s per loop


In [17]:
rgrid_search.best_score_

0.93347653668208863

In [18]:
rgrid_search.best_estimator_

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.018598112002693002, n_estimators=10000,
          random_state=1)

In [19]:
rgrid_search.best_params_

{'learning_rate': 0.018598112002693002, 'n_estimators': 10000}

In [20]:
print('Mean AUC on held-out set is %0.5f' % 
      mts.roc_auc_score(y_val, rgrid_search.best_estimator_.predict_proba(data_val)[:,1]))

Mean AUC on held-out set is 0.91844
