In [46]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline, TransformerMixin, FeatureUnion
from sklearn.base import BaseEstimator

from lightgbm import LGBMClassifier

import warnings
# NOTE(review): with no category or module filter this suppresses every
# warning raised after this point, deprecation notices included.
warnings.filterwarnings('ignore')

In [47]:
class ColumnSelector(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that indexes its input with a fixed key.

    Mixins are listed before ``BaseEstimator`` so that ``BaseEstimator`` is
    the rightmost base, which is the inheritance order scikit-learn's
    estimator-development guide asks for.

    Parameters
    ----------
    columns : str or list of str
        Key passed to ``X[...]`` in :meth:`transform`. For a pandas DataFrame
        a single label gives a 1-D Series and a list of labels gives a
        2-D DataFrame.
    """

    def __init__(self, columns):
        # Kept under the same name as the constructor argument so that
        # BaseEstimator.get_params() can report it.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]`` without modifying ``X``."""
        return X[self.columns]

In [48]:
# Read both tab-separated inputs.
data_train = pd.read_csv('fr_learn.tsv', sep='\t')
data_test = pd.read_csv('fr_test.tsv', sep='\t')

# Stack the two frames and derive the helper columns in a single chain:
#   train     - True where `fresh_click` is present
#   timestamp - parsed from integer seconds into datetimes
#   day/hour/minute - calendar parts of the parsed timestamp
# Later keyword arguments of assign() see the columns created by earlier
# ones, so `day`, `hour` and `minute` read the already-parsed `timestamp`.
data = (
    pd.concat((data_train, data_test), sort=False)
    .assign(
        train=lambda frame: frame.fresh_click.notnull(),
        timestamp=lambda frame: pd.to_datetime(frame.timestamp, unit='s'),
        day=lambda frame: frame.timestamp.dt.day,
        hour=lambda frame: frame.timestamp.dt.hour,
        minute=lambda frame: frame.timestamp.dt.minute,
    )
)

In [49]:
# Non-text feature columns fed straight to the classifier.
numeric_columns = [
    'requests_per_prev_1_hour', 'requests_per_prev_2_hour',
    'requests_per_prev_6_hour', 'requests_per_prev_12_hour',
    'requests_per_prev_24_hour', 'requests_per_prev_72_hour',
    'hour', 'minute',
]

# Baseline model: select the numeric columns, then gradient-boosted trees.
pipeline_without_text = Pipeline([
    ('columns', ColumnSelector(numeric_columns)),
    ('clf', LGBMClassifier(n_estimators=100, class_weight='balanced')),
])

# Hold-out split on day of month: fit on days before the 29th, score on the 29th.
# The fit side is limited to rows flagged `train` (those with a `fresh_click`
# value) so that an unlabelled row dated before the 29th can never put a NaN
# into y_train.
labelled_rows = data[data['train']]
fit_rows = labelled_rows[labelled_rows.day < 29]
X_train = fit_rows
y_train = fit_rows.fresh_click

# dropna() removes every row with a missing value in any column; it is applied
# once and both the features and the target are taken from the same result.
holdout_rows = data[data.day == 29].dropna()
X_test = holdout_rows
y_test = holdout_rows.fresh_click

pipeline_without_text.fit(X_train, y_train)
predicted = pipeline_without_text.predict(X_test)
metrics.f1_score(y_test, predicted)

0.16634100756095488

In [50]:
# Text branch: take the `query` column and turn it into TF-IDF weights.
# The token pattern only matches standalone runs of digits.
query_tfidf_branch = Pipeline([
    ('selector', ColumnSelector('query')),
    ('tfidf', TfidfVectorizer(token_pattern=r'\b\d+\b')),
])

# Numeric branch: pass the selected columns through as they are.
numeric_branch = ColumnSelector([
    'requests_per_prev_1_hour', 'requests_per_prev_2_hour',
    'requests_per_prev_6_hour', 'requests_per_prev_12_hour',
    'requests_per_prev_24_hour', 'requests_per_prev_72_hour',
    'hour', 'minute',
])

# FeatureUnion joins the outputs of both branches into one feature matrix.
combined_features = FeatureUnion([
    ('tfidf', query_tfidf_branch),
    ('columns', numeric_branch),
])

pipeline_with_tfidf = Pipeline([
    ('features', combined_features),
    ('clf', LGBMClassifier(n_estimators=100, class_weight='balanced')),
])

# Same split as the numeric-only run, so the two F1 scores are comparable.
pipeline_with_tfidf.fit(X_train, y_train)
predicted = pipeline_with_tfidf.predict(X_test)
metrics.f1_score(y_test, predicted)

0.28656748253612757

In [51]:
# Refit the TF-IDF pipeline on every row flagged `train`, then predict the
# remaining rows.
has_label = data['train']
train = data.loc[has_label]
test = data.loc[~has_label]

pipeline_with_tfidf.fit(train, train['fresh_click'])
predicted = pipeline_with_tfidf.predict(test)

In [52]:
# Cast the predictions to integers so that str(p) in the export step does not
# carry a decimal part.
# NOTE(review): presumably the labels are floats because `fresh_click` picks
# up NaN when the two files are concatenated - confirm.
predicted = predicted.astype(int)

In [53]:
# Write one prediction per line. The `with` block closes the file even when a
# write raises part-way through, which the manual open()/close() pair did not.
with open('detector.tsv', 'w') as fout:
    fout.writelines(str(p) + '\n' for p in predicted)