# Imports and common functions

In [162]:
import pandas as pd
import numpy as np
import string
import json
import glob
import re

from pandas.io.json import json_normalize
from sklearn.metrics import mean_squared_error

# Common functions

In [112]:
def prepareForVWFormat(namespace_name, feature):
    processedFeature = feature
    if isinstance(feature, pd.Series):
        processedFeature = feature[0]

    if not processedFeature:
        processedFeature = '-'
    
    # can't be used in feature or namespace names are vertical bar, colon, space, and newline
    ns_processed = namespace_name.strip().replace(":", "").replace("|", "").replace(",", "")
    f_processed = processedFeature.strip().replace(":", "").replace("|", "").replace(",", "").replace("\n", " ")

    return "|" + ns_processed + " " + f_processed

In [100]:
def processText(text):
    processedText = text
    if isinstance(text, pd.Series):
        processedText = text[0]
        
    if not processedText:
        return '-'
    
    return re.sub(r'[^\w\s]', '', processedText.strip().lower())

In [101]:
def convertDomain(domain):
    processedDomain = domain
    if isinstance(domain, pd.Series):
        processedDomain = domain[0]

    if not processedDomain:
        return '-'
    
    return '0' if re.match("^habr.*", processedDomain) else '1'

In [102]:
def stripHtml(text):
    processedText = text
    if isinstance(text, pd.Series):
        processedText = text[0]
        
    if not processedText:
        return '-'

    return re.sub('<[^>]*>', '', processedText)

In [114]:
def convertToVWFormat(json_obj):
    author_name = processText(json_obj['author.name'])
    author_nickname = processText(json_obj['author.nickname'])
    content = processText(stripHtml(json_obj['content']))
    domain = convertDomain(json_obj['domain'])
    title = processText(stripHtml(json_obj['title']))
    flow = processText(json_obj['flow'])
    
    tags = processText(None if not json_obj['tags'][0] else ' '.join(json_obj['tags'][0]))
    hubs = processText(None if not json_obj['hubs'][0] else ' '.join([el['id'] for el in json_obj['hubs'][0]]))
    
    if not json_obj['published.$date'][0]:
        hour = '-'
        dow = '-'
    else:
        publication_date = pd.to_datetime(json_obj['published.$date'])[0]
        hour = str(publication_date.hour)
        dow = str(publication_date.dayofweek)
    
    
    return str(json_obj['post_id'][0]) + prepareForVWFormat('author_name', author_name) + \
        prepareForVWFormat('author_nickname', author_nickname) + prepareForVWFormat('content', content) + \
        prepareForVWFormat('domain', domain) + prepareForVWFormat('title', title) + \
        prepareForVWFormat('tags', tags) + prepareForVWFormat('hubs', hubs) + \
        prepareForVWFormat('flow', flow) + prepareForVWFormat('hour', hour) + \
        prepareForVWFormat('dow', dow) + "\n"

# Data extraction and analysis

In [115]:
%%time 

f_test_vw = open('../../data/habr_popularity/test.vw', 'w')
for file in glob.glob('../../data/habr_popularity/test/*.json'):
    json_obj = json_normalize(json.loads(open(file).read()))
    f_test_vw.write(convertToVWFormat(json_obj))
    
f_test_vw.close()

CPU times: user 37.6 s, sys: 1.54 s, total: 39.2 s
Wall time: 42.2 s


In [116]:
results = pd.read_csv('../../data/habr_popularity/train_target.csv')

In [135]:
'%2.16f' % (results[results['_id'] == 'https://geektimes.ru/post/8/']['favs_lognorm']).values[0]

'0.6931471805599453'

In [138]:
%%time 

f_train_vw = open('../../data/habr_popularity/train.vw', 'w')
for file in glob.glob('../../data/habr_popularity/train/*.json'):
    json_obj = json_normalize(json.loads(open(file).read()))
    target = '%2.16f' % (results[results['_id'] == json_obj['_id'][0]]['favs_lognorm']).values[0]
    f_train_vw.write(target + ' ' + convertToVWFormat(json_obj))
    
f_train_vw.close()    

CPU times: user 1h 13min 49s, sys: 54.8 s, total: 1h 14min 44s
Wall time: 1h 16min 47s


In [145]:
!wc -l ../../data/habr_popularity/train.vw

172913 ../../data/habr_popularity/train.vw


In [146]:
!split -l138330 ../../data/habr_popularity/train.vw

In [152]:
!ls -al ../../data/habr_popularity/

total 3099656
drwxrwxr-x 4 borowis borowis       4096 May 29 12:00 .
drwxrwxr-x 7 borowis borowis      12288 May 23 23:06 ..
-rw-rw-r-- 1 borowis borowis    2624681 May 29 12:00 model_1_1.vw
drwxr-xr-x 2 borowis borowis     716800 Apr 29 10:11 test
-rw-rw-r-- 1 borowis borowis      86480 May 29 11:47 test_1_1.predictions
-rw-rw-r-- 1 borowis borowis   91924254 May 29 00:13 test.vw
drwxr-xr-x 2 borowis borowis   23236608 Apr 29 10:11 train
-rw-rw-r-- 1 borowis borowis    9340760 May 23 23:08 train_target.csv
-rw-rw-r-- 1 borowis borowis 1522705530 May 29 01:37 train.vw
-rw-rw-r-- 1 borowis borowis     536415 May 29 12:00 valid_1_1.predictions
-rw-rw-r-- 1 borowis borowis 1216306113 May 29 11:57 xaa
-rw-rw-r-- 1 borowis borowis  306399417 May 29 11:57 xab


In [153]:
!cut -d " " -f1 ../../data/habr_popularity/xab > ../../data/habr_popularity/xab_labels

# vw first solution

In [None]:
!vw -c -k --passes 5 --ngram c2 --ngram t2 --hash strings --bootstrap 10 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_1_1.vw

Generating 2-grams for c namespaces.
Generating 2-grams for t namespaces.
final_regressor = ../../data/habr_popularity/model_1_1.vw
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = ../../data/habr_popularity/xaa.cache
Reading datafile = ../../data/habr_popularity/xaa
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.921812 1.921812            1            1.0   1.3863   0.0000      102
1.912189 1.902566            2            2.0   1.3863   0.0070       56
3.362879 4.813568            4            4.0   2.7081   0.1465       74
3.624586 3.886292            8            8.0   0.6931   0.8101      121
7.405031 11.185476           16           16.0   3.9890   0.2537       19
8.612327 9.819623           32           32.0   5.8636   0.8375       59
6.967297 5.322268           64           64.0   4.9767   0.5129  

In [184]:
%%time

!vw --passes 1 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_1_1.vw
!vw --passes 1 --ngram c2 --ngram t2 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_1_2.vw
!vw --passes 1 --ngram c3 --ngram t3 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_1_3.vw

!vw -c --cache_file ../../data/habr_popularity/3_1_cache --passes 3 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_3_1.vw
!vw -c --cache_file ../../data/habr_popularity/3_2_cache --passes 3 --ngram c2 --ngram t2 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_3_2.vw
!vw -c --cache_file ../../data/habr_popularity/3_3_cache --passes 3 --ngram c3 --ngram t3 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_3_3.vw

!vw -c --cache_file ../../data/habr_popularity/5_1_cache --passes 5 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_5_1.vw
!vw -c --cache_file ../../data/habr_popularity/5_2_cache --passes 5 --ngram c2 --ngram t2 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_5_2.vw
!vw -c --cache_file ../../data/habr_popularity/5_3_cache --passes 5 --ngram c3 --ngram t3 -d ../../data/habr_popularity/xaa -b 28 -f ../../data/habr_popularity/model_5_3.vw

final_regressor = ../../data/habr_popularity/model_1_1.vw
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../../data/habr_popularity/xaa
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.921812 1.921812            1            1.0   1.3863   0.0000       53
1.886483 1.851155            2            2.0   1.3863   0.0257       30
3.139852 4.393220            4            4.0   2.7081   0.2815       39
3.207326 3.274800            8            8.0   0.6931   1.5959       63
5.181009 7.154692           16           16.0   4.6913   0.3193        5
6.367646 7.554283           32           32.0   0.0000   2.7423       62
5.511507 4.655369           64           64.0   1.0986   0.9815        5
5.245319 4.979130          128          128.0   4.9273   1.8337       43
4.539271 3.833224          256          256.0   3.1781   2.539

1.921812 1.921812            1            1.0   1.3863   0.0000      102
1.902919 1.884025            2            2.0   1.3863   0.0137       56
3.318742 4.734565            4            4.0   2.7081   0.1703       74
3.499175 3.679609            8            8.0   0.6931   1.0398      121
6.820184 10.141192           16           16.0   3.9890   0.3056       19
8.021171 9.222159           32           32.0   5.8636   1.0432       59
6.496122 4.971073           64           64.0   4.9767   0.5909        6
5.846296 5.196471          128          128.0   4.2195   2.3418       86
5.013123 4.179949          256          256.0   0.6931   0.9101       21
4.367544 3.721965          512          512.0   1.6094   1.1091        5
3.956147 3.544749         1024         1024.0   4.3307   1.1124        4
3.476588 2.997030         2048         2048.0   4.4308   3.1637      107
2.993243 2.509899         4096         4096.0   3.8501   3.0832       13
2.654794 2.316345         8192         8192.0   2.


finished run
number of examples per pass = 248994
passes used = 5
weighted example sum = 1244970.000000
weighted label sum = 3485835.497545
average loss = 1.681011 h
best constant = 2.799935
total feature number = 91555700
Generating 3-grams for c namespaces.
Generating 3-grams for t namespaces.
final_regressor = ../../data/habr_popularity/model_5_3.vw
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
using cache_file = ../../data/habr_popularity/5_3_cache
using cache_file = ../../data/habr_popularity/xaa.cache
ignoring text input in favor of cache input
num sources = 2
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.921812 1.921812            1            1.0   1.3863   0.0000      150
1.908886 1.895961            2            2.0   1.3863   0.0094       81
3.392680 4.876474            4            4.0   2.7081   0.1250      108
3.649840 3.907

In [278]:
%%time
!vw -t -i ../../data/habr_popularity/model_1_1.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_1_1.predictions
!vw -t -i ../../data/habr_popularity/model_1_2.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_1_2.predictions
!vw -t -i ../../data/habr_popularity/model_1_3.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_1_3.predictions

!vw -t -i ../../data/habr_popularity/model_3_1.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_3_1.predictions
!vw -t -i ../../data/habr_popularity/model_3_2.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_3_2.predictions
!vw -t -i ../../data/habr_popularity/model_3_3.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_3_3.predictions

!vw -t -i ../../data/habr_popularity/model_5_1.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_5_1.predictions
!vw -t -i ../../data/habr_popularity/model_5_2.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_5_2.predictions
!vw -t -i ../../data/habr_popularity/model_5_3.vw -d ../../data/habr_popularity/xab -p ../../data/habr_popularity/model_5_3.predictions

only testing
predictions = ../../data/habr_popularity/model_1_1.predictions
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../../data/habr_popularity/xab
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.344693 0.344693            1            1.0   2.5649   3.1521       50
0.634031 0.923370            2            2.0   4.1744   3.2135       26
0.465276 0.296520            4            4.0   1.3863   1.7583       18
0.846801 1.228326            8            8.0   0.0000   0.7849       54
0.877162 0.907524           16           16.0   2.5649   2.7634       19
1.076894 1.276625           32           32.0   0.0000   0.0000       80
1.592764 2.108635           64           64.0   1.7918   4.4855       69
1.432847 1.272929          128          128.0   3.5553   3.1672       40
1.622036 1.811225          256          256.

1.736068 1.823974         2048         2048.0   2.7081   1.1317       17
1.713270 1.690472         4096         4096.0   0.0000   2.3496       70
1.706093 1.698916         8192         8192.0   3.2581   3.5663        7
1.714074 1.722054        16384        16384.0   3.5264   3.1302       41
1.708363 1.702652        32768        32768.0   4.4886   4.0521       11

finished run
number of examples per pass = 34583
passes used = 1
weighted example sum = 34583.000000
weighted label sum = 96818.099835
average loss = 1.707579
best constant = 2.799587
total feature number = 2543917
Generating 3-grams for c namespaces.
Generating 3-grams for t namespaces.
only testing
predictions = ../../data/habr_popularity/model_3_3.predictions
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../../data/habr_popularity/xab
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight 

In [281]:
valid_labels = np.loadtxt('../../data/habr_popularity/xab_labels')
model_predictions = ['model_1_1.predictions', 'model_1_2.predictions', 'model_1_3.predictions', \
                     'model_3_1.predictions', 'model_3_2.predictions', 'model_3_3.predictions', \
                     'model_5_1.predictions', 'model_5_2.predictions', 'model_5_3.predictions']

for model in model_predictions:
    vw_pred = np.loadtxt('../../data/habr_popularity/' + model)[:,0]
    print (model + ': ' + "{:2.16f}".format(mean_squared_error(valid_labels, vw_pred)))

model_1_1.predictions: 1.6805911058338114
model_1_2.predictions: 1.7123877136306151
model_1_3.predictions: 1.7567544967336086
model_3_1.predictions: 1.6629233797387080
model_3_2.predictions: 1.7075790863386933
model_3_3.predictions: 1.7321443497222855
model_5_1.predictions: 1.6692574091059014
model_5_2.predictions: 1.7003249636364874
model_5_3.predictions: 1.7386155290809329


In [154]:
vw_true_valid = np.loadtxt('../../data/habr_popularity/xab_labels')
vw_valid_pred = np.loadtxt('../../data/habr_popularity/valid_1_1.predictions')

In [165]:
mean_squared_error(vw_true_valid, vw_valid_pred[:, 0])

1.6803230621826402

In [282]:
!vw -t --loss_function squared -i ../../data/habr_popularity/model_1_1.vw -d ../../data/habr_popularity/test.vw -p ../../data/habr_popularity/test_1_1.predictions

only testing
predictions = ../../data/habr_popularity/test_1_1.predictions
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../../data/habr_popularity/test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0  unknown   2.7262       45
0.000000 0.000000            2            2.0  unknown   5.0520       19
0.000000 0.000000            4            4.0  unknown   2.9245       20
0.000000 0.000000            8            8.0  unknown   3.6799      102
0.000000 0.000000           16           16.0  unknown   2.6853       64
0.000000 0.000000           32           32.0  unknown   3.1512       63
0.000000 0.000000           64           64.0  unknown   5.8371       36
0.000000 0.000000          128          128.0  unknown   4.5595      122
0.000000 0.000000          256          2

# Save solution

In [283]:
vw_pred = np.loadtxt('../../data/habr_popularity/test_1_1.predictions')[:, 0]

In [173]:
vw_pred.shape

(5405,)

In [284]:
sample_submission = pd.read_csv('sample_submission.csv', index_col='_id')
sample_submission.shape

(5405, 1)

In [285]:
sample_submission['favs_lognorm'] = vw_pred
sample_submission.to_csv('baseline.csv')