# Kaggle competition [How good is your Medium article?](https://www.kaggle.com/c/how-good-is-your-medium-article)

#### This notebook requires at least 17GB of RAM

Download the data [from the competition page](https://www.kaggle.com/c/how-good-is-your-medium-article/data).

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge
from html.parser import HTMLParser
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from IPython.display import HTML
import base64
from bs4 import BeautifulSoup

from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Single Porter stemmer instance shared by the feature-extraction functions below.
stemmer = PorterStemmer()

In [None]:
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text nodes of the markup fed to it.

    Every chunk passed to ``handle_data`` is appended to ``self.fed``;
    ``get_data`` returns the chunks concatenated in arrival order.
    """

    def __init__(self):
        # Let the base class set up all of its own parser state instead of
        # replicating it by hand with reset() and direct attribute writes.
        super().__init__(convert_charrefs=True)
        # Kept from the original implementation in case outside code reads it;
        # nothing in this class uses it.
        self.strict = False
        self.fed = []

    def handle_data(self, d):
        """Collect one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: Markup string to strip.

    Returns:
        The concatenated text nodes, with character references decoded.
    """
    s = MLStripper()
    s.feed(html)
    # feed() buffers incomplete data (for example trailing text that ends in a
    # bare "&..." sequence) until more input arrives; close() forces the parser
    # to process whatever is still buffered so that text is not lost.
    s.close()
    return s.get_data()

In [None]:
import json
def read_json_line(line=None):
    """Parse one JSON document, blanking out characters the decoder rejects.

    When decoding fails, the character at the position reported by the
    decoder is replaced with a space and decoding is retried, until the text
    parses.

    Args:
        line: JSON text (``str``).

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: If the reported position lies past the end of
            the text (e.g. truncated input) or already holds a space, i.e.
            when blanking a character cannot make any progress.
    """
    # Iterate instead of recursing so a line with many bad characters cannot
    # exhaust the recursion limit.
    while True:
        try:
            return json.loads(line)
        except json.JSONDecodeError as err:
            # err.pos is the same offset that appears as "(char N)" in the message.
            pos = err.pos
            if pos >= len(line) or line[pos] == ' ':
                # Nothing left to blank out: retrying would fail identically forever.
                raise
            line = line[:pos] + ' ' + line[pos + 1:]

def write_line(file, line):
    """Write ``line`` to ``file`` and terminate it with a newline character."""
    for piece in (line, '\n'):
        file.write(piece)

def extract_features_and_write(path_to_data,
                               inp_filename, is_train=True):
    """Read a JSON-lines file and collect four features for every record.

    Args:
        path_to_data: Directory that contains ``inp_filename``.
        inp_filename: Name of the JSON-lines input file.
        is_train: Selects the ``train``/``test`` prefix of the per-feature
            text files created in the current working directory.

    Returns:
        A tuple ``(c, p, t, a)`` of lists with one entry per input line:
        stemmed tag-free content, ``published['$date']``, ``title`` and
        ``author['url']``.
    """
    features = ['content', 'published', 'title', 'author']
    prefix = 'train' if is_train else 'test'
    # The original opened these four files for writing and never closed them.
    # Nothing is written to them here, so each one is created/truncated and
    # closed straight away.
    # NOTE(review): despite the function name no feature is written to disk;
    # confirm whether the per-feature files are still wanted.
    for feat in features:
        out_path = os.path.join(".", '{}_{}.txt'.format(prefix, feat))
        with open(out_path, 'w', encoding='utf-8'):
            pass

    c, p, t, a = [], [], [], []
    with open(os.path.join(path_to_data, inp_filename),
              encoding='utf-8') as inp_json_file:
        for line in inp_json_file:
            json_data = read_json_line(line)
            # A leftover `return json_data` here used to end the function on the
            # first record, which made all of the collection below unreachable.
            content = strip_tags(json_data['content'].replace('\n', ' ').replace('\r', ' '))
            # The stemmer is applied to the whole content string in one call.
            content = stemmer.stem(content)
            c.append(content)
            p.append(json_data['published']['$date'])
            t.append(json_data['title'])
            a.append(json_data['author']['url'])

    return c, p, t, a

In [None]:
from joblib import Parallel, delayed
import multiprocessing
from tqdm import tqdm_notebook
import re
     
def process_line(line):
    """Turn one raw JSON line into the six per-article features.

    Returns:
        A tuple ``(content, published, title, author, domain, tags)`` where
        ``content`` is the stemmed, tag-free article text, ``title`` is the
        part of the meta title before the first en dash, and ``tags`` is a
        space-separated string of tag names with spaces and hyphens removed,
        or the string ``'None'`` when the tag list cannot be extracted.
    """
    record = read_json_line(line)
    raw_html = record['content'].replace('\n', ' ').replace('\r', ' ')
    stemmed_text = stemmer.stem(strip_tags(raw_html))
    published = record['published']['$date']
    # e.g. 'Medium Terms of Service – Medium Policy – Medium' -> first segment only
    title = record['meta_tags']['title'].split('\u2013')[0].strip()
    author = record['author']['url']
    domain = record['domain']

    soup = BeautifulSoup(raw_html, 'lxml')
    # Translation table that deletes spaces and hyphens from a tag name.
    drop_chars = {ord(' '): None, ord('-'): None}
    try:
        anchors = soup.find('ul', class_='tags').find_all('a')
        tags = ' '.join([anchor.text.translate(drop_chars) for anchor in anchors])
    except Exception:
        # Reached, for instance, when the page has no <ul class="tags"> block.
        tags = 'None'
    return stemmed_text, published, title, author, domain, tags
 
def extract_features(path_to_data):
    """Run ``process_line`` over every line of a JSON-lines file in parallel.

    Args:
        path_to_data: Path of the JSON-lines file to read.

    Returns:
        A ``zip`` iterator that yields six tuples, one per feature, in the
        order returned by ``process_line``: content, published, title,
        author, domain, tags. For an empty file the iterator yields nothing,
        so unpacking it into six names fails.
    """
    # One worker per CPU reported by the OS.
    num_cores = multiprocessing.cpu_count()
    with open(path_to_data, encoding='utf-8') as inp_json_file:
        results = Parallel(n_jobs=num_cores)(
            delayed(process_line)(line) for line in tqdm_notebook(inp_json_file))
    # Transpose the list of per-line tuples into per-feature tuples.
    return zip(*results)


In [None]:
# Directory containing the competition files read by this notebook
# (train.json, test.json, the target CSV and the sample submission).
PATH_TO_DATA = '../input'

In [None]:
%%time
# extract_features yields the columns in process_line's return order:
# content, published, title, author, domain, tags.
# NOTE: despite the names, `t` receives the published timestamps and `p` the titles.
c, t, p, a, d, tag = extract_features(PATH_TO_DATA + '/train.json')

In [None]:
# Per-article metadata of the training set; the article text itself stays in
# `c` and is not stored in the frame.
train_df = pd.DataFrame()
train_df['published'] = pd.to_datetime(t)
train_df['title'] = p
train_df['author'] = a
train_df['domain'] = d
train_df['tags'] = tag

In [None]:
%%time
# Same extraction for the test set. Only the content gets its own name
# (c_test); t, p, a, d and tag are overwritten with the test values.
c_test, t, p, a, d, tag = extract_features(PATH_TO_DATA + '/test.json')

In [None]:
# Test-set metadata frame with the same columns as train_df.
test_df = pd.DataFrame()
# test_df['content'] = c
test_df['published'] = pd.to_datetime(t)
test_df['title'] = p
test_df['author'] = a
test_df['domain'] = d
test_df['tags'] = tag

In [None]:
# For every row, the number of 'title' values its author has within the same
# split; train and test are counted independently of each other.
train_df['count_by_author'] = train_df.groupby('author').transform('count')['title']
test_df['count_by_author'] = test_df.groupby('author').transform('count')['title']
# Standardise the counts: the scaler is fitted on train and reused for test.
count_scaler = StandardScaler()
train_df['count_by_author'] = count_scaler.fit_transform(train_df['count_by_author'].values.reshape(-1, 1))
test_df['count_by_author'] = count_scaler.transform(test_df['count_by_author'].values.reshape(-1, 1))

In [None]:
# Drop the temporary extraction results; their values were already copied
# into the data frames above.
del t
del p
del a
del d
del tag

In [None]:
%%time
# Token counts of the author URL strings; the vocabulary is learned on train
# and reused for test.
cv = CountVectorizer()
authors_train_sparse = cv.fit_transform(train_df.author.values)
authors_test_sparse = cv.transform(test_df.author.values)

In [None]:
%%time
# TF-IDF over the article text with unigrams and bigrams, limited to 100000 features.
contetnt_tf = TfidfVectorizer(ngram_range=(1, 2), max_features=100000)
content_sparse_train = contetnt_tf.fit_transform(c)
content_sparse_test = contetnt_tf.transform(c_test)

In [None]:
%%time
# Same TF-IDF configuration applied to the titles.
titles_tf = TfidfVectorizer(ngram_range=(1, 2), max_features=100000)
titles_sparse_train = titles_tf.fit_transform(train_df['title'])
titles_sparse_test = titles_tf.transform(test_df['title'])

In [None]:
%%time
# Token counts of the domain strings.
domain_cv = CountVectorizer()
domains_sparse_train = domain_cv.fit_transform(train_df['domain'])
domains_sparse_test = domain_cv.transform(test_df['domain'])

In [None]:
%%time
# Token counts of the space-separated tag strings built in process_line.
tags_cv = CountVectorizer()
tags_sparse_train = tags_cv.fit_transform(train_df['tags'])
tags_sparse_test = tags_cv.transform(test_df['tags'])

In [None]:
# len() of every content entry (the stemmed, tag-free text), used for the
# length-bucket features.
content_length = np.array([len(con) for con in c])
content_length_test = np.array([len(con) for con in c_test])

In [None]:
def add_length_features(X, length):
    """Append four one-hot content-length bucket columns to ``X``.

    The buckets partition the values in ``length``:
    short ``< 1000``, medium ``1000..4999``, medium_large ``5000..9999`` and
    large ``>= 10000``. Every row falls into exactly one bucket.

    Args:
        X: Feature matrix with one row per entry of ``length``.
        length: Sequence of content lengths.

    Returns:
        A CSR matrix equal to ``X`` with the columns short, medium,
        medium_large and large appended, in that order.
    """
    length_s = pd.Series(length)
    bucket_masks = [
        length_s < 1000,
        # The lower bound is inclusive: with the former `> 1000` a length of
        # exactly 1000 belonged to no bucket at all.
        (length_s >= 1000) & (length_s < 5000),
        (length_s >= 5000) & (length_s < 10000),
        length_s >= 10000,
    ]
    bucket_columns = [mask.astype('int').values.reshape(-1, 1) for mask in bucket_masks]

    return hstack([X] + bucket_columns).tocsr()

In [None]:
%%time
# Column order of the combined matrices: author, content, title, domain, tags.
X_train_sparse = hstack([authors_train_sparse, content_sparse_train, titles_sparse_train, domains_sparse_train, tags_sparse_train]).tocsr()
X_test_sparse = hstack([authors_test_sparse, content_sparse_test, titles_sparse_test, domains_sparse_test, tags_sparse_test]).tocsr()

In [None]:
# Append the content-length bucket indicators as the last columns.
X_train_sparse = add_length_features(X_train_sparse, content_length)
X_test_sparse = add_length_features(X_test_sparse, content_length_test)

In [None]:
def add_date_features(X_train, X_test, pub, pub_test):
    """Append ten publication-date columns to the train and test matrices.

    The appended columns are, in order: hour, day of week, month, year
    (all standardised), the indicators morning (7-11h), day (12-18h),
    evening (19-23h) and night (0-6h), the standardised day of month, and
    the weekday indicator (day of week < 5). Each standardisation is fitted
    on the train values and then applied to the test values.

    Args:
        X_train: Train feature matrix.
        X_test: Test feature matrix.
        pub: Datetime series aligned with the rows of ``X_train``.
        pub_test: Datetime series aligned with the rows of ``X_test``.

    Returns:
        A pair ``(train, test)`` of CSR matrices with the new columns appended.
    """
    def _flag(mask):
        # Boolean array -> integer column vector.
        return mask.astype('int').reshape(-1, 1)

    def _calendar_parts(stamps):
        # Unscaled hour, day-of-week, month, year and day-of-month arrays.
        parts = stamps.dt
        return [parts.hour.values, parts.dayofweek.values, parts.month.values,
                parts.year.values, parts.day.values]

    def _indicators(hours, weekdays):
        # Indicators are derived from the unscaled hour / day-of-week values.
        return {
            'weekday': _flag(weekdays < 5),
            'morning': _flag((hours >= 7) & (hours <= 11)),
            'day': _flag((hours >= 12) & (hours <= 18)),
            'evening': _flag((hours >= 19) & (hours <= 23)),
            'night': _flag((hours >= 0) & (hours <= 6)),
        }

    def _assemble(base, scaled, flags):
        hour, dow, month, year, day_of_month = scaled
        return hstack([base, hour, dow, month, year,
                       flags['morning'], flags['day'], flags['evening'], flags['night'],
                       day_of_month, flags['weekday']]).tocsr()

    raw_train = _calendar_parts(pub)
    raw_test = _calendar_parts(pub_test)
    flags_train = _indicators(raw_train[0], raw_train[1])
    flags_test = _indicators(raw_test[0], raw_test[1])

    # One scaler object, refitted for each feature on its train values.
    sc = StandardScaler()
    scaled_train, scaled_test = [], []
    for train_values, test_values in zip(raw_train, raw_test):
        scaled_train.append(sc.fit_transform(train_values.reshape(-1, 1)))
        scaled_test.append(sc.transform(test_values.reshape(-1, 1)))

    return (_assemble(X_train, scaled_train, flags_train),
            _assemble(X_test, scaled_test, flags_test))

In [None]:
%%time
# Final feature matrices: the text/length features plus the date columns.
X_train, X_test = add_date_features(X_train_sparse, X_test_sparse, train_df['published'], test_df['published'])

In [None]:
# Display both shapes.
X_train.shape, X_test.shape

In [None]:
from scipy.sparse import save_npz

In [None]:
# Persist the matrices. NOTE: this happens before the count_by_author column
# is appended further down, so the saved files do not contain that column.
save_npz('train_domains_tags_author_fixed.npz', X_train)
save_npz('test_domains_tags_author_fixed.npz', X_test)

In [None]:
# Persist the metadata frames in Feather format.
train_df.to_feather('train_domains_tags_author_fixed')
test_df.to_feather('test_domains_tags_author_fixed')

In [None]:
!ls -lh

In [None]:
import gc

In [None]:
# Release the raw texts and their lengths; the sparse features built from
# them already exist.
del c
del c_test
del content_length
del content_length_test
gc.collect()

In [None]:
# Intermediate matrices that preceded X_train / X_test.
del X_train_sparse
del X_test_sparse

In [None]:
del content_sparse_train
del content_sparse_test
del titles_sparse_train
del titles_sparse_test
del authors_train_sparse
del authors_test_sparse
gc.collect()

In [None]:
# Append the standardised per-author article count as the last column.
X_train = hstack([X_train, train_df['count_by_author'].values.reshape(-1, 1)]).tocsr()
X_test = hstack([X_test, test_df['count_by_author'].values.reshape(-1, 1)]).tocsr()

In [None]:
# The frames are not read again after the column above has been appended.
del train_df
del test_df

In [None]:
del domains_sparse_train
del domains_sparse_test
del tags_sparse_train
del tags_sparse_test

In [None]:
gc.collect()

# LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Training target: the 'log_recommends' column of the target CSV, indexed by 'id'.
train_target = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_log1p_recommends.csv'), 
                           index_col='id')
y_train = train_target['log_recommends'].values

In [None]:
# Hold-out split by row order, without shuffling: the first 70% of the rows
# are used for fitting, the remaining rows for validation.
train_part_size = int(0.7 * train_target.shape[0])
X_train_part = X_train[:train_part_size, :]
y_train_part = y_train[:train_part_size]
X_valid =  X_train[train_part_size:, :]
y_valid = y_train[train_part_size:]

In [None]:
# LightGBM datasets; the matrices are cast to float32 first.
lgb_x_train_part = lgb.Dataset(X_train_part.astype(np.float32), label=y_train_part)
lgb_x_valid = lgb.Dataset(X_valid.astype(np.float32), label=y_valid)

In [None]:
# Booster settings: MAE as both objective and validation metric.
# 'num_trees': 200 repeats the num_round = 200 that is passed to lgb.train.
param = {'num_leaves': 31, 'num_trees': 200, 'objective': 'mean_absolute_error',
        'metric': 'mae'}

In [None]:
%%time
num_round = 200
# Train on the 70% part and monitor MAE on the hold-out part, stopping once it
# has not improved for 20 rounds. The `early_stopping_rounds` keyword of
# lgb.train was removed in LightGBM 4.0; the callback form works on both the
# older and the newer releases.
bst_lgb = lgb.train(param, lgb_x_train_part, num_round, valid_sets=[lgb_x_valid],
                    callbacks=[lgb.early_stopping(stopping_rounds=20)])

In [None]:
# Save the trained booster to a file in the working directory.
bst_lgb.save_model('gradient_200_a_fixed')

In [None]:
!ls -lh

In [None]:
# Validation predictions using the iteration selected by early stopping.
lgb_pred = bst_lgb.predict(X_valid.astype(np.float32), num_iteration=bst_lgb.best_iteration)

In [None]:
# Validation MAE on the 'log_recommends' scale, shown next to expm1 of that
# value. Note that this is expm1 of the mean error, not a mean error computed
# on expm1-transformed predictions.
lgb_valid_mae = mean_absolute_error(y_valid, lgb_pred)
lgb_valid_mae, np.expm1(lgb_valid_mae)

In [None]:
# NOTE(review): this estimator is replaced by a new `ridge` in a later cell
# before it is ever fitted — confirm whether it is still needed.
ridge = Ridge(random_state=17, alpha=0.5)

In [None]:
# NOTE(review): `params` is not referenced by any visible cell (it looks like
# a grid for GridSearchCV) — confirm whether it is still needed.
params = {'alpha' : np.linspace(0.5, 1.5, 5)}

In [None]:
%%time
# Fit one ridge model per regularisation strength on the 70% part and score
# it on the hold-out part.
alphas = np.linspace(0.5, 1.5, 5)
ridges = [Ridge(alpha=alpha, random_state=17) for alpha in alphas]

for alpha, model in zip(alphas, ridges):
    model.fit(X_train_part, y_train_part)
    ridge_pred = model.predict(X_valid)
    valid_mae = mean_absolute_error(y_valid, ridge_pred)
    # Print the alpha next to its score; without it the output lines cannot
    # be matched to the setting that produced them.
    print(alpha, valid_mae, np.expm1(valid_mae))

In [None]:
# Displays the five alpha values of the sweep (0.5 to 1.5); the result is not assigned.
np.linspace(0.5, 1.5, 5)

In [None]:
%%time
# Ridge model trained on every row of X_train, which includes the rows that
# make up X_valid. This is the model used for the test-set predictions.
ridge = Ridge(alpha=1, random_state=17)
ridge.fit(X_train, y_train);

In [None]:
# `ridge` was fitted on all of X_train, which contains the X_valid rows, so
# scoring it on X_valid would report a leaked error and bias the blend weights
# chosen from it. Validate with a model that has only seen the training part.
ridge_valid = Ridge(alpha=1, random_state=17)
ridge_valid.fit(X_train_part, y_train_part)
ridge_pred = ridge_valid.predict(X_valid)

In [None]:
# MAE of the ridge predictions on the validation rows, plus expm1 of that value.
valid_mae = mean_absolute_error(y_valid, ridge_pred)
valid_mae, np.expm1(valid_mae)

In [None]:
# Display the shape of the validation prediction array.
ridge_pred.shape

In [None]:
# Displays ten evenly spaced values in [0.7, 0.8); the result is not assigned.
np.linspace(.7, .8, 10, endpoint=False)

In [None]:
# Blend the two validation predictions; `beta` is the weight given to ridge
# and (1 - beta) the weight given to LightGBM.
for beta in [.1, .2, .3, .4, .5, .6, .7, .8, .9]:
    overall_preds = beta * ridge_pred + (1 - beta) * lgb_pred
    valid_mae = mean_absolute_error(y_valid, overall_preds)
    # Print the weight next to its score so every output line is attributable.
    print(beta, valid_mae, np.expm1(valid_mae))

In [None]:
%%time
# Test-set predictions of the ridge model and of the LightGBM booster at its
# best iteration.
ridge_test_pred = ridge.predict(X_test)
lgb_test_pred = bst_lgb.predict(X_test.astype(np.float32), num_iteration=bst_lgb.best_iteration)

In [None]:
# Fixed-weight blend: 55% ridge, 45% LightGBM.
overall_pred = .55 * ridge_test_pred + .45 * lgb_test_pred

In [None]:
# Shift the blend by a constant so that its mean becomes 4.33328.
# NOTE(review): the origin of 4.33328 is not shown in this notebook — confirm.
# NOTE(review): no visible cell writes overall_pred or mean_add to a submission file.
mean_add = overall_pred + (4.33328 - overall_pred.mean())

In [None]:
def write_submission_file(prediction, filename,
    path_to_sample=os.path.join(PATH_TO_DATA, 'sample_submission.csv')):
    """Write ``prediction`` to ``filename`` in the sample-submission layout.

    The sample submission is read with ``id`` as the index, its
    ``log_recommends`` column is set to ``prediction``, and the result is
    saved as CSV.

    Args:
        prediction: Values for the ``log_recommends`` column, one per row of
            the sample submission.
        filename: Path of the CSV file to create.
        path_to_sample: Path of the sample submission; the default is built
            from ``PATH_TO_DATA`` once, when the function is defined.
    """
    sample = pd.read_csv(path_to_sample, index_col='id')
    sample.assign(log_recommends=prediction).to_csv(filename)

In [None]:
# One submission file per model: ridge alone and LightGBM alone.
write_submission_file(prediction=ridge_test_pred, 
                      filename='ridge.csv')
write_submission_file(prediction=lgb_test_pred, 
                      filename='lightgbm.csv')

In [None]:
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):  
    """Build an HTML link that downloads ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv()``, base64-encoded and embedded
    in the link as a ``data:text/csv`` URI.

    Args:
        df: Data frame to offer for download.
        title: Link text.
        filename: File name used for the ``download`` attribute.

    Returns:
        An ``IPython.display.HTML`` object wrapping the anchor tag.
    """
    encoded = base64.b64encode(df.to_csv().encode()).decode()
    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor_template.format(payload=encoded, title=title, filename=filename))

In [None]:
# Build the LightGBM submission frame in memory (same steps as
# write_submission_file) and offer it as a download link instead of a file.
path_to_sample=os.path.join(PATH_TO_DATA, 'sample_submission.csv')
submission = pd.read_csv(path_to_sample, index_col='id')
submission['log_recommends'] = lgb_test_pred
create_download_link(submission, filename='gradient_200_fixed_author.csv')